In [1]:
from __future__ import print_function, division
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import sys
import pickle as pkl

requests.__path__

['/Users/gpatient/anaconda/lib/python3.6/site-packages/requests']

In [2]:
url = 'http://www.imdb.com/robots.txt'

response = requests.get(url)

In [3]:
response.status_code

200

In [4]:
print(response.text)

# robots.txt for IMDb properties
#
#
# Limit ScoutJet's crawl rate
#
User-agent: ScoutJet
Crawl-delay: 3
#
#
# Yahoo!
User-agent: Slurp
Crawl-delay: .1
Disallow: /tvschedule
Disallow: /ActorSearch
Disallow: /ActressSearch
Disallow: /AddRecommendation
Disallow: /ads/
Disallow: /AlternateVersions
Disallow: /AName
Disallow: /Awards
Disallow: /BAgent
Disallow: /Ballot/
Disallow: /BornInYear
Disallow: /BornWhere
Disallow: /BPublicity
Disallow: /BQuotes
Disallow: /BTrivia
Disallow: /BusinessThisDay
Disallow: /BWorks
Disallow: /careers
Disallow: /help/show_leaf?careeratimdb
Disallow: /CommentsAuthor
Disallow: /CommentsEnter
Disallow: /CommentsIndex
Disallow: /Companies
Disallow: /CrazyCredits
Disallow: /Credits
Disallow: /DiedInYear
Disallow: /DiedWhere
Disallow: /DVD
Disallow: /ExciteTitle
Disallow: /Find
Disallow: /FName
Disallow: /GName
Disallow: /Guests
Disallow: /harvest_me
Disallow: /HelpPage
Disallow: /Icons/
Disallow: /JointVentures
Disallow: /Laserdisc
Disallow: /List
Disallow: /Lite

In [5]:
from urllib import robotparser
rp = robotparser.RobotFileParser()
rp.set_url('http://www.imdb.com/robots.txt')
rp.read()
#rp.can_fetch('*','http://www.imdb.com/')
rp.can_fetch('*','http://www.imdb.com/search')

True

In [6]:
def good_url(url):
    if not rp.can_fetch('*',url):
        print('robots not allowed here: {}\n'.format(url))
        return False
    return True

In [7]:
scrape_page=(
        'http://www.imdb.com/search/title?count=250&sort=num_votes,desc&title_type'
        '=feature&user_rating=1.1,2.0&view=simple'
    )
scrape_soup=BeautifulSoup(requests.get(scrape_page).text,'lxml')
#scrape_soup.prettify

In [8]:
#scrape list of urls

#this takes awhile, change pickled to False to run
url_list=list()
pickled=False
if pickled:
    with open('movie_urls.pkl', 'rb') as picklefile:
        url_list=pkl.load(picklefile)
assert(not pickled)

for i in range(1,10):
    #search by rating range, set to check just 1-2 until 
    low,high=i+0.1,i+1.0
    scrape_page=(
        'http://www.imdb.com/search/title?count=250&sort=num_votes,desc&title_type'
        '=feature&user_rating={},{}&view=simple'.format(low,high)
    )
    if not good_url(scrape_page): continue
    scrape_soup=BeautifulSoup(requests.get(scrape_page).text,'lxml')
    movie_list=scrape_soup.find_all(class_='lister-item-header')
    for element in movie_list:
        if element.find('a'):
            cur_url='http://www.imdb.com'+element.find('a')['href']
            if good_url(cur_url):
                url_list.append('http://www.imdb.com'+element.find('a')['href'])
        else:
            print('ERROR: URL MISSING')
    #print(scrape_page)
    #print(url_list)
    
with open('movie_urls.pkl', 'wb') as picklefile:
    pkl.dump(url_list, picklefile)
    

In [46]:
with open('movie_urls.pkl', 'wb') as picklefile:
    pkl.dump(url_list, picklefile)
    
! ls -l movie_urls.pkl

-rw-r--r--  1 gpatient  staff  136497 Jul 11 14:54 movie_urls.pkl


In [12]:
#parse for features
features=['Name','UserRating','Budget','Genres','ReleaseDate','Runtime','Directors','Writers',
          'Stars','ContentRating','ProductionCo']

def get_movie_features(url):
    movie_soup = BeautifulSoup(requests.get(url).text, 'lxml')
    re_features = [re.compile(x,re.IGNORECASE) for x in features]
    movie_series=pd.Series(index=features)
    details_soup=movie_soup.find(id='titleDetails')
    summary_soup=movie_soup.find(class_='plot_summary_wrapper')
    
    try:
        movie_series['Name']=movie_soup.find(itemprop=re_features[0]).text.split('(')[0].strip()
    #name has extra items
    
        movie_series['UserRating']=movie_soup.find(itemprop='ratingValue').text
        
        if not details_soup.find(text=re_features[2]):
            movie_series['Budget']=0
        else:
            movie_series['Budget']=details_soup.find(text=re_features[2]).find_parent().find_parent().text
            movie_series['Budget']=movie_series['Budget'].split(':')[1].strip().split(' ')[0].replace('$','').replace(',','').replace('€','')
           
        movie_series['Genres']=[]
        if movie_soup.find(text=re_features[3]):
            genre_soup=movie_soup.find(text=re_features[3]).find_parent().find_parent()
            movie_series['Genres']=[]
            for genre in genre_soup.find_all('a'):
                movie_series['Genres'].append(genre.text)
            
        if details_soup.find(text=re.compile('Release Date',re.IGNORECASE)):
            movie_series['ReleaseDate']=details_soup.find(text=re.compile('Release Date',re.IGNORECASE)).find_parent().find_parent().text
            movie_series['ReleaseDate']=movie_series['ReleaseDate'].split(':')[1].split('(')[0].strip()
        
        if details_soup.find(text=re_features[5]):
            movie_series['Runtime']=details_soup.find(text=re_features[5]).find_parent().findNextSibling().text.split(' ')[0]
        
        movie_series['Directors']=[]
        director_soup=summary_soup.find(itemprop='director').find_parent()
        for director in director_soup.find_all(itemprop='name'):
            movie_series['Directors'].append(director.text)
        
        movie_series['Writers']=[]
        if summary_soup.find(itemprop='creator'):
            writer_soup=summary_soup.find(itemprop='creator').find_parent()
            for writer in writer_soup.find_all(itemprop='name'):
                movie_series['Writers'].append(writer.text)
        
        movie_series['Stars']=[]
        if summary_soup.find(itemprop='actors'):
            star_soup=summary_soup.find(itemprop='actors').find_parent()
            for star in star_soup.find_all(itemprop='name'):
                movie_series['Stars'].append(star.text)
                
        if not movie_soup.find(class_='title_wrapper').find(itemprop='contentRating'):
            movie_series['ContentRating']='NOT RATED'
        else:
            movie_series['ContentRating']=movie_soup.find(class_='title_wrapper').find(itemprop='contentRating')['content']
        
        movie_series['ProductionCo']=[]
        if details_soup.find(text='Production Co:'):
            production_soup=details_soup.find(text='Production Co:').find_parent().find_parent()
            for co in production_soup.find_all(itemprop='name'):
                movie_series['ProductionCo'].append(co.text)
        
    except:
        print('generic problem with url: {}'.format(url))
        print(sys.exc_info())
    return(movie_series)

In [13]:
get_movie_features('http://www.imdb.com/title/tt3315342/')

Name                                                         Logan
UserRating                                                     8.2
Budget                                                    97000000
Genres                       [ Action,  Drama,  Sci-Fi,  Thriller]
ReleaseDate                                           3 March 2017
Runtime                                                        137
Directors                                          [James Mangold]
Writers                               [James Mangold, Scott Frank]
Stars                  [Hugh Jackman, Patrick Stewart, Dafne Keen]
ContentRating                                                    R
ProductionCo     [Donners' Company, Kinberg Genre, Marvel Enter...
dtype: object

In [14]:
len(url_list)

2250

In [15]:
#testing get_movie_features()
url = 'http://www.imdb.com/title/tt4458206/'
get_movie_features(url).loc['ProductionCo']

['Parantez Yapim']

In [17]:
#super slow crawl over everything
#needs optimization badly
check_in=250
scrape_max=len(url_list)+1
movies_df=pd.DataFrame(columns=features)
i = 0
for url in url_list:
    i +=1
    if i%check_in==0:
        print(i)
    if i > scrape_max:
        print('scraping too many qq')
        break
    try:
        cur_movie=get_movie_features(url)
        movies_df=movies_df.append(cur_movie,ignore_index=True)
    except:
        print('problem with: {}'.format(url))
movies_df.head()

generic problem with url: http://www.imdb.com/title/tt0806010/?ref_=adv_li_tt
(<class 'ValueError'>, ValueError('setting an array element with a sequence.',), <traceback object at 0x11027a108>)
250
500
750
1000
1250
1500
1750
2000
generic problem with url: http://www.imdb.com/title/tt6290024/?ref_=adv_li_tt
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'find_parent'",), <traceback object at 0x10ca108c8>)
generic problem with url: http://www.imdb.com/title/tt6659488/?ref_=adv_li_tt
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'find_parent'",), <traceback object at 0x10bf89e08>)
generic problem with url: http://www.imdb.com/title/tt6380508/?ref_=adv_li_tt
(<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute 'find_parent'",), <traceback object at 0x10f0fbd08>)
generic problem with url: http://www.imdb.com/title/tt5265738/?ref_=adv_li_tt
(<class 'AttributeError'>, AttributeError("'NoneType' object has

Unnamed: 0,Name,UserRating,Budget,Genres,ReleaseDate,Runtime,Directors,Writers,Stars,ContentRating,ProductionCo
0,Disaster Movie,1.9,20000000,[ Comedy],29 August 2008,87,"[Jason Friedberg, Aaron Seltzer]","[Jason Friedberg, Aaron Seltzer]","[Carmen Electra, Vanessa Lachey, Nicole Parker]",PG-13,"[Lionsgate, Grosvenor Park Media, 3 in the Box]"
1,Reis,1.9,8000000,"[ Biography, Drama]",3 March 2017,108,[Hüdaverdi Yavuz],[Murat Özdil],"[Orhan Aydin, Özlem Balci, Volkan Basaran]",NOT RATED,[Kafkasör Film Akademisi]
2,The Hottie & the Nottie,1.9,9000000,"[ Comedy, Romance]",21 February 2008,91,[Tom Putnam],[Heidi Ferrer],"[Paris Hilton, Joel David Moore, Christine Lakin]",PG-13,"[Purple Pictures, Adrenalina Films, Nevinny / ..."
3,House of the Dead,2.0,7000000,"[ Action, Adventure, Horror]",10 October 2003,90,[Uwe Boll],"[Mark A. Altman, Dan Bates]","[Jonathan Cherry, Tyron Leitso, Clint Howard]",R,"[Boll Kino Beteiligungs GmbH & Co. KG, Mindfir..."
4,Manos: The Hands of Fate,1.9,19000,[ Horror],15 November 1966,70,[Harold P. Warren],[Harold P. Warren],"[Tom Neyman, John Reynolds, Diane Adelson]",NOT RATED,"[Norm-Iris, Sun City Films]"


In [19]:
movies_df.to_pickle('movie_data.pkl')
    
! ls -l movie_data.pkl

-rw-r--r--  1 gpatient  staff  743861 Jul 10 17:17 movie_data.pkl


In [18]:
len(movies_df)

2250