In [1]:
from imdbpie import Imdb
imdb = Imdb()
imdb = Imdb(anonymize=True) # to proxy requests

# Creating an instance with caching enabled
# Note that the cached responses expire every 2 hours or so.
# The API response itself dictates the expiry time)
imdb = Imdb(cache=True)

from __future__ import division, print_function
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from IPython.core.display import HTML, Image

In [2]:
def get_soup_from_url(url):
    '''Returns Beautiful Soup for a given webpage'''
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,'lxml')
    return soup

def findTime(soup):
    a = soup.find(text=re.compile('Runtime:'))
    try:
        return str(a.findNext('time').contents[0])
    except:
        return np.nan

def findRating(soup):
    try:
        return str(soup.find('span', {'itemprop':'contentRating'}).text)
    except:
        return "Not Rated"
    
def findMeta(soup):
    a = soup.find('div',class_ = 'titleReviewBarItem')
    try:
        return int(a.findNext('span').contents[0])
    except:
        return np.nan
    
def findGross(soup):
    a = soup.findAll('div',{'id':'titleDetails'},{'class':'article'})
    for x in a:
        try:
            #print(x)
            for i in range(len(x)):
                #print(i)
                if str(x.findAll('div','txt-block')[i].getText().encode('ascii', 'ignore')\
                       .decode('ascii')).split()[0] == 'Gross:':
                    return(str(x.findAll('div','txt-block')[i].getText()).split()[1])
        except:
            return np.nan
            

def findBudget(soup):
    a = soup.findAll('div',{'id':'titleDetails'},{'class':'article'})
    for x in a:
        try:
            #print(x)
            for i in range(len(x)):
                #print(i)
                if str(x.findAll('div','txt-block')[i].getText().encode('ascii', 'ignore')\
                       .decode('ascii')).split()[0] == 'Budget:':
                    return(str(x.findAll('div','txt-block')[i].getText()).split()[1])
        except:
            return np.nan

In [3]:
links=[]

# data = imdb.top_250()
dfimdb = pd.DataFrame(imdb.top_250())
top_250_url = 'http://www.imdb.com/chart/top?ref_=nv_mv_250_6'
soup_250= get_soup_from_url(top_250_url)

for x in dfimdb['tconst']:
    url1 = 'http://www.imdb.com/title/'
    url2 = '/?ref_=nv_sr_1'
    link = url1 + x + url2
    links.append(str(link))

In [4]:
def get_movie_info(url):
    '''Takes in a movie's URL and scrapes that webpage using Beautiful Soup
    
    Returns a list of that movie's Title, Gross, Rating and Distributor
    
    
    '''
    soup = get_soup_from_url(url)
    
    results = []
    
    userRatings = str(soup.find('span',{'itemprop':'reviewCount'}).text.split()[0])
    criticRatings = int(soup.find('span',{'itemprop':'reviewCount'}).findNextSibling().text.split()[0])
    title_string = soup.find('h1').text
    title = str(title_string.split('(')[0].strip().encode('utf8'))
    year_string = str(title_string.split('(')[1].strip().encode('utf8'))
    year = int(year_string.replace(')',''))
    time = findTime(soup)
    MPAArating = findRating(soup)
    IMDBRating = float(soup.find('span',{'itemprop':'ratingValue'}).text)
    IMDBRatings = str(soup.find('span',{'itemprop':'ratingCount'}).text)
    genre = str(soup.find('span', {'itemprop':'genre'}).text)
    director = str(soup.find('span', {'itemprop':'name'}).text.encode('utf8'))
    writer = str(soup.find('span', {'itemprop':'creator'}).getText().encode('utf8')).strip().replace(',','')
    actor = str(soup.find('span', {'itemprop':'actors'}).text.encode('utf8')).strip().replace(',','')
    description = description = str(soup.find('div', {'itemprop':'description'}).getText().encode('utf8')).strip()
#     metascore = findMeta(soup)
#     gross = findGross(soup)
#     budget = findBudget(soup)
    
    results.append(title)
    results.append(year)
    results.append(time)
    results.append(genre)
    results.append(director)
    results.append(writer)
    results.append(actor)
    results.append(MPAArating)
    results.append(description)
    results.append(userRatings)
    results.append(criticRatings)
#     results.append(budget)
#     results.append(gross)
#     results.append(metascore)
    results.append(IMDBRatings)
    results.append(IMDBRating)


    return results  

In [5]:
table = []
for url in links:
#     print(url)
    info = get_movie_info(url)
    table.append(info)

In [6]:
df = pd.DataFrame(table, columns = ['Title','Year', 'Runtime','Genre','Director','Writer','Actor', \
                                    'MPAARating', 'Description', 'numUserRatings','numCriticRatings', \
                                    'numIMDBRatings','IMDBRating'])

In [7]:
df.to_csv('movie Info.csv')

# New approach - l'API de OMDB

In [2]:
def get_top_250():
    response = requests.get('http://www.imdb.com/chart/top')
    html = response.text
    entries = re.findall("<a href.*?/title/(.*?)/", html)
    return list(set(entries))

def get_entry(entry):
    res = requests.get('http://www.omdbapi.com/?i='+entry)
    if res.status_code != 200:
        print(entry, res.status_code)
    else:
        print('.', end=" " )
    try:
        j = json.loads(res.text)
    except ValueError:
        j = None
    return j

def get_gross(entry):
    response = requests.get('http://www.imdb.com/title/'+entry)
    html = response.text
    try:
        gross_list = re.findall("Gross:</h4>[ ]*\$([^ ]*)", html)
        gross = int(gross_list[0].replace(',', ''))
        print('.', end=" ")
        return gross
    except Exception as ex:
        print
        print(ex, entry, response.status_code)
        return None

def intminutes(x):
    y = x.replace('min', '').strip()
    return int(y)

def intvotes(x):
    y = x.replace(',', '').strip()
    return int(y)



In [3]:
entries = get_top_250()
len(entries)

250

In [4]:
entries_dict_list = [get_entry(e) for e in entries]

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [5]:
df = pd.DataFrame(entries_dict_list)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 20 columns):
Actors        250 non-null object
Awards        250 non-null object
Country       250 non-null object
Director      250 non-null object
Genre         250 non-null object
Language      250 non-null object
Metascore     250 non-null object
Plot          250 non-null object
Poster        250 non-null object
Rated         250 non-null object
Released      250 non-null object
Response      250 non-null object
Runtime       250 non-null object
Title         250 non-null object
Type          250 non-null object
Writer        250 non-null object
Year          250 non-null object
imdbID        250 non-null object
imdbRating    250 non-null object
imdbVotes     250 non-null object
dtypes: object(20)
memory usage: 39.1+ KB


In [7]:
grosses = [(e, get_gross(e)) for e in entries]

. . . . . . . list index out of range tt0046268 200
list index out of range tt0055630 200
. . . . . list index out of range tt0057115 200
. list index out of range tt0071315 200
. . . . . list index out of range tt0074896 200
. . list index out of range tt0021749 200
. . list index out of range tt0050613 200
list index out of range tt0053125 200
. . . list index out of range tt1220719 200
list index out of range tt0025316 200
. . list index out of range tt0083922 200
. . . list index out of range tt0072684 200
. list index out of range tt0074958 200
. . . . list index out of range tt0036775 200
. . . . . . list index out of range tt0978762 200
. . list index out of range tt0080678 200
. . list index out of range tt0056592 200
. list index out of range tt0095327 200
. . . . . . . . . list index out of range tt0046438 200
. list index out of range tt0015864 200
. list index out of range tt0045152 200
. . . . list index out of range tt0046911 200
. list index out of range tt0050986 200
. 

In [8]:
df1 = pd.DataFrame(grosses, columns=['imdbID', 'Gross'])
df1.head()

Unnamed: 0,imdbID,Gross
0,tt2582802,13092000.0
1,tt0047478,269061.0
2,tt0082971,242374454.0
3,tt0050212,27200000.0
4,tt0848228,623279547.0


In [9]:
df1['Gross'].isnull().sum()

65

In [10]:
df = df.replace('N/A', np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 20 columns):
Actors        250 non-null object
Awards        245 non-null object
Country       250 non-null object
Director      250 non-null object
Genre         250 non-null object
Language      249 non-null object
Metascore     168 non-null object
Plot          250 non-null object
Poster        250 non-null object
Rated         249 non-null object
Released      249 non-null object
Response      250 non-null object
Runtime       250 non-null object
Title         250 non-null object
Type          250 non-null object
Writer        250 non-null object
Year          250 non-null object
imdbID        250 non-null object
imdbRating    250 non-null object
imdbVotes     250 non-null object
dtypes: object(20)
memory usage: 39.1+ KB


In [12]:
df.Released = pd.to_datetime(df.Released)
df.Runtime = df.Runtime.apply(intminutes)
df.Year = df.Year.astype(int)
df.imdbRating = df.imdbRating.astype(float)
df.imdbVotes = df.imdbVotes.apply(intvotes)

In [13]:
df = pd.merge(df, df1)

In [14]:
cv = CountVectorizer()
data = cv.fit_transform(df.Genre).todense()
columns = ['genre_'+c for c in cv.get_feature_names()]
genredf = pd.DataFrame(data, columns=columns)
genredf.head()

Unnamed: 0,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_drama,genre_family,genre_fantasy,genre_fi,...,genre_music,genre_musical,genre_mystery,genre_noir,genre_romance,genre_sci,genre_sport,genre_thriller,genre_war,genre_western
0,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [15]:
df = pd.concat([df, genredf], axis = 1)

In [16]:
cv = CountVectorizer(token_pattern=u'(?u)\\w+\.?\\w?\.? \\w+')
data = cv.fit_transform(df.Actors).todense()
columns = ['actor: '+c for c in cv.get_feature_names()]
actorsdf = pd.DataFrame(data, columns=columns)
actorsdf.head()

Unnamed: 0,actor: aamir khan,actor: aaron eckhart,actor: abdel ahmed,actor: adam baldwin,actor: adam driver,actor: adolphe menjou,actor: adrien brody,actor: agnes moorehead,actor: ahney her,actor: akemi yamaguchi,...,actor: woody harrelson,actor: xolani mali,actor: yacef saadi,actor: yoshiko shinohara,actor: yukiko shimazaki,actor: yves montand,actor: yôko tsukasa,actor: zach grenier,actor: zoe saldana,actor: álvaro guerrero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df = pd.concat([df, actorsdf], axis = 1)

In [19]:
df.columns

Index([                  u'Actors',                   u'Awards',
                        u'Country',                 u'Director',
                          u'Genre',                 u'Language',
                      u'Metascore',                     u'Plot',
                         u'Poster',                    u'Rated',
       ...
         u'actor: woody harrelson',       u'actor: xolani mali',
             u'actor: yacef saadi', u'actor: yoshiko shinohara',
        u'actor: yukiko shimazaki',      u'actor: yves montand',
            u'actor: yôko tsukasa',      u'actor: zach grenier',
             u'actor: zoe saldana',   u'actor: álvaro guerrero'],
      dtype='object', length=856)

In [5]:
df = pd.read_csv('movieInfo.csv')

In [6]:
null_data = df[df.isnull().any(axis=1)]
len(null_data)

1

In [98]:
df.drop('Metascore',1, inplace = True)
null_data = df[df.isnull().any(axis=1)]
len(null_data)

84

In [99]:
df.drop('Gross', 1, inplace=True)
null_data = df[df.isnull().any(axis=1)]
len(null_data)

34

In [100]:
df.drop('Budget',1,inplace=True)
null_data = df[df.isnull().any(axis=1)]
len(null_data)

1

In [7]:
def ratingCleaner(x):
    y = str(x).split()[0]
    if y == 'Rated':
        return str(x).split()[1]
    elif y == 'Not':
        return x
    else:
        return y

def ratingCombiner(x):
    if x == 'M':
        return 'X'
    elif x == 'Unrated':
        return 'Not Rated'
    elif x == 'Passed':
        return 'Not Rated'
    elif x == 'Approved':
        return 'G'
    else:
        return x

In [8]:
df['numIMDBRatings'] = df['numIMDBRatings'].apply(lambda x: x.replace(',',''))
df['numUserRatings'] = df['numUserRatings'].apply(lambda x: x.replace(',',''))
df['numUserRatings'] = df['numUserRatings'].apply(lambda x: int(x))
df['numCriticRatings'] = df['numCriticRatings'].apply(lambda x: int(x))
df['numIMDBRatings'] = df['numIMDBRatings'].apply(lambda x: int(x))
df['Runtime'].fillna('146 min',inplace = True)
df['Runtime'] = df['Runtime'].apply(lambda x: x.split()[0])
df['Runtime'] = df['Runtime'].apply(lambda x: int(x))
df['MPAARating'] = df['MPAARating'].apply(lambda x: ratingCleaner(x))
df['MPAARating'] = df['MPAARating'].apply(lambda x: ratingCombiner(x))
df['Writer'] = df['Writer'].apply(lambda x: x.split('(')[0])
df.drop('Unnamed: 0',1,inplace=True)
df.head()

Unnamed: 0,Title,Year,Runtime,Genre,Director,Writer,Actor,MPAARating,Description,numUserRatings,numCriticRatings,numIMDBRatings,IMDBRating
0,The Shawshank Redemption,1994,142,Crime,Frank Darabont,Stephen King,Tim Robbins,R,Two imprisoned men bond over a number of years...,4120,198,1679862,9.3
1,The Godfather,1972,175,Crime,Francis Ford Coppola,Mario Puzo,Marlon Brando,R,The aging patriarch of an organized crime dyna...,2224,207,1149324,9.2
2,The Godfather: Part II,1974,202,Crime,Francis Ford Coppola,Francis Ford Coppola,Al Pacino,R,The early life and career of Vito Corleone in ...,652,149,786009,9.0
3,The Dark Knight,2008,152,Action,Christopher Nolan,Jonathan Nolan,Christian Bale,PG-13,When the menace known as the Joker wreaks havo...,4646,644,1664897,9.0
4,Schindler's List,1993,195,Biography,Steven Spielberg,Thomas Keneally,Liam Neeson,R,"In Poland during World War II, Oskar Schindler...",1266,174,859751,8.9


In [9]:
df['MPAARating'].value_counts()

R            104
Not Rated     49
PG            37
PG-13         34
G             24
X              2
Name: MPAARating, dtype: int64

# Modeling!

In [10]:
df.head()

Unnamed: 0,Title,Year,Runtime,Genre,Director,Writer,Actor,MPAARating,Description,numUserRatings,numCriticRatings,numIMDBRatings,IMDBRating
0,The Shawshank Redemption,1994,142,Crime,Frank Darabont,Stephen King,Tim Robbins,R,Two imprisoned men bond over a number of years...,4120,198,1679862,9.3
1,The Godfather,1972,175,Crime,Francis Ford Coppola,Mario Puzo,Marlon Brando,R,The aging patriarch of an organized crime dyna...,2224,207,1149324,9.2
2,The Godfather: Part II,1974,202,Crime,Francis Ford Coppola,Francis Ford Coppola,Al Pacino,R,The early life and career of Vito Corleone in ...,652,149,786009,9.0
3,The Dark Knight,2008,152,Action,Christopher Nolan,Jonathan Nolan,Christian Bale,PG-13,When the menace known as the Joker wreaks havo...,4646,644,1664897,9.0
4,Schindler's List,1993,195,Biography,Steven Spielberg,Thomas Keneally,Liam Neeson,R,"In Poland during World War II, Oskar Schindler...",1266,174,859751,8.9


In [11]:
df.columns

Index([u'Title', u'Year', u'Runtime', u'Genre', u'Director', u'Writer',
       u'Actor', u'MPAARating', u'Description', u'numUserRatings',
       u'numCriticRatings', u'numIMDBRatings', u'IMDBRating'],
      dtype='object')

In [12]:
dummydf = df[['Genre','Director','Writer', 'Actor','MPAARating']]
dummydf = pd.get_dummies(dummydf)

In [13]:
scalingdf = df[['Year','Runtime','numUserRatings','numCriticRatings','numIMDBRatings']]
scalingdf = MinMaxScaler().fit_transform(scalingdf.astype(float))
scalingdf = pd.DataFrame(scalingdf, columns = ['Year','Runtime','numUserRatings','numCriticRatings','numIMDBRatings'])

In [14]:
y = df['IMDBRating']

In [15]:
tfidfvect = TfidfVectorizer(stop_words = 'english')
tfidfvect.fit(df['Description'])

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
nlpdf  = pd.DataFrame(tfidfvect.transform(df['Description']).todense(),columns=tfidfvect.get_feature_names(),index=df['Description'])
nlpdf.transpose().sort_values(df['Description'][0], ascending=False).head(10).transpose()
nlpdf.reset_index(inplace=True)
nlpdf.drop('Description',1,inplace = True)
nlpdf.head()

Unnamed: 0,000,10,12,15,16th,1820s,18th,1920s,1936,1937,...,year,years,yoda,york,young,younger,youngest,zero,zone,zubrowka
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.252166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217608,0.0,0.0,...,0.0,0.0,0.0,0.170867,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
X = pd.concat([dummydf,scalingdf,nlpdf], axis =1)

In [18]:
def bucketingY(x):
    if x >= 9.0:
        return 1
    elif x >= 8.5:
        return 2
    else:
        return 3

In [19]:
y = y.apply(lambda x: bucketingY(x))

In [20]:
X.head()

  return key in self._engine


Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Film-Noir,Genre_Horror,Genre_Mystery,...,year,years,yoda,york,young,younger,youngest,zero,zone,zubrowka
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.252166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.170867,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
