## Data Extraction and Manipulation from IMDB text files

Links

<ul>
  <li>
  <a href="http://imdbpy.sourceforge.net">http://imdbpy.sourceforge.net </a>
  </li>
</ul>

In [2]:
import pandas as pd
from imdb import IMDb
import datetime 
import time
import re
import numpy as np

Read in the movies (IMDb ids) to be downloaded.  
The list of ids comes from the MovieLens data set.

In [None]:
# Load the ids of the movies to download; only the "imdbId" column is read.
movie_list = pd.read_csv('./datasets/ml_data.csv', usecols=["imdbId"])

# For a re-run on the failures from the first pass, load the failure list instead:
#movie_list = pd.read_csv('./datasets/pass2/imdb_fail.csv', usecols=["imbd_id"])

# Size of the data frame. A single pre-formatted string prints the same text
# whether `print` is a statement (Python 2) or a function (Python 3).
print('number of movies: %s' % movie_list.shape[0])

movie_list.head(10)

Role master table (people labels in imdb XML)

In [21]:
# Role labels collected for every movie. Ids are assigned by position,
# starting at 1, so the order of this list defines the role_id of each label.
# NOTE(review): "production-manager" is hyphenated while the other multi-word
# labels use spaces -- confirm it matches the key the movie object exposes.
roles = [
    "director",
    "writer",
    "cast",
    "production-manager",
    "original music",
    "casting director",
    "visual effects",
]
role_ids = list(range(1, len(roles) + 1))
df_roles = pd.DataFrame({'role_id': role_ids, 'role': roles})

#Roles available but excluded
#-music-department
#-art direction
#-sound-crew
#-art-department

<h2>imdb helper functions</h2>  
  
gets all people for a given role & movie

In [22]:
#returns a dataframe (imbd_id,role_id,person_id,name)
def people_in_role(movie, imdb_id, role = "cast", role_id = 1):
    """Collect everyone credited under one role of a movie.

    Parameters
    ----------
    movie : mapping-like
        Object indexed by role label (``movie[role]``) that yields the people
        credited in that role. Each person must support ``str()`` (used as the
        name) and ``asXML()`` (used to recover the person id).
    imdb_id :
        Id of the movie; copied unchanged onto every output row.
    role : str
        Role label to look up in ``movie``.
    role_id :
        Identifier of ``role``; copied unchanged onto every output row.

    Returns
    -------
    pandas.DataFrame
        One row per person with the columns ``imbd_id``, ``role_id``,
        ``person_id`` and ``name``. The frame is empty when the role cannot
        be looked up on the movie. ``person_id`` is 0 for a person whose XML
        carries no numeric ``<person id="...">`` attribute.
    """
    try:
        people = list(movie[role])
    except Exception:
        #no people in this movie have this role
        people = []

    names = []
    person_ids = []
    for person in people:
        # The id is only available through the XML form: <person id="#" ...>
        match = re.search(r'<person id="(\d+)"', person.asXML())

        # Fall back to 0 instead of silently reusing the previous person's id.
        person_ids.append(int(match.group(1)) if match else 0)
        names.append(str(person))

    # Derive the row count from the collected data so that all four columns
    # are guaranteed to have the same length.
    n = len(names)

    return pd.DataFrame({   'imbd_id'   : [imdb_id] * n,
                             'role_id'   : [role_id] * n,
                             'person_id' : person_ids,
                             'name'      : names
                           })

gets all people in all roles for a given movie

In [23]:
#returns a dataframe (imbd_id,role_id,person_id,name)
def people_in_movie(movie, imdb_id):
    """Collect the people of a movie for every role listed in ``df_roles``.

    Parameters
    ----------
    movie :
        Movie object passed through to ``people_in_role``.
    imdb_id :
        Id of the movie; passed through to ``people_in_role``.

    Returns
    -------
    pandas.DataFrame
        The per-role frames returned by ``people_in_role`` concatenated in
        ``df_roles`` order (columns imbd_id, role_id, person_id, name).
    """
    # Pair ids and labels by position, so the result does not depend on how
    # df_roles happens to be indexed.
    ls_movie_people = [
        people_in_role(movie = movie, imdb_id = imdb_id, role_id = role_id, role = role)
        for role_id, role in zip(df_roles.role_id, df_roles.role)
    ]

    #gather roles into single dataframe
    return pd.concat(ls_movie_people)

Gets an XML tag for a movie with a default value if the tag is missing

In [24]:
#wrapper around the imdbpy movie tag lookup (movie[tag_name])
#adds error handling in case the xml tag is missing
def movie_isnull(movie, tag_name, default_value=""):
    """Return ``movie[tag_name]``, or ``default_value`` if the lookup fails.

    Parameters
    ----------
    movie :
        Any object supporting ``movie[tag_name]``.
    tag_name :
        Key to look up.
    default_value :
        Value returned when the lookup raises an ordinary exception
        (for example a missing key).

    Only ``Exception`` subclasses are treated as "tag missing";
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so that a long-running
    download can still be interrupted.
    """
    try:
        return movie[tag_name]
    except Exception:
        return default_value

<h2>IMDb extract main funciton</h2>  
Extract the IMDb movies that are in the MovieLens list

In [None]:
#connect to imdb web service
ia = IMDb()

ls_movie = []    # one single-row DataFrame per successfully processed movie
ls_people = []   # one people DataFrame per successfully processed movie
ls_fail = []     # ids of movies whose download or parsing raised

# range() instead of xrange(): valid on both Python 2 and Python 3
for i in range(len(movie_list.index)):

    #Slow the script to avoid overloading the imdb server
    time.sleep(.2)

    #id of the movie to retrieve from imdb
    imdb_id = movie_list.iloc[i, 0]

    try:
        #get movie object (contains movie XML data and helper functions);
        #the id is left-padded with zeros to 7 characters
        movie = ia.get_movie(str(imdb_id).zfill(7))

        #"runtimes" is a list: guard against it being present but empty
        runtimes = movie_isnull(movie, "runtimes", [np.nan]) or [np.nan]

        #get 1:1 flat file fields and simple 1:[1D array] fields for movie
        #NOTE: defaults of joined fields must be lists -- joining the bare
        #string "EN" would produce "E|N"
        #NOTE(review): "cover_url_full" is the only key written with
        #underscores -- confirm it is a key the movie object actually exposes
        d_movie = { 'imdb_id'        : imdb_id,
                    'title'          : movie_isnull(movie, "title", ""),
                    'rating'         : movie_isnull(movie, "rating", np.nan),
                    'votes'          : movie_isnull(movie, "votes", np.nan),
                    'runtime'        : runtimes[0],
                    'year'           : movie_isnull(movie, "year", np.nan),
                    'cover_url'      : movie_isnull(movie, "cover url", ""),
                    'cover_url_full' : movie_isnull(movie, "cover_url_full", ""),
                    'plot_outline'   : movie_isnull(movie, "plot outline", ""),
                    'kind'           : movie_isnull(movie, "kind", "None"),
                    'genres'         : '|'.join(movie_isnull(movie, "genres", [])),
                    'language_codes' : '|'.join(movie_isnull(movie, "language codes", ["EN"])),
                    'country_codes'  : '|'.join(movie_isnull(movie, "country codes", [])),
                    'plot'           : '\n'.join(movie_isnull(movie, "plot", []))
                  }

        #get people (actors, directors, writers etc.) before recording the
        #movie, so a movie that ends up in ls_fail never leaves a row behind
        df_movie_people = people_in_movie(movie, imdb_id=imdb_id)

        #add movie record and its people to the lists of successful downloads
        ls_movie.append(pd.DataFrame.from_records([d_movie]))
        ls_people.append(df_movie_people)

    except Exception:
        #record failures for diagnostics and re-run
        #(KeyboardInterrupt is deliberately not caught, so the loop can be stopped)
        ls_fail.append(imdb_id)

    #every 100 records: print status update (row number, failures so far, time)
    if i % 100 == 0:
        print('%s %s %s' % (i, len(ls_fail), datetime.datetime.now().time()))

#pd.concat raises on an empty list; fall back to empty frames when nothing
#was downloaded so the failure list can still be written afterwards
df_movies = pd.concat(ls_movie) if ls_movie else pd.DataFrame()
df_people = pd.concat(ls_people) if ls_people else pd.DataFrame()

In [30]:
# Persist the three result tables as UTF-8 encoded CSV files
# in the current working directory.
for _frame, _path in ((df_movies, 'imdb_movies.csv'),
                      (df_people, 'imdb_people.csv'),
                      (df_roles, 'imdb_roles.csv')):
    _frame.to_csv(_path, encoding='utf-8')

In [31]:
#write failures to csv (one row per id collected in ls_fail)
df_fail = pd.DataFrame({'imbd_id'  : ls_fail})
df_fail.to_csv('imdb_fail.csv', encoding='utf-8')

# A single pre-formatted string prints the same text whether `print` is a
# statement (Python 2) or a function (Python 3).
print('number of failures: %s' % df_fail.shape[0])

number of failures: 0


In [None]:
#Combine Pass 1 and pass 2
# NOTE(review): assumes the per-pass files were written by DataFrame.to_csv
# with its default index column (as the "write results to file" cell does).
# index_col=0 reads that column back as the index instead of as an extra
# "Unnamed: 0" data column, and ignore_index=True gives the combined table
# one fresh, non-repeating index.
df_movies1 = pd.read_csv('./datasets/pass1/imdb_movies.csv', index_col=0, encoding='utf-8')
df_movies2 = pd.read_csv('./datasets/pass2/imdb_movies.csv', index_col=0, encoding='utf-8')
df_movies = pd.concat([df_movies1, df_movies2], ignore_index=True)
df_movies.to_csv('imdb_movies.csv', encoding='utf-8')
print(len(df_movies.index))

df_people1 = pd.read_csv('./datasets/pass1/imdb_people.csv', index_col=0, encoding='utf-8')
df_people2 = pd.read_csv('./datasets/pass2/imdb_people.csv', index_col=0, encoding='utf-8')
df_people = pd.concat([df_people1, df_people2], ignore_index=True)
df_people.to_csv('imdb_people.csv', encoding='utf-8')