## Import libraries and read in list of movie IDs

In [1]:
import imdb
import pandas
import time
import numpy as np

In [7]:
movies_and_ids = pandas.read_csv("CS109B_FinalProject-CurrentEDA/EDA_movies.csv", names=["movie", "id"])

In [8]:
movies_and_ids.head()

Unnamed: 0,movie,id
0,Tempus fugit,tt0390545
1,Sanam,tt0254749
2,Bike Week Exposed: Saints and Sinners,tt0367544
3,Dancer in the Dark,tt0168629
4,My Life Without Me,tt0314412


## Create IMDB object and test out connection

In [9]:
ia = imdb.IMDb()

In [10]:
s_result = ia.search_movie('The Princess Bride')

# Retrieves default information for the first result (a Movie object).
the_unt = s_result[0]
ia.update(the_unt)

# Print some information.
print the_unt['runtime']
print the_unt['rating']
director = the_unt['director'] # get a list of Person objects.


[u'98']
8.1


In [11]:
s2 = ia.get_movie(s_result[0].movieID)

In [12]:
movies_and_ids['id'][10]

'tt0040789'

## Function to extract data from IMDB

In [11]:
def get_data(offset, interval, q):
    all_results = np.array(['id','title', 'canonical title', 'kind', 'director', 'rating', 'year','votes', 
                         'mpaa','runtimes', 'color info', 'genres','Action', 
                         'Adventure', 'Adult', 'Animation', 'Comedy', 'Crime', 
                         'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
                         'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Thriller',
                         'War', 'Western'])
                        
    all_genres = ['Action', 'Adventure', 'Adult', 'Animation',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Thriller',
    'War', 'Western']

    for num,each in enumerate(movies_and_ids['id'][offset:offset+interval]):
        try:
            result = ia.get_movie(each[3:])
            if (num+1)%5==0: print num
            results = [each]
            for index, each in enumerate(['title', 'canonical title', 'kind', 
                                          'director', 'rating', 'year','votes', 
                                          'mpaa','runtimes', 'color info', 'genres']):
                try:
                    test = result[each]
                    if each in ['genres','director', 'runtimes']: results.append(str(test[0]))
                    else: results.append(str(test))
                except:
                    results.append("null")
            for index, each in enumerate(all_genres):
                try:
                    genre_result = result['genres']
                    if each in genre_result:
                        results.append(1)
                    else:
                        results.append(0)
                except:
                    results.append("null")
            all_results = np.vstack((all_results, results))
        except:
            continue
    features_df = pandas.DataFrame(data=all_results[1:,],  columns=all_results[0,])
    q.put(features_df)

## Use two processes to speed up the extraction

In [1]:
from multiprocessing import Process, Queue

if __name__ == '__main__':
    q = Queue()
    for offset in [0,1000]:
        interval = 1000
        Process(target=get_data, args=(offset,interval,q)).start()


## Collect dataframes from the q and combine

In [13]:
all_dfs = []
for _ in range(2):
    all_dfs.append(q.get())

In [15]:
output_result = all_dfs[0]
for each in all_dfs[1:]:
    output_result = output_result.append(each,ignore_index=True)

## Check data and export to csv

In [16]:
output_result.head()

Unnamed: 0,id,title,canonical title,kind,director,rating,year,votes,mpaa,runtimes,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Short,Thriller,War,Western
0,tt0179099,,,movie,Haroldo Marinho Barbosa,5.5,1986,43.0,,110,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0402362,Margam,Margam,movie,Rajiv Vijayaraghavan,8.4,2003,11.0,,India:108,...,,,,,,,,,,
2,tt0156075,The Shrunken City,"Shrunken City, The",video movie,Ted Nicolaou,4.5,1998,163.0,Rated PG for fantasy action violence,USA:90,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,tt4094088,Sweet as You Are,Sweet as You Are,episode,Angela Pope,7.2,1988,53.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0494546,The Ranch Girl,"Ranch Girl, The",movie,Allan Dwan,,1911,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [22]:
output_result.to_csv("imdb_features_sample_2.csv")

### possible keys we can get from IMDB
title; string; the "usual" title of the movie, like "The Untouchables".
long imdb title; string; "Uncommon Valor (1983/II) (TV)"
canonical title; string; the title in the canonical format,
                         like "Untouchables, The".
long imdb canonical title; string; "Patriot, The (2000)".
year; string; the year of release or '????' if unknown.
kind; string; one in ('movie', 'tv series', 'tv mini series', 'video game',
                      'video movie', 'tv movie', 'episode')
imdbIndex; string; the roman number for movies with the same title/year.
director; Person list; a list of director's name (e.g.: ['Brian De Palma'])
cast; Person list; list of actor/actress, with the currentRole instance
                   variable set to a Character object which describe his
                   role/duty.
cover url; string; the link to the image of the poster.
writer; Person list; list of writers ['Oscar Fraley (novel)']
plot; list; list of plots and authors of the plot.
rating; string; user rating on IMDb from 1 to 10 (e.g. '7.8')
votes; string; number of votes (e.g. '24,101')
runtimes; string list; in minutes ['119'] or something like ['USA:118',
          'UK:116']
number of episodes; int; number or episodes for a series.
color info; string list; ["Color (Technicolor)"]
countries; string list; production's country ['USA', 'Italy']
genres; string list; one or more in (Action, Adventure, Adult, Animation,
		Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir,
		Horror, Musical, Mystery, Romance, Sci-Fi, Short, Thriller,
		War, Western) and other genres defined by IMDb.
akas; string list; list of aka for this movie
languages; string list; list of languages
certificates; string list; ['UK:15', 'USA:R']
mpaa; string; the mpaa rating
episodes (series only); dictionary of dictionary; one key for every season,
                        one key for every episode in the season.
number of episodes (series only); int; total number of episodes.
number of seasons (series only); int; total number of seasons.
series years (series only); string; range of years when the series was produced.
episode of (episode only); Movie object; the parent series for an episode.
season (episode only); int; the season number.
episode (episode only); int; the number of the episode in the season.
long imdb episode title (episode only); string; episode and series title.
series title; string.
canonical series title; string.