## Import libraries and read in list of movie IDs

In [1]:
import imdb
import pandas
import time
import numpy as np

In [2]:
tmdb_data = pandas.read_csv("Data_with_pixels.csv")

In [3]:
movies_and_ids = tmdb_data[["imdb_id", "title"]]
movies_and_ids.columns = ['id','movie']

In [4]:
movies_and_ids['id'][27000:27010]

27000           NaN
27001           NaN
27002           NaN
27003    tt5478632/
27004           NaN
27005           NaN
27006           NaN
27007     tt4489128
27008           NaN
27009           NaN
Name: id, dtype: object

In [5]:
print(movies_and_ids.shape)
print(tmdb_data.shape)
movies_and_ids.head()

(36599, 2)
(36599, 16)


Unnamed: 0,id,movie
0,tt0168629,Dancer in the Dark
1,tt0314412,My Life Without Me
2,tt0120586,American History X
3,tt0315543,Open Hearts
4,tt0416320,Match Point


## Create IMDB object and test out connection

In [6]:
from imdb import IMDb

In [7]:
#ia = IMDb('http')
#print ia.get_keyword(u'story')

In [8]:
ia = imdb.IMDb()

In [9]:
s_result = ia.search_movie('The Princess Bride')

# Retrieves default information for the first result (a Movie object).
the_unt = s_result[0]
ia.update(the_unt)

# Print some information.
#print the_unt['runtimes']
print the_unt['rating']
director = the_unt['director'] # get a list of Person objects.


8.1


In [9]:
s2 = ia.get_movie(s_result[0].movieID)

In [11]:
movies_and_ids[movies_and_ids['id']=='tt5112996']

Unnamed: 0,id,movie
24003,tt5112996,Hip Hip Hooray


In [14]:
movie = ia.search_movie('Hip Hip Hooray')[0] # a Movie instance.
print 'The movieID for The Untouchables:', movie.movieID
print 'The imdbID used by the site:', ia.get_imdbMovieID(movie.movieID)
print 'Same ID, smarter function:', ia.get_imdbID(movie)

The movieID for The Untouchables: 5112996
The imdbID used by the site: 5112996
Same ID, smarter function: 5112996


In [21]:
sample_id = 'tt5112996'
sample_id = sample_id[2:]
sample_movie = ia.get_movie(sample_id)

In [22]:
sample_movie.summary()

u'Movie\n=====\nTitle: Hip Hip Hooray (2016)\nGenres: Short, Comedy, Drama.\nDirector: Lizzy Sanford.\nWriter: Anna Cordell, Lizzy Sanford.\nCast: Anna Cordell (Anna), Josh Fadem (Josh), Kirby Howell-Baptiste (Kirby), Matt Ingebretson (Harry), Hayley Magnus (Hayley).\nRuntime: 12.\nCountry: USA.\nRating: 7.5 (10 votes).\n'

In [13]:
#ia.update(sample_movie)
sample_movie.summary()

u'Movie\n=====\nTitle: Existenz unter dem Minimum (1995)\nGenres: Short.\nDirector: Christine Pramhas.\nRuntime: 25.\nCountry: Austria.\nLanguage: German.\n'

## Function to extract data from IMDB

In [24]:
results

[]

In [11]:
import time
class Timer(object):
    def __init__(self, verbose=True):
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs
        if self.verbose:
            print('elapsed time: %f ms %f projected finish' % (self.msecs, self.msecs*50000/(4*1000*3600)))

In [12]:
with Timer() as t:
    for i in range(1000):
        a = i*34

elapsed time: 0.694990 ms 0.002413 projected finish


In [69]:
movies_and_ids['id'][27003]

'tt5478632/'

In [72]:
def get_data(offset, interval, q, start):

    non_genre_string = np.array(['id','title', 'canonical title', 'imdbIndex','kind', 
                                 'year', 'rating','mpaa','votes','runtimes',
                                 'color info', 'genres','languages','plot','countries']) # 15 non-genre strings
    non_genre_person_list = np.array(['director', 'writer', 'cast','certificates'])  # 4 person lists 

    all_genres = np.array(['Action', 'Adventure', 'Adult', 'Animation',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Short', 'Thriller',
    'War', 'Western','Talk-Show', 'News', 'Game-Show', 'Reality-TV', 'History', 'Sport']) # 26 genres
    all_results = np.concatenate((non_genre_string, non_genre_person_list, all_genres))
    movie_ids = np.unique(movies_and_ids['id'][offset:offset+interval])
    for num,each in enumerate(movie_ids):
        #with Timer() as t:
                try:
                    each=each.replace("/","")
                    result = ia.get_movie(each[2:])
                    if num%10==0: 
                        print(num,movie_ids.shape[0]-num, (time.time()-start)/60.0)
                    results = [each]
                    for index, each in enumerate(non_genre_string[1:]):
                        try:
                            test = result[each]
                            results.append(str(test))                
                        except:
                            results.append("null")

                    for index, each in enumerate(non_genre_person_list):
                        try:
                            test = result[each]
                            person_list = []
                            for index,person in enumerate(test):
                                person_list.append(str(person['name']))
                            results.append(str(person_list))
                        except:
                            results.append("null")

                    for index, each in enumerate(all_genres):
                        try:
                            genre_result = result['genres']
                            if each in genre_result:
                                results.append(1)

                            else:
                                results.append(0)
                        except:
                            results.append("null")

                except:
                    print("something didn't work for id", each, num)
                    time.sleep(7)
                    continue
                all_results = np.vstack((all_results, np.array(results)))
                time.sleep(0.1)

    features_df = pandas.DataFrame(data=all_results[1:,],  columns=all_results[0,])
    print("all set")
    q.put(features_df)

## Use multiple processes to speed up the extraction

In [74]:
from multiprocessing import Process, Queue

elapsed_time = time.time()

if __name__ == '__main__':
    q = Queue()
    for offset in [27000,28000,29000,30000]:
        interval = 1000
        Process(target=get_data, args=(offset,interval,q, time.time())).start()

("something didn't work for id", nan, 0)
("something didn't work for id", nan, 0)
("something didn't work for id", nan, 0)
("something didn't work for id", nan, 0)
(10, 609, 0.348053514957428)
(10, 621, 0.35834149916966757)
(10, 584, 0.3616960843404134)
(10, 573, 0.36872411568959557)
(20, 599, 0.5864330331484476)
(20, 574, 0.5995242516199748)
(20, 611, 0.6018765171368917)
(20, 563, 0.6085160175959269)
(30, 589, 0.8286088307698568)
(30, 601, 0.8452760179837545)
(30, 553, 0.8626572330792744)
(30, 564, 0.8768999338150024)
(40, 591, 1.0645524342854817)
(40, 579, 1.0653608163197836)
(40, 554, 1.1208269357681275)
(40, 543, 1.1246557513872781)
(50, 581, 1.2775986830393473)
(50, 569, 1.2885062972704568)
(50, 544, 1.3802138686180114)
(50, 533, 1.407025118668874)
(60, 559, 1.5173701643943787)
(60, 571, 1.5282713174819946)
(60, 534, 1.6081041852633158)
(60, 523, 1.6523966352144877)
(70, 561, 1.7663340330123902)
(70, 549, 1.7732317487398783)
(70, 524, 1.8577103694279988)
(70, 513, 1.86276659965515

## Collect dataframes from the q and combine

In [13]:
all_dfs = []
for _ in range(4):
    all_dfs.append(q.get())

In [14]:
output_result = all_dfs[0]
for each in all_dfs[1:]:
    output_result = output_result.append(each,ignore_index=True)

## Check data and export to csv

In [15]:
output_result.shape

(2526, 45)

In [17]:
output_result.head()

Unnamed: 0,id,title,canonical title,imdbIndex,kind,year,rating,mpaa,votes,runtimes,...,Short,Thriller,War,Western,Talk-Show,News,Game-Show,Reality-TV,History,Sport
0,tt3569732,Episode dated 10 June 1983,Episode dated 10 June 1983,,episode,1983,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0751118,Existence,Existence,,episode,2001,9.0,,1634.0,"[u'45', u'Argentina:60']",...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt5112996,Existenz unter dem Minimum,Existenz unter dem Minimum,,movie,1995,,,,[u'25'],...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0100548,Sapore di donna,Sapore di donna,,movie,1990,5.2,,32.0,[u'89'],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt3335458,,,,movie,1992,,,,[u'80'],...,,,,,,,,,,


### Rename file to reflect range of ids

In [16]:
output_result.to_csv("imdb_full_dataset_23000_26999.csv")

### possible keys we can get from IMDB
title; string; the "usual" title of the movie, like "The Untouchables".
long imdb title; string; "Uncommon Valor (1983/II) (TV)"
canonical title; string; the title in the canonical format,
                         like "Untouchables, The".
long imdb canonical title; string; "Patriot, The (2000)".
year; string; the year of release or '????' if unknown.
kind; string; one in ('movie', 'tv series', 'tv mini series', 'video game',
                      'video movie', 'tv movie', 'episode')
imdbIndex; string; the roman number for movies with the same title/year.
director; Person list; a list of director's name (e.g.: ['Brian De Palma'])
cast; Person list; list of actor/actress, with the currentRole instance
                   variable set to a Character object which describe his
                   role/duty.
cover url; string; the link to the image of the poster.
writer; Person list; list of writers ['Oscar Fraley (novel)']
plot; list; list of plots and authors of the plot.
rating; string; user rating on IMDb from 1 to 10 (e.g. '7.8')
votes; string; number of votes (e.g. '24,101')
runtimes; string list; in minutes ['119'] or something like ['USA:118',
          'UK:116']
number of episodes; int; number or episodes for a series.
color info; string list; ["Color (Technicolor)"]
countries; string list; production's country ['USA', 'Italy']
genres; string list; one or more in (Action, Adventure, Adult, Animation,
		Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir,
		Horror, Musical, Mystery, Romance, Sci-Fi, Short, Thriller,
		War, Western) and other genres defined by IMDb.
akas; string list; list of aka for this movie
languages; string list; list of languages
certificates; string list; ['UK:15', 'USA:R']
mpaa; string; the mpaa rating
episodes (series only); dictionary of dictionary; one key for every season,
                        one key for every episode in the season.
number of episodes (series only); int; total number of episodes.
number of seasons (series only); int; total number of seasons.
series years (series only); string; range of years when the series was produced.
episode of (episode only); Movie object; the parent series for an episode.
season (episode only); int; the season number.
episode (episode only); int; the number of the episode in the season.
long imdb episode title (episode only); string; episode and series title.
series title; string.
canonical series title; string.