In [1]:
import pandas as pd
import re
import numpy as np
import json
import datetime

In [2]:
df_movies = pd.read_csv("../data/movies.csv")
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [3]:
def extract_movie_title(x):
    split_list = re.split('[()]', x)
    movie_title = split_list[0]
    movie_year = split_list[len(split_list)-2]
    return movie_title

def extract_movie_year(x):
    split_list = re.split('[()]', x)
    movie_title = split_list[0]
    movie_year = split_list[len(split_list)-2]
    return movie_year

def extract_movie_genres(x):
    genre_list = re.split('[|]', x)
    return genre_list

In [4]:
df_movies['movie_title']=df_movies['title'].apply(lambda x: extract_movie_title(x))
df_movies['movie_year']=df_movies['title'].apply(lambda x: extract_movie_year(x))
df_movies['movie_genre']=df_movies['genres'].apply(lambda x: extract_movie_genres(x))
df_movies

Unnamed: 0,movieId,title,genres,movie_title,movie_year,movie_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,[Comedy]
...,...,...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,Kein Bund für's Leben,2007,[Comedy]
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,"Feuer, Eis & Dosenbier",2002,[Comedy]
27275,131258,The Pirates (2014),Adventure,The Pirates,2014,[Adventure]
27276,131260,Rentun Ruusu (2001),(no genres listed),Rentun Ruusu,2001,[(no genres listed)]


In [5]:
genre_1 = "Drama"
df_movie_in_genre = df_movies[df_movies['movie_genre'].apply(lambda x: genre_1 in x)]
df_movie_in_genre.shape[0]

13344

In [6]:
df_movie_in_genre

Unnamed: 0,movieId,title,genres,movie_title,movie_year,movie_genre
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,"[Comedy, Drama, Romance]"
10,11,"American President, The (1995)",Comedy|Drama|Romance,"American President, The",1995,"[Comedy, Drama, Romance]"
13,14,Nixon (1995),Drama,Nixon,1995,[Drama]
15,16,Casino (1995),Crime|Drama,Casino,1995,"[Crime, Drama]"
16,17,Sense and Sensibility (1995),Drama|Romance,Sense and Sensibility,1995,"[Drama, Romance]"
...,...,...,...,...,...,...
27256,131162,Por un puñado de besos (2014),Drama|Romance,Por un puñado de besos,2014,"[Drama, Romance]"
27259,131168,Phoenix (2014),Drama,Phoenix,2014,[Drama]
27262,131174,Gentlemen (2014),Drama|Romance|Thriller,Gentlemen,2014,"[Drama, Romance, Thriller]"
27263,131176,A Second Chance (2014),Drama,A Second Chance,2014,[Drama]


In [7]:
df_links = pd.read_csv("../data/links.csv")
df_links

Unnamed: 0,imdbId,movieId,tmdbId
0,tt0114709,1,862.0
1,tt0113497,2,8844.0
2,tt0113228,3,15602.0
3,tt0114885,4,31357.0
4,tt0113041,5,11862.0
...,...,...,...
9737,tt5476944,193581,432131.0
9738,tt5914996,193583,445030.0
9739,tt6397426,193585,479308.0
9740,tt8391976,193587,483455.0


In [8]:
df_movie_abt_1 = pd.merge(df_movies, df_links, 
                                    on=['movieId'], 
                                    suffixes=('', '_DROP'), 
                                    how='outer').filter(regex='^(?!.*_DROP)')
df_movie_abt_1

Unnamed: 0,movieId,title,genres,movie_title,movie_year,movie_genre,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",tt0114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,"[Adventure, Children, Fantasy]",tt0113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,"[Comedy, Romance]",tt0113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",tt0114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,[Comedy],tt0113041,11862.0
...,...,...,...,...,...,...,...,...
28248,193581,,,,,,tt5476944,432131.0
28249,193583,,,,,,tt5914996,445030.0
28250,193585,,,,,,tt6397426,479308.0
28251,193587,,,,,,tt8391976,483455.0


In [9]:
df_tmdb = pd.read_csv("../data/tmdb_movies_data.csv")
df_tmdb

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,1.379999e+08,1.392446e+09
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,1.379999e+08,3.481613e+08
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,1.012000e+08,2.716190e+08
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,1.839999e+08,1.902723e+09
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,1.747999e+08,1.385749e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10861,21,tt0060371,0.080598,0,0,The Endless Summer,Michael Hynson|Robert August|Lord 'Tally Ho' B...,,Bruce Brown,,...,"The Endless Summer, by Bruce Brown, is one of ...",95,Documentary,Bruce Brown Films,6/15/1966,11,7.4,1966,0.000000e+00,0.000000e+00
10862,20379,tt0060472,0.065543,0,0,Grand Prix,James Garner|Eva Marie Saint|Yves Montand|Tosh...,,John Frankenheimer,Cinerama sweeps YOU into a drama of speed and ...,...,Grand Prix driver Pete Aron is fired by his te...,176,Action|Adventure|Drama,Cherokee Productions|Joel Productions|Douglas ...,12/21/1966,20,5.7,1966,0.000000e+00,0.000000e+00
10863,39768,tt0060161,0.065141,0,0,Beregis Avtomobilya,Innokentiy Smoktunovskiy|Oleg Efremov|Georgi Z...,,Eldar Ryazanov,,...,An insurance agent who moonlights as a carthie...,94,Mystery|Comedy,Mosfilm,1/1/1966,11,6.5,1966,0.000000e+00,0.000000e+00
10864,21449,tt0061177,0.064317,0,0,"What's Up, Tiger Lily?",Tatsuya Mihashi|Akiko Wakabayashi|Mie Hama|Joh...,,Woody Allen,WOODY ALLEN STRIKES BACK!,...,"In comic Woody Allen's film debut, he took the...",80,Action|Comedy,Benedict Pictures Corp.,11/2/1966,22,5.4,1966,0.000000e+00,0.000000e+00


In [11]:
df_movie_abt_2 = pd.merge(df_movie_abt_1, df_tmdb, 
                                    left_on='tmdbId', 
                                    right_on='id',
                                    suffixes=('', '_TMDB'), 
                                    how='outer')
df_movie_abt_2.loc[df_movie_abt_2["movie_title"].isnull(),'movie_title'] = df_movie_abt_2["original_title"]

df_movie_abt_2

Unnamed: 0,movieId,title,genres,movie_title,movie_year,movie_genre,imdbId,tmdbId,id,imdb_id,...,overview,runtime,genres_TMDB,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",tt0114709,862.0,862.0,tt0114709,...,Woody the cowboy is young Andyâ€™s favorite to...,81.0,Animation|Comedy|Family,Walt Disney Pictures|Pixar Animation Studios,10/30/1995,3141.0,7.5,1995.0,4.292901e+07,517950958.5
1,2.0,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,"[Adventure, Children, Fantasy]",tt0113497,8844.0,8844.0,tt0113497,...,When siblings Judy and Peter discover an encha...,104.0,Adventure|Fantasy|Family,TriStar Pictures|Teitler Film|Interscope Commu...,12/15/1995,1105.0,6.6,1995.0,9.301285e+07,376054156.1
2,3.0,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,"[Comedy, Romance]",tt0113228,15602.0,15602.0,tt0113228,...,A family wedding reignites the ancient feud be...,101.0,Romance|Comedy,Warner Bros.|Lancaster Gate,12/22/1995,45.0,6.7,1995.0,0.000000e+00,0.0
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",tt0114885,31357.0,31357.0,tt0114885,...,"Cheated on, mistreated and stepped on, the wom...",127.0,Comedy|Drama|Romance,Twentieth Century Fox Film Corporation,12/22/1995,18.0,6.1,1995.0,2.289547e+07,116555336.5
4,5.0,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,[Comedy],tt0113041,11862.0,11862.0,tt0113041,...,Just when George Banks has recovered from his ...,106.0,Comedy,Sandollar Productions|Touchstone Pictures,12/8/1995,82.0,5.7,1995.0,0.000000e+00,109581884.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32901,,,,The Ghost & Mr. Chicken,,,,,13343.0,tt0059221,...,Luther Heggs aspires to being a reporter for h...,90.0,Comedy|Family|Mystery|Romance,Universal Pictures,1/20/1966,14.0,6.1,1966.0,4.702610e+06,0.0
32902,,,,The Ugly Dachshund,,,,,20277.0,tt0061135,...,The Garrisons (Dean Jones and Suzanne Pleshett...,93.0,Comedy|Drama|Family,Walt Disney Pictures,2/16/1966,14.0,5.7,1966.0,0.000000e+00,0.0
32903,,,,Carry On Screaming!,,,,,5060.0,tt0060214,...,The sinister Dr Watt has an evil scheme going....,87.0,Comedy,Peter Rogers Productions|Anglo-Amalgamated Fil...,5/20/1966,13.0,7.0,1966.0,0.000000e+00,0.0
32904,,,,Grand Prix,,,,,20379.0,tt0060472,...,Grand Prix driver Pete Aron is fired by his te...,176.0,Action|Adventure|Drama,Cherokee Productions|Joel Productions|Douglas ...,12/21/1966,20.0,5.7,1966.0,0.000000e+00,0.0


In [12]:
df_imdb = pd.read_csv("../data/imdb_attributes.csv")
df_imdb

Unnamed: 0,tid,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations
0,tt0012349,Der Vagabund und das Kind (1921),http://www.imdb.com/title/tt0012349/,8.4,3240,1921,1,0
1,tt0015864,Goldrausch (1925),http://www.imdb.com/title/tt0015864/,8.3,5700,1925,2,1
2,tt0017136,Metropolis (1927),http://www.imdb.com/title/tt0017136/,8.4,9180,1927,3,4
3,tt0017925,Der General (1926),http://www.imdb.com/title/tt0017925/,8.3,6420,1926,1,1
4,tt0021749,Lichter der Gro√üstadt (1931),http://www.imdb.com/title/tt0021749/,8.7,5220,1931,2,0
...,...,...,...,...,...,...,...,...
14756,tt0672488,Peep Show Sectioning (TV Episode 2005),http://www.imdb.com/title/tt0672488/,7.7,1440,2005,0,0
14757,tt0675644,Playhouse 90 The Miracle Worker (TV Episode 1957),http://www.imdb.com/title/tt0675644/,7.3,5400,1957,0,0
14758,tt0679222,Private Screenings Robert Mitchum and Jane Rus...,http://www.imdb.com/title/tt0679222/,7,3600,1996,0,0
14759,tt0680064,Providence All the King's Men (TV Episode 2002),http://www.imdb.com/title/tt0680064/,,3600,2002,0,0


In [13]:
df_movie_abt_3 = pd.merge(df_movie_abt_2, df_imdb, 
                                    left_on='imdbId', 
                                    right_on='tid',
                                    suffixes=('', '_IMDB'), 
                                    how='outer')
df_movie_abt_3


Unnamed: 0,movieId,title,genres,movie_title,movie_year,movie_genre,imdbId,tmdbId,id,imdb_id,...,budget_adj,revenue_adj,tid,title_IMDB,url,imdbRating,duration,year,nrOfWins,nrOfNominations
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",tt0114709,862.0,862.0,tt0114709,...,42929006.01,517950958.5,tt0114709,Toy Story (1995),http://www.imdb.com/title/tt0114709/,8.3,4860,1995,19,13
1,2.0,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,"[Adventure, Children, Fantasy]",tt0113497,8844.0,8844.0,tt0113497,...,93012846.36,376054156.1,tt0113497,Jumanji (1995),http://www.imdb.com/title/tt0113497/,6.8,6240,1995,4,8
2,3.0,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,"[Comedy, Romance]",tt0113228,15602.0,15602.0,tt0113228,...,0.00,0.0,tt0113228,Der dritte Fr√ºhling (1995),http://www.imdb.com/title/tt0113228/,6.5,6060,1995,2,1
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",tt0114885,31357.0,31357.0,tt0114885,...,22895469.87,116555336.5,tt0114885,Waiting to Exhale - Warten auf Mr. Right (1995),http://www.imdb.com/title/tt0114885/,5.5,7440,1995,8,8
4,5.0,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,[Comedy],tt0113041,11862.0,11862.0,tt0113041,...,0.00,109581884.4,tt0113041,Ein Geschenk des Himmels - Vater der Braut 2 (...,http://www.imdb.com/title/tt0113041/,5.8,6360,1995,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42169,,,,,,,,,,,...,,,tt0672488,Peep Show Sectioning (TV Episode 2005),http://www.imdb.com/title/tt0672488/,7.7,1440,2005,0,0
42170,,,,,,,,,,,...,,,tt0675644,Playhouse 90 The Miracle Worker (TV Episode 1957),http://www.imdb.com/title/tt0675644/,7.3,5400,1957,0,0
42171,,,,,,,,,,,...,,,tt0679222,Private Screenings Robert Mitchum and Jane Rus...,http://www.imdb.com/title/tt0679222/,7,3600,1996,0,0
42172,,,,,,,,,,,...,,,tt0680064,Providence All the King's Men (TV Episode 2002),http://www.imdb.com/title/tt0680064/,,3600,2002,0,0


In [14]:
movie_json_list = json.loads(df_movie_abt_3.to_json(orient='records'))

In [17]:
movie_list = []
movie_json_list = json.loads(df_movie_abt_3.to_json(orient='records'))
for i in range(len(movie_json_list)):
    if movie_json_list[i]['movie_title'] is None:
        if movie_json_list[i]['original_title'] is not None:
            movie_json_list[i]['movie_title'] = extract_movie_title(movie_json_list[i]['original_title'])
    if movie_json_list[i]['movie_title'] is None:
        if movie_json_list[i]['title_IMDB'] is not None:
            movie_json_list[i]['movie_title'] = extract_movie_title(movie_json_list[i]['title_IMDB'])
    if movie_json_list[i]['movie_year'] is None:
        if movie_json_list[i]['release_date'] is not None:
            movie_json_list[i]['movie_year'] = movie_json_list[i]['release_year']
    if movie_json_list[i]['movie_year'] is None:
        if movie_json_list[i]['year'] is not None:
            movie_json_list[i]['movie_year'] = extract_movie_year(movie_json_list[i]['year'])
    if movie_json_list[i]['movie_title'] not in movie_list:
        movie_list.append(movie_json_list[i]['movie_title'])
len(movie_list)

38871

In [18]:
df_movie_abt = pd.DataFrame.from_dict(movie_json_list, orient = 'columns')

In [19]:
df_movie_abt.to_csv("../data/movie_abt.csv")

In [20]:
df_movie_list = pd.DataFrame.from_dict(movie_list, orient='columns')
df_movie_list

Unnamed: 0,0
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Waiting to Exhale
4,Father of the Bride Part II
...,...
38866,Peep Show Sectioning
38867,Playhouse 90 The Miracle Worker
38868,Private Screenings Robert Mitchum and Jane Rus...
38869,Providence All the King's Men


In [21]:
df_movie_list.to_csv('../data/movie_list.csv')