In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import ast
import time
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Load Dataset

In [2]:
# Load datasets
ratings = pd.read_csv('movie/ratings.csv')
# print ('Shape of the ratings data frame:', ratings.shape)

keywords = pd.read_csv('movie/keywords.csv')
print ('Shape of the keywords data frame:', keywords.shape)

movies = pd.read_csv('movie/movies_metadata.csv')
print ('Shape of the movies data frame:', movies.shape)

# links = pd.read_csv('movie/links.csv')
# print ('Shape of the links data frame:', links.shape)

credits = pd.read_csv('movie/credits.csv')
print ('Shape of the credits data frame:', credits.shape)

Shape of the keywords data frame: (46419, 2)
Shape of the movies data frame: (45466, 24)
Shape of the credits data frame: (45476, 3)


### Movies_metadata dataframe
- adult: Indicates if the movie is X-Rated or Adult.
- belongs_to_collection: A stringified dictionary that gives information on the movie series the particular film belongs to.
- budget: The budget of the movie in dollars.
- genres: A stringified list of dictionaries that list out all the genres associated with the movie.
- homepage: The Official Homepage of the move.
- id: The TMDB ID of the movie.
- imdb_id: The IMDB ID of the movie.
- original_language: The language in which the movie was originally shot in.
- original_title: The original title of the movie.
- overview: A brief blurb of the movie.
- popularity: The Popularity Score assigned by TMDB.
- poster_path: The URL of the poster image.
- production_companies: A stringified list of production companies involved with the making of the movie.
- production_countries: A stringified list of countries where the movie was shot/produced in.
- release_date: Theatrical Release Date of the movie.
- revenue: The total revenue of the movie in dollars.
- runtime: The runtime of the movie in minutes.
- spoken_languages: A stringified list of spoken languages in the film.
- status: The status of the movie (Released, To Be Released, Announced, etc.)
- tagline: The tagline of the movie.
- title: The Official Title of the movie.
- video: Indicates if there is a video present of the movie with TMDB.
- vote_average: The average rating of the movie.
- vote_count: The number of votes by users, as counted by TMDB.

In [3]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [4]:
## check the number of null values of each column
movies.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

### Rating dataframe
- userId: It is id for User
- movieId: It is TMDb movie id.
- rating: Rating given for the particular movie by specific user
- timestamp: Time stamp when rating has been given by user (we need to transform)

***Optimization by the method of list comprehension**

In [5]:
ratings = pd.read_csv('movie/ratings.csv')

In [6]:
'''
## BEFORE Optimization  (### 30.863416s)
start_time = time.time()
ratings['timestamp'] = ratings['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
end_time = time.time()
print("run_time: %f" % (end_time - start_time))
''';

In [7]:
ratings = pd.read_csv('movie/ratings.csv')

In [8]:
## AFTER Optimization (### 24.033216s)
def fun_timestamp():
    ratings['timestamp'] = [datetime.datetime.fromtimestamp(ts) for ts in ratings['timestamp']]
start_time = time.time()
fun_timestamp()
end_time = time.time()
print("run_time: %f" % (end_time - start_time))

run_time: 18.324526


Time speeds up!

According to the above results, we could see that by using list comprehension, the execution time was reduced from 30.86s to 24.03s. 

In [9]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,2015-03-09 18:52:09
1,1,147,4.5,2015-03-09 19:07:15
2,1,858,5.0,2015-03-09 18:52:03
3,1,1221,5.0,2015-03-09 18:52:26
4,1,1246,5.0,2015-03-09 18:52:36


In [10]:
'''
%%writefile movieid_min.py

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

#Comm.Scatter

if rank == 0:
    #initialize 
    data = np.array(ratings['movieId'])     
else:
    data = None

buf = np.empty(int(ratings.shape[0]/size),dtype='f')

#Scatter the data to all processes
comm.Scatter(data, buf, root = 0)

#local operations
local_min = min(buf)
local_max = max(buf)
# print('Process {}, local min = {:f}'.format(rank, local_min))

#Reduce local min and max to global min and max
global_min = comm.allreduce(local_min, MPI.MIN)
global_max = comm.allreduce(local_max, MPI.MAX)

if rank == 0:
    print('Min = ', global_min, ', Max = ', global_max)
''';

In [11]:
'''
!mpirun -n 16 python3 movieid_min.py
''';

In [12]:
ratings['movieId'].min()

1

In [13]:
ratings['movieId'].max()

176275

In [14]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,26024290.0,26024290.0,26024290.0
mean,135037.1,15849.11,3.52809
std,78176.2,31085.26,1.065443
min,1.0,1.0,0.5
25%,67164.0,1073.0,3.0
50%,135163.0,2583.0,3.5
75%,202693.0,6503.0,4.0
max,270896.0,176275.0,5.0


### Link dataframe
- movieId: It's a serial number for movie
- imbdId: Movie id given on IMDb platform
- tmdbId: Movie id given on TMDb platform. (it also appears in other dataframe as name "id")

In [15]:
links = pd.read_csv('movie/links.csv')
links.head(n=10)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


### Keywords dataframe
- id: It's movie ID given by TMDb
- Keywords: Tags/keywords for the movie. It list of tags/keywords

In [16]:
keywords.head(5)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### Credit dataframe
- cast: Information about casting. Name of actor, gender and it's character name in movie
- crew: Information about crew members. Like who directed the movie, editor of the movie and so on.
- id: It's movie ID given by TMDb

In [17]:
credits.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


## Pre-processing

In [18]:
movies = pd.read_csv('movie/movies_metadata.csv')

In [19]:
## change "True" to 1 and "False" to 0 in 'adult' column
f = lambda x:1 if x == True else 0
movies['adult'] = movies['adult'].apply(f)


In [20]:
g = movies[['id', 'genres']]
genlist = []

for i in range(g.shape[0]):
    gen = ast.literal_eval(g.iloc[i]['genres'])
    for each in gen:
        each['id'] = g['id'][i]
    genlist.extend(gen)
genre = pd.DataFrame(genlist)
genre = genre.rename(columns={'name':'genre'})
genre = genre[['id', 'genre']]
genre['tmp'] = 1
pivot = genre.pivot_table('tmp','id', 'genre', fill_value=0)
flattened = pd.DataFrame(pivot.to_records())
movies = pd.merge(movies, flattened, on = 'id', how = 'left')
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'Action', 'Adventure', 'Animation',
       'Aniplex', 'BROSTA TV', 'Carousel Productions', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'GoHands',
       'History', 'Horror', 'Mardock Scramble Production Committee', 'Music',
       'Mystery', 'Odyssey Media', 'Pulser Productions', 'Rogue State',
       'Romance', 'Science Fiction', 'Sentai Filmworks', 'TV Movie',
       'Telescene Film Group Productions', 'The Cartel', 'Thriller',
       'Vision View Entertainment', 'War', 'Western'],
      dtype='object')

In [22]:
movies = movies[movies.id!='1997-08-20']
movies = movies[movies.id!='2012-09-29']
movies = movies[movies.id!='2014-01-01']

In [23]:
movies['id'] = movies['id'].astype(int)

***Optimize the algorithm of filtering out casts and crews from the credits dataset**

* Before Optimization: tried two ways

In [24]:
'''
## First trial: for loop (### 80.125805s)
credits = pd.read_csv('movie/credits.csv')

def filter_cast(dat):
    all_casts = []
    for i in range(dat.shape[0]):
        cast = ast.literal_eval(dat.iloc[i]['cast'])  
        for x in cast:
            x['movie_id'] = dat['id'][i]
        all_casts.extend(cast)
    return pd.DataFrame(all_casts)
def filter_crew(dat):
    all_crews = []
    for i in range(dat.shape[0]):
        crew = ast.literal_eval(dat.iloc[i]['crew'])
        for x in crew:
            x['movie_id'] = dat['id'][i]
        all_crews.extend(crew)
    return pd.DataFrame(all_crews)

start_time = time.time()

all_casts = filter_cast(credits)
all_crews = filter_crew(credits)

end_time = time.time()
print("run_time: %f" % (end_time - start_time))

''';

In [25]:
'''
## Second trial vectorization and the application of python built-in function (### 99.226750s)
credits = pd.read_csv('movie/credits.csv')

credits['cast'] = [eval(x) for x in credits['cast'] ]
credits['crew'] = [eval(x) for x in credits['crew'] ]
credits.rename(columns={'id':'movie_id'}, inplace=True)

def filter_order0(x):
    return list(filter(lambda order0: order0['order'] == 0, x))
def filter_job0(x):
    return list(filter(lambda job0: job0['job'] == 'Director', x))

def filter_fun(y):
    sub_dat = credits[[y, 'movie_id']]
    if y == 'cast':
        sub_dat[y] = [filter_order0(x) for x in sub_dat[y] ]
    else:
        sub_dat[y] = [filter_job0(x) for x in sub_dat[y] ]
    sub_dat = sub_dat[sub_dat[y].map(lambda d: len(d)) > 0] ##remove empty list rows
    sub_dat = sub_dat.apply(lambda x: x.apply(pd.Series).stack()).reset_index()
    sub_dat = pd.concat([sub_dat.drop(y, axis=1), sub_dat[y].apply(pd.Series)], axis=1)
    sub_dat = sub_dat.fillna(method='ffill')
    return sub_dat
  
start_time = time.time()

all_casts = filter_fun('cast')
all_crews = filter_fun('crew')
end_time = time.time()
print("run_time: %f" % (end_time - start_time))

''';

***Performance tuning: line_profiler**

* According to the above result we could see that the for loop did better than the vectorization method. In order to further imporve the efficiency the for loop method, we apply the line_profiler to time the execution of each individual line inside the function, so that we could optimize the specific line

In [26]:
%load_ext line_profiler

In [27]:
'''
credits = pd.read_csv('movie/credits.csv')

def filter_cast(dat):
    all_casts = []
    for i in range(dat.shape[0]):
        cast = ast.literal_eval(dat.iloc[i]['cast'])
        for x in cast:
            x['movie_id'] = dat['id'][i]
        all_casts.extend(cast)
    return pd.DataFrame(all_casts)

def filter_crew(dat):
    all_crews = []
    for i in range(dat.shape[0]):
        crew = ast.literal_eval(dat.iloc[i]['crew'])
        for x in crew:
            x['movie_id'] = dat['id'][i]
        all_crews.extend(crew)
    return pd.DataFrame(all_crews)
    
''';

In [28]:
'''
%lprun -f filter_cast filter_cast(credits)

''';

* From the line_profiler results, we could see that ast.literal_eval consumes a lot of time, so we want to work on this line to optimize the whole process

In [29]:
'''
### AFTER Optimization:  ( ### 48.237104s )
credits = pd.read_csv('movie/credits.csv')
def filter_fun(dat, y):
    filter_result = []
    for i in range(dat.shape[0]):
        sub_dat = eval(dat[y][i])
        for x in sub_dat:
            x['movie_id'] = dat['id'][i]
        filter_result.extend(sub_dat)
    return pd.DataFrame(filter_result)

start_time = time.time()

all_casts = filter_fun(credits, 'cast')
all_crews = filter_fun(credits, 'crew')

end_time = time.time()
print("run_time: %f" % (end_time - start_time))
''';

In [30]:
'''
credits = pd.read_csv('movie/credits.csv')
def filter_fun(dat, y):
    filter_result = []
    for i in range(dat.shape[0]):
        sub_dat = eval(dat[y][i])
        for x in sub_dat:
            x['movie_id'] = dat['id'][i]
        filter_result.extend(sub_dat)
    return pd.DataFrame(filter_result)
    
''';

In [31]:
'''
%lprun -f filter_fun filter_fun(credits, 'cast')
''';

Time speeds up a lot!

According to the above results, we could see that the time was reduced from 80.126s to 48.237s. The speed of execution was improved for about 40%. The line_profiler result also indicated that the time of executing the eval function is much less than the time of executing the ast.literal_eval function.

In [32]:
credits = pd.read_csv('movie/credits.csv')
def filter_fun(dat, y):
    filter_result = []
    for i in range(dat.shape[0]):
        sub_dat = eval(dat[y][i])
        for x in sub_dat:
            x['movie_id'] = dat['id'][i]
        filter_result.extend(sub_dat)
    return pd.DataFrame(filter_result)

all_casts = filter_fun(credits, 'cast')
all_crews = filter_fun(credits, 'crew')

cast = all_casts[['name', 'order', 'movie_id']]
cast.head(15)

Unnamed: 0,name,order,movie_id
0,Tom Hanks,0,862
1,Tim Allen,1,862
2,Don Rickles,2,862
3,Jim Varney,3,862
4,Wallace Shawn,4,862
5,John Ratzenberger,5,862
6,Annie Potts,6,862
7,John Morris,7,862
8,Erik von Detten,8,862
9,Laurie Metcalf,9,862


In [33]:
crew = all_crews[['name', 'job', 'movie_id']]
crew.head(10)

Unnamed: 0,name,job,movie_id
0,John Lasseter,Director,862
1,Joss Whedon,Screenplay,862
2,Andrew Stanton,Screenplay,862
3,Joel Cohen,Screenplay,862
4,Alec Sokolow,Screenplay,862
5,Bonnie Arnold,Producer,862
6,Ed Catmull,Executive Producer,862
7,Ralph Guggenheim,Producer,862
8,Steve Jobs,Executive Producer,862
9,Lee Unkrich,Editor,862


In [34]:
## from cast dataframe, we only extract the leader as "order" = 0 for each "id". 
lead = cast[cast['order'] == 0]
lead = lead[['movie_id', 'name']]
lead = lead.rename(columns={'movie_id':'id'})
lead = lead.rename(columns={'name':'lead'})
lead.head(5)


Unnamed: 0,id,lead
0,862,Tom Hanks
13,8844,Robin Williams
39,15602,Walter Matthau
46,31357,Whitney Houston
56,11862,Steve Martin


In [35]:
# merge to "movies" dataframe
lead['id'] = lead['id'].astype('int')
movies = movies.merge(lead, on = 'id', how='left')


In [36]:
## we only extract 'Director'
director = crew[crew['job'] == 'Director']
director = director[['movie_id', 'name']]
director = director.rename(columns={'movie_id':'id', 'name':'director'})
director.head(5)

Unnamed: 0,id,director
0,862,John Lasseter
109,8844,Joe Johnston
122,15602,Howard Deutch
126,31357,Forest Whitaker
141,11862,Charles Shyer


In [37]:
## merge to "movies" dataframe
director['id'] = director['id'].astype('int')
movies = movies.merge(director, on = 'id', how='left')

In [38]:
## separate 'production_companies' column
movies['production_companies'] = movies['production_companies'].fillna('[]').apply(ast.literal_eval)
movies['production_companies'] = movies['production_companies'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else np.nan)


In [39]:
## separate 'production_countries' column
movies['production_countries'] = movies['production_countries'].fillna('[]').apply(ast.literal_eval)
movies['production_countries'] = movies['production_countries'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else np.nan)


In [40]:
## pre-processing 'spoken_languages'
movies['spoken_languages'] = movies['spoken_languages'].fillna('[]').apply(ast.literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else np.nan)


In [41]:
## pre-processing 'genres' column 
features = ['genres']
for feature in features:
    movies[feature] = movies[feature].apply(ast.literal_eval)

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return np.nan

for feature in features:
    movies[feature] = movies[feature].apply(get_list)

In [42]:
movies = movies.dropna(subset=['release_date'], axis=0)
movies['release_date'] = movies['release_date'].fillna(0)

In [43]:
movies.isnull().sum()

adult                                        0
belongs_to_collection                    44855
budget                                       0
genres                                       0
homepage                                 41204
id                                           0
imdb_id                                     15
original_language                           15
original_title                               0
overview                                  1003
popularity                                   0
poster_path                                361
production_companies                         0
production_countries                         0
release_date                                 0
revenue                                      0
runtime                                    260
spoken_languages                             0
status                                      86
tagline                                  27789
title                                        0
video        

In [44]:
# Pre-processing step for getting year from date by splliting it using '-'
movies['year'] = pd.to_datetime(movies['release_date']).apply(
    lambda x: str(x).split('-')[0] if x != np.nan else np.nan)


In [45]:
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western,lead,director,year
0,0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tom Hanks,John Lasseter,1995
1,0,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Robin Williams,Joe Johnston,1995
2,0,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Walter Matthau,Howard Deutch,1995
3,0,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Whitney Houston,Forest Whitaker,1995
4,0,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Steve Martin,Charles Shyer,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50161,0,,0,"[Drama, Action, Romance]",,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Patrick Bergin,John Irvin,1991
50163,0,,0,[Drama],,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Angel Aquino,Lav Diaz,2011
50164,0,,0,"[Action, Drama, Thriller]",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Erika Eleniak,Mark L. Lester,2003
50165,0,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,,,,,,,,Iwan Mosschuchin,Yakov Protazanov,1917


#### Merge keywords dataframe to movies

In [46]:
keywords['id'] = keywords['id'].astype('int')

In [47]:
## separate 'keywords' column
movies = movies.merge(keywords,on=["id"], how='left')
movies['keywords'] = movies['keywords'].fillna('[]').apply(ast.literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else np.nan)


In [48]:
movies.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western,lead,director,year,keywords
0,0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0.0,0.0,0.0,0.0,0.0,0.0,Tom Hanks,John Lasseter,1995,"[jealousy, toy, boy, friendship, friends, riva..."
1,0,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0.0,0.0,0.0,0.0,0.0,0.0,Robin Williams,Joe Johnston,1995,"[board game, disappearance, based on children'..."
2,0,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,0.0,0.0,0.0,0.0,0.0,Walter Matthau,Howard Deutch,1995,"[fishing, best friend, duringcreditsstinger, o..."
3,0,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0.0,0.0,0.0,0.0,0.0,0.0,Whitney Houston,Forest Whitaker,1995,"[based on novel, interracial relationship, sin..."
4,0,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0.0,0.0,0.0,0.0,0.0,0.0,Steve Martin,Charles Shyer,1995,"[baby, midlife crisis, confidence, aging, daug..."


In [49]:
movies.shape

(51429, 60)

In [50]:
movies.isnull().sum() 

adult                                        0
belongs_to_collection                    46085
budget                                       0
genres                                       0
homepage                                 42276
id                                           0
imdb_id                                     15
original_language                           15
original_title                               0
overview                                  1047
popularity                                   0
poster_path                                373
production_companies                         0
production_countries                         0
release_date                                 0
revenue                                      0
runtime                                    268
spoken_languages                             0
status                                      87
tagline                                  28674
title                                        0
video        

In [51]:
## replace null values in some categorical columns with 0
movies['tagline'].fillna(0, inplace = True)
movies['title'].fillna(0, inplace = True)
movies['overview'].fillna(0, inplace = True)
movies['lead'].fillna('0', inplace = True)
movies['director'].fillna('0', inplace = True)

***Optimization by the method of multiprocessing (pooling)**

In [52]:
# ## replace 0 in column 'budget' with mean
# movies['budget'] = movies['budget'].astype('int')
# movies['budget'] = movies['budget'].replace(0, movies['budget'].mean())
# movies['budget'] = round(movies['budget'], 0)

# ## replace nan in column 'revenue' with mean 
# movies['revenue'].fillna('0', inplace = True)
# movies['revenue'] = movies['revenue'].astype('int')
# movies['revenue'] = movies['revenue'].replace(0, movies['revenue'].mean())

# ## replace nan in column 'vote_average' with mean
# movies['vote_average'].fillna('0', inplace = True)
# movies['vote_average'] = movies['vote_average'].astype('float')
# movies['vote_average'] = movies['vote_average'].replace(0, movies['vote_average'].mean())
# movies['vote_average'] = round(movies['vote_average'], 1)

# ## replace nan in column 'vote_count' with mean
# movies['vote_count'].fillna('0', inplace = True)
# movies['vote_count'] = movies['vote_count'].astype('int')
# movies['vote_count'] = movies['vote_count'].replace(0, movies['vote_count'].mean())
# movies['vote_count'] = round(movies['vote_count'], 0)

# ## replace nan in column 'popularity' with mean
# movies['popularity'].fillna('0', inplace = True)
# movies['popularity'] = movies['popularity'].astype('float')
# movies['popularity'] = movies['popularity'].replace(0, movies['popularity'].mean())
# movies['popularity'] = round(movies['popularity'], 1)

# ## replace nan in column 'runtime' with mean
# movies['runtime'].fillna('0', inplace = True)
# movies['runtime'] = movies['runtime'].astype('int')
# movies['runtime'] = movies['runtime'].replace(0, movies['runtime'].mean())
# movies['runtime'] = round(movies['runtime'], 0)

In [53]:
from multiprocessing import cpu_count
print("number of CPU cores:", cpu_count())

number of CPU cores: 12


In [54]:
from time import time
from multiprocessing.pool import Pool
c = ["budget","revenue",'vote_average','vote_count','popularity','runtime']
d = ['int','int','float','int','float','int']
z = zip(c,d)

def setup_fillna(column):
    col = column[0]
    movies[col].fillna('0', inplace = True)
    movies[col] = movies[col].astype(column[1])
    movies[col] = movies[col].replace(0, movies[col].mean())
    movies[col] = round(movies[col], 0)

def multi_processes_fillna():
    ts = time()   
    with Pool(12) as p:
        p.map(setup_fillna, z)
        
    print('Took {}s'.format(time() - ts))

multi_processes_fillna()

Took 0.30428099632263184s


The speed improved by 50%.

In [55]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'Action', 'Adventure', 'Animation',
       'Aniplex', 'BROSTA TV', 'Carousel Productions', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'GoHands',
       'History', 'Horror', 'Mardock Scramble Production Committee', 'Music',
       'Mystery', 'Odyssey Media', 'Pulser Productions', 'Rogue State',
       'Romance', 'Science Fiction', 'Sentai Filmworks', 'TV Movie',
       'Telescene Film Group Productions', 'The Cartel', 'Thriller',
       'Vision View Entertainment', 'War', 'Western', 'lead', 'director',
       'year', 'keywords'],
      dtype='object')

### To determine the target variable
- weighting rating: $wr =\frac{v}{v+m}R+\frac{m}{v+m}C$
- v: vote_count
- m: minimum vote count
- R: vote_average
- C: mean of vote_average

In [56]:
'''
%%writefile vote_avg_mean.py

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

#Comm.Scatter

if rank == 0:
    #initialize 
    data = np.array(movies['vote_average'])     
else:
    data = None

buf = np.empty(int(movies.shape[0]/size),dtype='f')

#Scatter the data to all processes
comm.Scatter(data, buf, root = 0)

#local operations
local_sum = sum(buf)

#Reduce local sum to global sum
global_sum = comm.allreduce(local_sum, MPI.SUM)
C = global_sum/movies.shape[0]

if rank == 0:
    print('Mean = ', global_sum/movies.shape[0])
    ''';

In [57]:
'''
!mpirun -n 16 python3 vote_avg_mean.py
''';

In [58]:
C = movies['vote_average'].mean()

m = movies['vote_count'].quantile(0.75)

In [59]:
m

34.0

***Optimization by the method of vectorization**

In [60]:
from time import time
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return round((v/(v+m) * R) + (m/(m+v) * C), 1)

start_time = time()
movies['score'] = movies.apply(weighted_rating, axis=1)
end_time = time()

print("run_time: %f" % (end_time - start_time))

run_time: 1.595829


In [61]:
start_time = time()
v = movies['vote_count']
R = movies['vote_average']

movies['score'] = round((v/(v+m) * R) + (m/(m+v) * C), 1)
end_time = time()

print("run_time: %f" % (end_time - start_time))

run_time: 0.052843


The speed has improved by 95%

In [62]:
movies = movies.sort_values('id')
movies = movies.drop_duplicates("id")

In [63]:
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,The Cartel,Thriller,Vision View Entertainment,War,Western,lead,director,year,keywords,score
4687,0,,0,"[Drama, Crime]",,2,tt0094675,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,...,0.0,0.0,0.0,0.0,0.0,Turo Pajala,Aki Kaurismäki,1988,"[underdog, prison, factory worker, prisoner, h...",6.5
14029,0,,0,"[Drama, Comedy]",,3,tt0092149,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",...,0.0,0.0,0.0,0.0,0.0,Matti Pellonpää,Aki Kaurismäki,1986,"[salesclerk, helsinki, garbage, independent film]",6.4
20,0,,4000000,"[Crime, Comedy]",,5,tt0113101,en,Four Rooms,It's Ted the Bellhop's first night on the job....,...,0.0,0.0,0.0,0.0,0.0,Tim Roth,Quentin Tarantino,1995,"[hotel, new year's eve, witch, bet, hotel room...",6.4
491,0,,0,"[Action, Thriller, Crime]",,6,tt0107286,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",...,0.0,1.0,0.0,0.0,0.0,Emilio Estevez,Stephen Hopkins,1993,"[chicago, drug dealer, boxing match, escape, o...",6.2
266,0,"{'id': 10, 'name': 'Star Wars Collection', 'po...",11000000,"[Adventure, Action, Science Fiction]",http://www.starwars.com/films/star-wars-episod...,11,tt0076759,en,Star Wars,Princess Leia is captured and held hostage by ...,...,0.0,0.0,0.0,0.0,0.0,Mark Hamill,George Lucas,1977,"[android, galaxy, hermit, death star, lightsab...",8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50981,0,,0,"[Fantasy, Drama]",,465044,tt5943940,en,Abduction,A horror comedy spoofing conspiracy theory mov...,...,0.0,0.0,0.0,0.0,0.0,0,Molly Smith,2017,[],5.6
51227,0,,0,[Drama],,467731,tt0507700,en,Tragedy in a Temporary Town,Fifteen-year-old girl Dotty Fisher is assaulte...,...,0.0,0.0,0.0,0.0,0.0,0,Sidney Lumet,1956,[],5.6
24200,0,,0,"[Drama, Romance]",,468343,tt0133202,fi,Silja - nuorena nukkunut,"In the 1910s, beautiful young Silja loses both...",...,0.0,0.0,0.0,0.0,0.0,0,Jack Witikka,1956,[],5.6
51358,0,,1254040,"[Romance, Comedy]",http://lmtr.fi/,468707,tt5742932,fi,Lauri Mäntyvaaran tuuheet ripset,0,...,0.0,0.0,0.0,0.0,0.0,0,Hannaleena Hauru,2017,"[fantasy, youth, weird]",5.7


### Final dataset with selected features

In [64]:
metadata = movies[['id', 'title', 'runtime', 'budget', 'revenue', 
                   'popularity', 'year', 'lead', 
  'director','production_companies', 'production_countries', 'overview', 'genres', 
  'Action', 'Adventure', 'Animation',
       'Aniplex', 'BROSTA TV', 'Carousel Productions', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'GoHands',
       'History', 'Horror', 'Mardock Scramble Production Committee', 'Music',
       'Mystery', 'Odyssey Media', 'Pulser Productions', 'Rogue State',
       'Romance', 'Science Fiction', 'Sentai Filmworks', 'TV Movie',
       'Telescene Film Group Productions', 'The Cartel', 'Thriller',
       'Vision View Entertainment', 'War', 'Western', 'tagline',
  'keywords', 'vote_average', 'vote_count', 'score']]

In [65]:
metadata['genres']

4687                           [Drama, Crime]
14029                         [Drama, Comedy]
20                            [Crime, Comedy]
491                 [Action, Thriller, Crime]
266      [Adventure, Action, Science Fiction]
                         ...                 
50981                        [Fantasy, Drama]
51227                                 [Drama]
24200                        [Drama, Romance]
51358                       [Romance, Comedy]
22258                        [Fantasy, Drama]
Name: genres, Length: 45346, dtype: object

In [66]:
metadata.isnull().sum()

id                                          0
title                                       0
runtime                                   246
budget                                      0
revenue                                     0
popularity                                  0
year                                        0
lead                                        0
director                                    0
production_companies                        0
production_countries                        0
overview                                    0
genres                                      0
Action                                   2384
Adventure                                2384
Animation                                2384
Aniplex                                  2384
BROSTA TV                                2384
Carousel Productions                     2384
Comedy                                   2384
Crime                                    2384
Documentary                       

In [67]:
metadata = metadata.set_index("id")

In [68]:
metadata

Unnamed: 0_level_0,title,runtime,budget,revenue,popularity,year,lead,director,production_companies,production_countries,...,The Cartel,Thriller,Vision View Entertainment,War,Western,tagline,keywords,vote_average,vote_count,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Ariel,69.0,0,0.0,3.86049,1988,Turo Pajala,Aki Kaurismäki,"[Villealfa Filmproduction Oy, Finnish Film Fou...",[Finland],...,0.0,0.0,0.0,0.0,0.0,0,"[underdog, prison, factory worker, prisoner, h...",7.1,44.0,6.5
3,Shadows in Paradise,76.0,0,0.0,2.29211,1986,Matti Pellonpää,Aki Kaurismäki,[Villealfa Filmproduction Oy],[Finland],...,0.0,0.0,0.0,0.0,0.0,0,"[salesclerk, helsinki, garbage, independent film]",7.1,35.0,6.4
5,Four Rooms,98.0,4000000,4300000.0,9.02659,1995,Tim Roth,Quentin Tarantino,"[Miramax Films, A Band Apart]",[United States of America],...,0.0,0.0,0.0,0.0,0.0,Twelve outrageous guests. Four scandalous requ...,"[hotel, new year's eve, witch, bet, hotel room...",6.5,539.0,6.4
6,Judgment Night,110.0,0,12136938.0,5.53867,1993,Emilio Estevez,Stephen Hopkins,"[Universal Pictures, Largo Entertainment, JVC ...","[Japan, United States of America]",...,0.0,1.0,0.0,0.0,0.0,Don't move. Don't whisper. Don't even breathe.,"[chicago, drug dealer, boxing match, escape, o...",6.4,79.0,6.2
11,Star Wars,121.0,11000000,775398007.0,42.1497,1977,Mark Hamill,George Lucas,"[Lucasfilm, Twentieth Century Fox Film Corpora...",[United States of America],...,0.0,0.0,0.0,0.0,0.0,"A long time ago in a galaxy far, far away...","[android, galaxy, hermit, death star, lightsab...",8.1,6778.0,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465044,Abduction,90.0,0,0.0,0.281008,2017,0,Molly Smith,[],[United Kingdom],...,0.0,0.0,0.0,0.0,0.0,Horrifically Funny,[],0.0,0.0,5.6
467731,Tragedy in a Temporary Town,60.0,0,0.0,0.001189,1956,0,Sidney Lumet,[],[United States of America],...,0.0,0.0,0.0,0.0,0.0,0,[],0.0,0.0,5.6
468343,Silja - nuorena nukkunut,87.0,0,0.0,0.001202,1956,0,Jack Witikka,[],[Finland],...,0.0,0.0,0.0,0.0,0.0,0,[],0.0,0.0,5.6
468707,Thick Lashes of Lauri Mäntyvaara,90.0,1254040,0.0,0.347806,2017,0,Hannaleena Hauru,[Elokuvayhtiö Oy Aamu],[Finland],...,0.0,0.0,0.0,0.0,0.0,0,"[fantasy, youth, weird]",8.0,1.0,5.7


In [69]:
metadata[['Action', 'Adventure', 'Animation', 'Aniplex', 'BROSTA TV',
       'Carousel Productions', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'Foreign', 'GoHands', 'History', 'Horror',
       'Mardock Scramble Production Committee', 'Music', 'Mystery',
       'Odyssey Media', 'Pulser Productions', 'Rogue State', 'Romance',
       'Science Fiction', 'Sentai Filmworks', 'TV Movie',
       'Telescene Film Group Productions', 'The Cartel', 'Thriller',
       'Vision View Entertainment', 'War', 'Western']] = metadata[['Action', 'Adventure', 'Animation', 'Aniplex', 'BROSTA TV',
       'Carousel Productions', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'Foreign', 'GoHands', 'History', 'Horror',
       'Mardock Scramble Production Committee', 'Music', 'Mystery',
       'Odyssey Media', 'Pulser Productions', 'Rogue State', 'Romance',
       'Science Fiction', 'Sentai Filmworks', 'TV Movie',
       'Telescene Film Group Productions', 'The Cartel', 'Thriller',
       'Vision View Entertainment', 'War', 'Western']].fillna(0)

### Export the final dataset for exploration in the next

In [70]:

metadata.to_csv("final_metadata.csv")