In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbn
import string
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords


%matplotlib inline

In [2]:
df = pd.read_csv('D:\\ProjectData\\TMDB\\tmdb-movies.csv')

# Data Evaluation and Cleanup Process

## Basic Cleanup

Steps taken to evaluate and clean data:

1. Basic evaluation of dataframe; 
2. Drop unnecessary dataframe columns;
3. Evaluate and populate or drop duplicates;
5. Round the amount decimals used, and convert columns to appropriate datatypes;

### Basic Dataframe Evaluation

In [3]:
df.shape

(10866, 21)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
id                      10866 non-null int64
imdb_id                 10856 non-null object
popularity              10866 non-null float64
budget                  10866 non-null int64
revenue                 10866 non-null int64
original_title          10866 non-null object
cast                    10790 non-null object
homepage                2936 non-null object
director                10822 non-null object
tagline                 8042 non-null object
keywords                9373 non-null object
overview                10862 non-null object
runtime                 10866 non-null int64
genres                  10843 non-null object
production_companies    9836 non-null object
release_date            10866 non-null object
vote_count              10866 non-null int64
vote_average            10866 non-null float64
release_year            10866 non-null int64
budget_adj              1

In [5]:
df.head()

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999900.0,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/15,2480,6.3,2015,101200000.0,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/15,5292,7.5,2015,183999900.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/15,2947,7.3,2015,174799900.0,1385749000.0


### Not-Needed Column Removals

To make the dataset more manageable, I've removed the ID, Homepage, Tagline and Overview columns.

In [6]:
cols_list = list(df.columns.values)
cols_list

['id',
 'imdb_id',
 'popularity',
 'budget',
 'revenue',
 'original_title',
 'cast',
 'homepage',
 'director',
 'tagline',
 'keywords',
 'overview',
 'runtime',
 'genres',
 'production_companies',
 'release_date',
 'vote_count',
 'vote_average',
 'release_year',
 'budget_adj',
 'revenue_adj']

In [7]:
#remove ID, homepage, tagline and overview columns

cols_keep = [
 'popularity',
 'budget',
 'revenue',
 'original_title',
 'cast',
 'director',
 'keywords',
 'runtime',
 'genres',
 'production_companies',
 'release_date',
 'vote_count',
 'vote_average',
 'release_year',
 'budget_adj',
 'revenue_adj',
 'overview']

df = df[cols_keep]
df.head()

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
0,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,monster|dna|tyrannosaurus rex|velociraptor|island,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0,Twenty-two years after the events of Jurassic ...
1,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,future|chase|post-apocalyptic|dystopia|australia,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999900.0,348161300.0,An apocalyptic story set in the furthest reach...
2,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,based on novel|revolution|dystopia|sequel|dyst...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/15,2480,6.3,2015,101200000.0,271619000.0,Beatrice Prior must confront her inner demons ...
3,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,android|spaceship|jedi|space opera|3d,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/15,5292,7.5,2015,183999900.0,1902723000.0,Thirty years after defeating the Galactic Empi...
4,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,car race|speed|revenge|suspense|car,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/15,2947,7.3,2015,174799900.0,1385749000.0,Deckard Shaw seeks revenge against Dominic Tor...


### Duplicate Evaluation

First step is to evaluate the amount of duplicates to see how reliable the dataset is. 

In [8]:
sum(df.duplicated())

1

Because there is only one duplicated entry, I will remove this entry from the dataframe.

In [9]:
duplicated_entry = df[df.duplicated() == True]
duplicated_entry

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
2090,0.59643,30000000,967000,TEKKEN,Jon Foo|Kelly Overton|Cary-Hiroyuki Tagawa|Ian...,Dwight H. Little,martial arts|dystopia|based on video game|mart...,92,Crime|Drama|Action|Thriller|Science Fiction,Namco|Light Song Films,3/20/10,110,5.0,2010,30000000.0,967000.0,"In the year of 2039, after World Wars destroy ..."


In [10]:
df.drop_duplicates(inplace = True)

In [11]:
df.reindex()

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
0,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,monster|dna|tyrannosaurus rex|velociraptor|island,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,1.379999e+08,1.392446e+09,Twenty-two years after the events of Jurassic ...
1,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,future|chase|post-apocalyptic|dystopia|australia,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,1.379999e+08,3.481613e+08,An apocalyptic story set in the furthest reach...
2,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,based on novel|revolution|dystopia|sequel|dyst...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/15,2480,6.3,2015,1.012000e+08,2.716190e+08,Beatrice Prior must confront her inner demons ...
3,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,android|spaceship|jedi|space opera|3d,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/15,5292,7.5,2015,1.839999e+08,1.902723e+09,Thirty years after defeating the Galactic Empi...
4,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,car race|speed|revenge|suspense|car,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/15,2947,7.3,2015,1.747999e+08,1.385749e+09,Deckard Shaw seeks revenge against Dominic Tor...
5,9.110700,135000000,532950503,The Revenant,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,father-son relationship|rape|based on novel|mo...,156,Western|Drama|Adventure|Thriller,Regency Enterprises|Appian Way|CatchPlay|Anony...,12/25/15,3929,7.2,2015,1.241999e+08,4.903142e+08,"In the 1820s, a frontiersman, Hugh Glass, sets..."
6,8.654359,155000000,440603537,Terminator Genisys,Arnold Schwarzenegger|Jason Clarke|Emilia Clar...,Alan Taylor,saving the world|artificial intelligence|cybor...,125,Science Fiction|Action|Thriller|Adventure,Paramount Pictures|Skydance Productions,6/23/15,2598,5.8,2015,1.425999e+08,4.053551e+08,"The year is 2029. John Connor, leader of the r..."
7,7.667400,108000000,595380321,The Martian,Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...,Ridley Scott,based on novel|mars|nasa|isolation|botanist,141,Drama|Adventure|Science Fiction,Twentieth Century Fox Film Corporation|Scott F...,9/30/15,4572,7.6,2015,9.935996e+07,5.477497e+08,"During a manned mission to Mars, Astronaut Mar..."
8,7.404165,74000000,1156730962,Minions,Sandra Bullock|Jon Hamm|Michael Keaton|Allison...,Kyle Balda|Pierre Coffin,assistant|aftercreditsstinger|duringcreditssti...,91,Family|Animation|Adventure|Comedy,Universal Pictures|Illumination Entertainment,6/17/15,2893,6.5,2015,6.807997e+07,1.064192e+09,"Minions Stuart, Kevin and Bob are recruited by..."
9,6.326804,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,dream|cartoon|imaginary friend|animation|kid,94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,6/9/15,3935,8.0,2015,1.609999e+08,7.854116e+08,"Growing up can be a bumpy road, and it's no ex..."


### Datatype Conversion and Roundings 

The dollar values for the adjusted budget and revenue columns only need to be accurate to the nearest dollar, so I will be rounding the adjusted budget and adjusted revenue columns so that there are no decimals, and converting the datatypes to integers. At the same time, the release date column will be converted to pandas datetime datatype so that I can take advantage of pre-built functionality. As part of the transformation process, I will also be rounding the popularity score and vote average columns to two decimals. 

In [12]:
#round popularity, vote_average, budget_adj and revenue_adj columns
df = df.round({'popularity' : 2, 'vote_average' : 2, 'budget_adj' : 0, 'revenue_adj' : 0})

In [13]:
#convert budget_adj and revenue_adj to int
int_columns = ['budget_adj', 'revenue_adj']
df[int_columns] = df[int_columns].astype(int)

#convert release_date to datetime
df['release_date'] = pd.to_datetime(df['release_date'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10865 entries, 0 to 10865
Data columns (total 17 columns):
popularity              10865 non-null float64
budget                  10865 non-null int64
revenue                 10865 non-null int64
original_title          10865 non-null object
cast                    10789 non-null object
director                10821 non-null object
keywords                9372 non-null object
runtime                 10865 non-null int64
genres                  10842 non-null object
production_companies    9835 non-null object
release_date            10865 non-null datetime64[ns]
vote_count              10865 non-null int64
vote_average            10865 non-null float64
release_year            10865 non-null int64
budget_adj              10865 non-null int32
revenue_adj             10865 non-null int32
overview                10861 non-null object
dtypes: datetime64[ns](1), float64(2), int32(2), int64(5), object(7)
memory usage: 1.4+ MB


In [14]:
#save cleaned-up data
df.to_csv('D:\\ProjectData\\TMDB\\clean_tmdb-movies.csv', index=False)

## Addressing Missing Values

### Basic Evaluation

In [15]:
df = pd.read_csv('D:\\ProjectData\\TMDB\\clean_tmdb-movies.csv')

In [16]:
#Check to see amount and percentage of null values

sum_null_values = df.isnull().sum() #sum of null values
sum_null_values

popularity                 0
budget                     0
revenue                    0
original_title             0
cast                      76
director                  44
keywords                1493
runtime                    0
genres                    23
production_companies    1030
release_date               0
vote_count                 0
vote_average               0
release_year               0
budget_adj                 0
revenue_adj                0
overview                   4
dtype: int64

In [17]:
perc_null_values = (df.isnull().sum())/df.shape[0]*100 #percentage of null values
perc_null_values

popularity               0.000000
budget                   0.000000
revenue                  0.000000
original_title           0.000000
cast                     0.699494
director                 0.404970
keywords                13.741371
runtime                  0.000000
genres                   0.211689
production_companies     9.479982
release_date             0.000000
vote_count               0.000000
vote_average             0.000000
release_year             0.000000
budget_adj               0.000000
revenue_adj              0.000000
overview                 0.036815
dtype: float64

### Missing Values In Order of Significance 

1. Keywords - 13.74% missing;
2. Production Companies - 9.48% missing;
3. Cast - 0.7% missing;
4. Director - 0.40% missing;
5. Genres - 0.21% missing;

The keywords and production companies columns are the only ones with a significant amount of results missing. 

For keywords, the strategy will be to normalize the results as much as possible and compare the title with normalized keyword. If the title has any of these keywords, I will be adding the keyword to the results. 

This will be a three step process - first I will be grouping the keywords by their root word using the Porter Stemmer Algorithm, and replacing similarly grouped words with a single word for consistency. 

For example, within the dataset, addiction (7), addict (1) and addicted (1) are listed. For the purposes of the analysis, these words can be considered the same. 

Then, I will be grouping words by common synonyms. If the title of a movie consists of any of the grouped synonyms, I will be adding the most commonly used synonym to the results. As a last step, I will be replacing the less used synonyms with the most commonly used synonym.

For missing production companies, while I would love to address this, I could not come up with a strategy without relying on additional data sources. 

### Keywords

Steps to cleanup keywords:
    
    1. Group words by root;
    2. Group words by common synonyms;
    3. Compare words in title and fill with cleaned up keyword; 

#### Grouping Words by Word Root

**Helper Functions:**

In [18]:
#convert keywords into a list per movie, and remove the pipe line separator

def to_list_no_pipe(df, col):
    """
    Function takes all values for a row found within a column, and places them into their own list. 
    Takes pandas dataframe (df) and a column name (col) as input.
    """
    df[col] = df[col].str.split('|').values
    
    return df[col]

In [19]:
#Group all occurrences of keywords, and count how many times they appear

def word_occurrence(df, df_col):
    
    """
    For columns that contain lists: This function will take all data from each of a single column, and create a list of lists
    with each list representing a row. From this, the occurrences for each word will be calculated. Takes as input a dataframe
    (df) and a dataframe column (df_col). 
    """
    keylist = df[df_col].tolist() #adds values from column to a list of lists - each row is represented as a list
    keylist
    
    
    flattened_keylist = [] #flattens list so all values are in a single list
    
    for sublist in keylist:
        if type(sublist) == float:
            continue
        else:
            for item in sublist:
                flattened_keylist.append(item)
                
    keylist_counts = {} #adds list values to a dict, and counts the amount of occurences per value
    
    for i in range(len(flattened_keylist)):
        if flattened_keylist[i] not in keylist_counts:
            keylist_counts[flattened_keylist[i]] = 1
        else:
            keylist_counts[flattened_keylist[i]] += 1
            
    sorted_keylist_counts = [] #transfer values to a list so that they can be sorted

    for k,v in keylist_counts.items():
        sorted_keylist_counts.append([k,v])
        
    sorted_keylist_counts.sort(key = lambda x:x[1], reverse=True)
            
    return sorted_keylist_counts

In [20]:
#function to evaluate and group keywords by their word roots

def stem_words_and_group(keyword_counts):
    """
    Purpose of function: Group keywords based on their root (Porter Stemmer Algorithm) and from there,
    create a dict for mapping purposes. Takes the keyword_counts variable - which is returned from the word_occurrences function.
    """
    root_words = {}
    ps = PorterStemmer()

    for row in keyword_counts: #transform keyword to lowercase, and stems keyword - omits nulls
        word_stem = row[0].lower()
        word_stem = ps.stem(word_stem)

        if word_stem not in root_words: #add results to root_words dict
            root_words[word_stem] = [row[0]]
        else: 
            root_words[word_stem].append(row[0])
            
    
    root_word_values = root_words.values() #Take values and put them into a list so that grouped words can be counted and sorted
    root_word_group = list(root_word_values)
    keywords = [] #To keep of tally of words to compare titles with
    
    
    for word_group in root_word_values:
        counts = len(word_group)
        word_group.append(counts)
        keywords.append(word_group[:-1])
        
    #keywords is a list of list - flatten to one list
    flat_keywords = []
    
    for kw in keywords:
        for w in kw:
            flat_keywords.append(w)
    
    
    #sort by amount of words that share the same stem - from highest to lowest
    root_word_group.sort(key = lambda x:x[-1], reverse=True)
    
    root_words_list = []
    
    #list for words that have at least two keywords that share the same stem - if the stem of a word is unique, the word is 
    #omitted from the results
    
    for grouped_words in root_word_group:
        if grouped_words[-1] > 1:
            root_words_list.append(grouped_words)

    word_mapping_dict = {}
    
    #Determines the keyword that will be considered the main word per word stemp grouping for mapping purposes. Key will
    #be the word that is being replaced, and value is the replacement word.            
    
    for grouped_words_by_root in root_words_list:
        main_word = grouped_words_by_root[0]
        
        for w in grouped_words_by_root:
            if w == main_word:
                continue
                
            elif type(w) == str:
                word_mapping_dict[w] = main_word
    
    
    return flat_keywords, root_words_list, word_mapping_dict

In [21]:
#keyword replacement function so that we can have a better representation of keyword counts - 
#function to replace keywords from the word_mapping dictionary with their corresponding main word

def replace_word(df, col, word_map):
    """
    Takes pandas dataframe (df), and column name(col) along with word_map dict from root_word_func.
    The purpose is to replace a word that shares the same stem with a common word.
    """
    for row in df[col]:
        if type(row) == float:
            continue
        else:
            for i in range(len(row)):
                if row[i] in word_map:
                    row[i] = word_map[row[i]]
    
    return df[col]

##### Group Words by Root

**Prepare Words:**

In [22]:
df['keywords'] = to_list_no_pipe(df, 'keywords')

In [23]:
#pipe has been split, and keywords are now in a list for ease of references
df['keywords'].head()

0    [monster, dna, tyrannosaurus rex, velociraptor...
1    [future, chase, post-apocalyptic, dystopia, au...
2    [based on novel, revolution, dystopia, sequel,...
3          [android, spaceship, jedi, space opera, 3d]
4            [car race, speed, revenge, suspense, car]
Name: keywords, dtype: object

In [24]:
#Group and total keywords
keyword_counts = word_occurrence(df, 'keywords')
keyword_counts

[['woman director', 413],
 ['independent film', 396],
 ['based on novel', 278],
 ['sex', 272],
 ['sport', 216],
 ['murder', 204],
 ['biography', 169],
 ['musical', 169],
 ['new york', 163],
 ['suspense', 159],
 ['nudity', 154],
 ['duringcreditsstinger', 152],
 ['female nudity', 149],
 ['prison', 140],
 ['revenge', 137],
 ['dystopia', 136],
 ['high school', 135],
 ['sequel', 130],
 ['london', 123],
 ['suicide', 115],
 ['friendship', 111],
 ['police', 103],
 ['holiday', 100],
 ['rape', 96],
 ['love', 96],
 ['detective', 93],
 ['world war ii', 91],
 ['father-son relationship', 90],
 ['gay', 88],
 ['teenager', 87],
 ['brother brother relationship', 85],
 ['robbery', 85],
 ['monster', 83],
 ['prostitute', 82],
 ['paris', 81],
 ['corruption', 81],
 ['serial killer', 80],
 ['secret', 79],
 ['vampire', 77],
 ['alien', 76],
 ['fight', 75],
 ['martial arts', 74],
 ['superhero', 73],
 ['england', 73],
 ['journalist', 73],
 ['aftercreditsstinger', 71],
 ['alcohol', 71],
 ['cia', 70],
 ['witch', 68

In [25]:
len(keyword_counts)

7878

In [26]:
#keyword counts before word transformation for comparison purposes - addiction, addicted and addict will be grouped together
#once the transformation occurs

original_keyword_counts = keyword_counts

original_keyword_dict = {}

for item in original_keyword_counts:
    original_keyword_dict[item[0]] = item[1]

In [27]:
original_keyword_dict['addiction']

7

In [28]:
original_keyword_dict['addict']

1

In [29]:
original_keyword_dict['addicted']

1

**Replace Keywords:**

In [30]:
#stem and group words based on common root words
keyword_list, stem_words_list, root_map = stem_words_and_group(keyword_counts)

In [31]:
#replace keywords in keyword column of dataframe with grouped keywords
df['keywords'] = replace_word(df, 'keywords', root_map)

**Tests:**

In [32]:
new_keylist_count = word_occurrence(df, 'keywords')
new_keylist_count

[['woman director', 413],
 ['independent film', 396],
 ['based on novel', 278],
 ['sex', 272],
 ['musical', 221],
 ['sport', 216],
 ['murder', 206],
 ['biography', 169],
 ['prison', 166],
 ['new york', 163],
 ['suspense', 161],
 ['nudity', 154],
 ['duringcreditsstinger', 152],
 ['female nudity', 149],
 ['revenge', 137],
 ['dystopia', 136],
 ['high school', 135],
 ['sequel', 130],
 ['london', 123],
 ['suicide', 117],
 ['friendship', 111],
 ['police', 103],
 ['assassin', 100],
 ['holiday', 100],
 ['rape', 96],
 ['love', 96],
 ['detective', 93],
 ['alcohol', 92],
 ['prostitute', 91],
 ['world war ii', 91],
 ['father-son relationship', 90],
 ['gay', 88],
 ['teenager', 87],
 ['brother brother relationship', 85],
 ['robbery', 85],
 ['monster', 83],
 ['paris', 81],
 ['corruption', 81],
 ['serial killer', 80],
 ['fight', 80],
 ['dancing', 79],
 ['secret', 79],
 ['vampire', 78],
 ['alien', 76],
 ['martial arts', 74],
 ['superhero', 73],
 ['england', 73],
 ['journalist', 73],
 ['aftercreditsstin

In [33]:
len(new_keylist_count)

7626

In [34]:
new_keyword_dict = {}

for item in new_keylist_count:
    new_keyword_dict[item[0]] = item[1]

new_keyword_dict['addiction']

9

#### Grouping Words Based on Synonyms

**Helper Functions:**

In [35]:
#Find synonyms based on the WordNet Database using the NLTK-WordNet interface, and create mapping back to keyword

def find_synonyms(flat_keywords):

    s_map = {}
    
    for w in flat_keywords:
        for syn in wn.synsets(w):
            for l in syn.lemma_names():
                s_map[l] = w
    
    synonym_map = {}
    
    #add items from s_map to synonym_map where dict key does not match dict value
    for k, v in s_map.items():
        if k == v:
            continue
        else:
            synonym_map[k] = v
        
    return synonym_map    



In [36]:
#merge the synonym mapping and root word mapping dicts so that synonyms are also pointing to the main word

def merge_syn_root_dicts(syn_dict, root_dict):
    
    mapping_dict = root_dict
    
    #if synonym (v from syn_dict) matches a key from root_dict, synonym will be mapped to value found in root_dict
    #if synonym matches a value from root_dict, synonym will be mapped to same value
    
    for k,v in syn_dict.items():
        if v in mapping_dict.keys():
            mapping_dict[k] = mapping_dict[v]
        elif v in mapping_dict.values():
            mapping_dict[k] = v
            
    return mapping_dict

In [37]:
#function to remove stop words from overview or title columns

def remove_stopwords_new_col(df, col):
    
    words_to_process = df[col].tolist()
    clean_list = []
    
    for row in words_to_process:
        
        if type(row) == str:
            wordlist = row.split(" ")
            temp_list = []

            for word in wordlist:
                if word not in stopwords.words("english"):
                    temp_list.append(word)

            clean_list.append(temp_list[:])
        
        else:
            clean_list.append(np.nan)
            
    clean_overview = pd.Series(clean_list[:]) 
    df = df.assign(updated_col=clean_overview.values)
    
    return df

In [38]:
## Function to compare words with combined mapping dictionary

def word_comp(df, col, mapping):
    
    words_to_compare = df[col].tolist()
    keywords = []
    
    for row in words_to_compare:
        temp_list = []
        
        if type(row) == float:
            temp_list = np.nan
            keywords.append(temp_list)
            continue
        
        for word in row:
            if word in mapping.keys():
                temp_list.append(mapping[word])
            elif word in mapping.values():
                temp_list.append(word)

        if len(temp_list) == 0:
            temp_list = np.nan

        keywords.append(temp_list)


    new_keywords = pd.Series(keywords[:]) 
    df = df.assign(new_keywords=new_keywords.values)

    return df

##### Group Words by Synonyms

**Prepare Words:**

In [39]:
#synonym mapping dictionary
syn_map = find_synonyms(keyword_list)
syn_map

{'sexual_activity': 'sex',
 'sexual_practice': 'sex',
 'sex': 'sexuality',
 'sex_activity': 'sex',
 'sexual_urge': 'sex',
 'gender': 'sexuality',
 'arouse': 'wake',
 'excite': 'sex',
 'turn_on': 'trip',
 'wind_up': 'wound',
 'sport': 'play',
 'athletics': 'sport',
 'summercater': 'sport',
 'sportsman': 'sport',
 'sportswoman': 'sport',
 'mutant': 'mutation',
 'variation': 'mutation',
 'fun': 'play',
 'play': 'wager',
 'feature': 'sport',
 'boast': 'blow',
 'frolic': 'play',
 'lark': 'escapade',
 'rollick': 'sport',
 'skylark': 'sport',
 'disport': 'sport',
 'cavort': 'sport',
 'gambol': 'play',
 'frisk': 'sport',
 'romp': 'tomboy',
 'run_around': 'sport',
 'lark_about': 'sport',
 'murder': 'hit',
 'slaying': 'execution',
 'execution': 'performance',
 'slay': 'hit',
 'hit': 'gain',
 'dispatch': 'hit',
 'bump_off': 'hit',
 'off': 'hit',
 'polish_off': 'hit',
 'remove': 'hit',
 'mangle': 'murder',
 'mutilate': 'mars',
 'liquidator': 'murderer',
 'manslayer': 'murderer',
 'life': 'lifetime

In [40]:
#combine root word and synonym mapping dictionaries
mapping = merge_syn_root_dicts(syn_map, root_map)
mapping

{'alcoholism': 'alcohol',
 'alcoholic': 'alcohol',
 'drugged': 'drug',
 'drugs': 'drug',
 'drug dealing': 'drug dealer',
 'drug deal': 'drug dealer',
 'terrorism': 'terror',
 'terrorizing': 'terror',
 'possessed': 'possession',
 'possessiveness': 'possession',
 'mountaineering': 'mountains',
 'mountaineer': 'mountains',
 'skateboarder': 'skateboarding',
 'skateboard': 'skateboarding',
 'explosion': 'explosive',
 'explosives': 'explosive',
 'addicted': 'addiction',
 'addict': 'addiction',
 'satan': 'satanism',
 'satanic': 'satanism',
 'engineering': 'engineer',
 'engine': 'engineer',
 'community': 'commune',
 'communism': 'commune',
 'corporate': 'corporation',
 'corporations': 'corporation',
 'races': 'racing',
 'race': 'racing',
 'recruitment': 'recruit',
 'recruiting': 'recruit',
 'murderer': 'murder',
 'music': 'musical',
 'suspension': 'suspense',
 'prisoner': 'prison',
 'suicidal': 'suicide',
 'prostitution': 'prostitute',
 'vampires': 'vampire',
 'fighting': 'fight',
 'zombie': '

**Replace Words**

In [41]:
#consolidate keywords
df['keywords'] = replace_word(df, 'keywords', mapping)
df['keywords']

0        [monster, dna, tyrannosaurus rex, velociraptor...
1        [future, chase, post-apocalyptic, dystopia, au...
2        [based on novel, revolution, dystopia, sequel,...
3              [android, spaceship, jedi, space opera, 3d]
4               [car race, racing, revenge, suspense, car]
5        [father-son relationship, rape, based on novel...
6        [saving the world, artificial intelligence, cy...
7        [based on novel, mars, nasa, isolation, botanist]
8        [assistant, aftercreditsstinger, duringcredits...
9        [dream, cartoon, imaginary friend, animation, ...
10       [spy, based on novel, secret agent, sequel, ja...
11       [jupiter, space, woman director, 3d, interspec...
12       [dancing, artificial intelligence, helicopter,...
13            [video game, nerd, alien attack, 3d, pixels]
14        [marvel comic, comic, sequel, superhero, vision]
15       [bounty hunter, wyoming, mountains, hangman, v...
16         [revenge, murder, on the run, fugitive, frame

#### Compare Overview to Keywords to Populate Null Keyword Values

In [42]:
df_missing_keywords = df[df['keywords'].isnull() == True]
df_missing_keywords

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
130,1.28,0,4719695,True Story,Jonah Hill|James Franco|Felicity Jones|Maria D...,Rupert Goold,,100,Crime|Drama|Mystery,Plan B Entertainment|Regency Enterprises|New R...,2015-04-17,354,6.0,2015,0,4342117,A drama centered around the relationship betwe...
149,1.04,20000000,0,Momentum,Olga Kurylenko|Morgan Freeman|James Purefoy|Je...,Stephen S. Campanelli,,96,Thriller|Action,Thaba Media|Azari Media,2015-08-01,100,5.8,2015,18399992,0,"When Alex, an infiltration expert with a secre..."
154,1.01,12000000,0,Il racconto dei racconti,Salma Hayek|Vincent Cassel|John C. Reilly|Toby...,Matteo Garrone,,125,Romance|Fantasy|Horror,HanWay Films|Rai Cinema|Le Pacte|Fonds Eurimag...,2015-05-14,211,5.7,2015,11039995,0,"A fantasy film with horror elements, ""The Tale..."
155,1.01,11000000,27391084,Irrational Man,Emma Stone|Joaquin Phoenix|Jamie Blackley|Park...,Woody Allen,,95,Mystery|Drama,Sony Pictures|Gravier Productions,2015-07-17,319,6.1,2015,10119996,25199786,"On a small town college campus, a philosophy p..."
160,0.95,12000000,60273173,The Perfect Guy,Michael Ealy|Sanaa Lathan|Morris Chestnut|Kath...,David M. Rosenthal,,100,Drama|Thriller,Screen Gems,2015-09-11,122,5.6,2015,11039995,55451295,Leah Vaughn appears to have the ideal life. Sh...
164,0.92,0,0,Kidnapping Mr. Heineken,Anthony Hopkins|Jim Sturgess|Sam Worthington|R...,Daniel Alfredson,,95,Drama|Action|Crime|Thriller,Umedia|Informant Europe SPRL|European Film Com...,2015-03-12,131,5.8,2015,0,0,The true story of the kidnapping of Freddy Hei...
178,0.84,0,3002884,The End of the Tour,Jason Segel|Jesse Eisenberg|Anna Chlumsky|Mami...,James Ponsoldt,,106,Drama,A24,2015-07-31,126,7.3,2015,0,2762652,The story of the five-day interview between Ro...
180,0.83,20000000,6420319,Little Boy,Jakob Salvati|Emily Watson|Cary-Hiroyuki Tagaw...,Alejandro Monteverde,,106,Comedy|Drama|War,Metanoia Films,2015-04-23,113,7.1,2015,18399992,5906691,An eight-year-old boy is willing to do whateve...
182,0.82,0,1986615,Remember,Christopher Plummer|Martin Landau|Dean Norris|...,Atom Egoyan,,94,Drama|Thriller,Serendipity Point Films|Egoli Tossell Film AG,2015-10-23,75,7.7,2015,0,1827685,With the aid of a fellow Auschwitz survivor an...
189,0.78,0,0,The Benefactor,Dakota Fanning|Theo James|Richard Gere|Clarke ...,Andrew Renzi,,90,Drama,KSM,2015-04-17,55,4.5,2015,0,0,A newly married couple are forced to navigate ...


In [43]:
df_missing_keywords['overview'] = df_missing_keywords['overview'].str.replace('[^\w\s]','')
df_missing_keywords['overview'] = df_missing_keywords['overview'].str.lower()

df_missing_keywords

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
130,1.28,0,4719695,True Story,Jonah Hill|James Franco|Felicity Jones|Maria D...,Rupert Goold,,100,Crime|Drama|Mystery,Plan B Entertainment|Regency Enterprises|New R...,2015-04-17,354,6.0,2015,0,4342117,a drama centered around the relationship betwe...
149,1.04,20000000,0,Momentum,Olga Kurylenko|Morgan Freeman|James Purefoy|Je...,Stephen S. Campanelli,,96,Thriller|Action,Thaba Media|Azari Media,2015-08-01,100,5.8,2015,18399992,0,when alex an infiltration expert with a secret...
154,1.01,12000000,0,Il racconto dei racconti,Salma Hayek|Vincent Cassel|John C. Reilly|Toby...,Matteo Garrone,,125,Romance|Fantasy|Horror,HanWay Films|Rai Cinema|Le Pacte|Fonds Eurimag...,2015-05-14,211,5.7,2015,11039995,0,a fantasy film with horror elements the tale o...
155,1.01,11000000,27391084,Irrational Man,Emma Stone|Joaquin Phoenix|Jamie Blackley|Park...,Woody Allen,,95,Mystery|Drama,Sony Pictures|Gravier Productions,2015-07-17,319,6.1,2015,10119996,25199786,on a small town college campus a philosophy pr...
160,0.95,12000000,60273173,The Perfect Guy,Michael Ealy|Sanaa Lathan|Morris Chestnut|Kath...,David M. Rosenthal,,100,Drama|Thriller,Screen Gems,2015-09-11,122,5.6,2015,11039995,55451295,leah vaughn appears to have the ideal life she...
164,0.92,0,0,Kidnapping Mr. Heineken,Anthony Hopkins|Jim Sturgess|Sam Worthington|R...,Daniel Alfredson,,95,Drama|Action|Crime|Thriller,Umedia|Informant Europe SPRL|European Film Com...,2015-03-12,131,5.8,2015,0,0,the true story of the kidnapping of freddy hei...
178,0.84,0,3002884,The End of the Tour,Jason Segel|Jesse Eisenberg|Anna Chlumsky|Mami...,James Ponsoldt,,106,Drama,A24,2015-07-31,126,7.3,2015,0,2762652,the story of the fiveday interview between rol...
180,0.83,20000000,6420319,Little Boy,Jakob Salvati|Emily Watson|Cary-Hiroyuki Tagaw...,Alejandro Monteverde,,106,Comedy|Drama|War,Metanoia Films,2015-04-23,113,7.1,2015,18399992,5906691,an eightyearold boy is willing to do whatever ...
182,0.82,0,1986615,Remember,Christopher Plummer|Martin Landau|Dean Norris|...,Atom Egoyan,,94,Drama|Thriller,Serendipity Point Films|Egoli Tossell Film AG,2015-10-23,75,7.7,2015,0,1827685,with the aid of a fellow auschwitz survivor an...
189,0.78,0,0,The Benefactor,Dakota Fanning|Theo James|Richard Gere|Clarke ...,Andrew Renzi,,90,Drama,KSM,2015-04-17,55,4.5,2015,0,0,a newly married couple are forced to navigate ...


In [44]:
df_missing_keywords = remove_stopwords_new_col(df_missing_keywords, 'overview')
df_missing_keywords

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview,updated_col
130,1.28,0,4719695,True Story,Jonah Hill|James Franco|Felicity Jones|Maria D...,Rupert Goold,,100,Crime|Drama|Mystery,Plan B Entertainment|Regency Enterprises|New R...,2015-04-17,354,6.0,2015,0,4342117,a drama centered around the relationship betwe...,"[drama, centered, around, relationship, journa..."
149,1.04,20000000,0,Momentum,Olga Kurylenko|Morgan Freeman|James Purefoy|Je...,Stephen S. Campanelli,,96,Thriller|Action,Thaba Media|Azari Media,2015-08-01,100,5.8,2015,18399992,0,when alex an infiltration expert with a secret...,"[alex, infiltration, expert, secret, past, acc..."
154,1.01,12000000,0,Il racconto dei racconti,Salma Hayek|Vincent Cassel|John C. Reilly|Toby...,Matteo Garrone,,125,Romance|Fantasy|Horror,HanWay Films|Rai Cinema|Le Pacte|Fonds Eurimag...,2015-05-14,211,5.7,2015,11039995,0,a fantasy film with horror elements the tale o...,"[fantasy, film, horror, elements, tale, tales,..."
155,1.01,11000000,27391084,Irrational Man,Emma Stone|Joaquin Phoenix|Jamie Blackley|Park...,Woody Allen,,95,Mystery|Drama,Sony Pictures|Gravier Productions,2015-07-17,319,6.1,2015,10119996,25199786,on a small town college campus a philosophy pr...,"[small, town, college, campus, philosophy, pro..."
160,0.95,12000000,60273173,The Perfect Guy,Michael Ealy|Sanaa Lathan|Morris Chestnut|Kath...,David M. Rosenthal,,100,Drama|Thriller,Screen Gems,2015-09-11,122,5.6,2015,11039995,55451295,leah vaughn appears to have the ideal life she...,"[leah, vaughn, appears, ideal, life, enjoys, c..."
164,0.92,0,0,Kidnapping Mr. Heineken,Anthony Hopkins|Jim Sturgess|Sam Worthington|R...,Daniel Alfredson,,95,Drama|Action|Crime|Thriller,Umedia|Informant Europe SPRL|European Film Com...,2015-03-12,131,5.8,2015,0,0,the true story of the kidnapping of freddy hei...,"[true, story, kidnapping, freddy, heineken, gr..."
178,0.84,0,3002884,The End of the Tour,Jason Segel|Jesse Eisenberg|Anna Chlumsky|Mami...,James Ponsoldt,,106,Drama,A24,2015-07-31,126,7.3,2015,0,2762652,the story of the fiveday interview between rol...,"[story, fiveday, interview, rolling, stone, re..."
180,0.83,20000000,6420319,Little Boy,Jakob Salvati|Emily Watson|Cary-Hiroyuki Tagaw...,Alejandro Monteverde,,106,Comedy|Drama|War,Metanoia Films,2015-04-23,113,7.1,2015,18399992,5906691,an eightyearold boy is willing to do whatever ...,"[eightyearold, boy, willing, whatever, takes, ..."
182,0.82,0,1986615,Remember,Christopher Plummer|Martin Landau|Dean Norris|...,Atom Egoyan,,94,Drama|Thriller,Serendipity Point Films|Egoli Tossell Film AG,2015-10-23,75,7.7,2015,0,1827685,with the aid of a fellow auschwitz survivor an...,"[aid, fellow, auschwitz, survivor, handwritten..."
189,0.78,0,0,The Benefactor,Dakota Fanning|Theo James|Richard Gere|Clarke ...,Andrew Renzi,,90,Drama,KSM,2015-04-17,55,4.5,2015,0,0,a newly married couple are forced to navigate ...,"[newly, married, couple, forced, navigate, all..."


In [45]:
df_missing_keywords = word_comp(df_missing_keywords, 'updated_col', mapping)
df_missing_keywords

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview,updated_col,new_keywords
130,1.28,0,4719695,True Story,Jonah Hill|James Franco|Felicity Jones|Maria D...,Rupert Goold,,100,Crime|Drama|Mystery,Plan B Entertainment|Regency Enterprises|New R...,2015-04-17,354,6.0,2015,0,4342117,a drama centered around the relationship betwe...,"[drama, centered, around, relationship, journa...","[christian, murder, intern]"
149,1.04,20000000,0,Momentum,Olga Kurylenko|Morgan Freeman|James Purefoy|Je...,Stephen S. Campanelli,,96,Thriller|Action,Thaba Media|Azari Media,2015-08-01,100,5.8,2015,18399992,0,when alex an infiltration expert with a secret...,"[alex, infiltration, expert, secret, past, acc...","[government, games, assassin, friends]"
154,1.01,12000000,0,Il racconto dei racconti,Salma Hayek|Vincent Cassel|John C. Reilly|Toby...,Matteo Garrone,,125,Romance|Fantasy|Horror,HanWay Films|Rai Cinema|Le Pacte|Fonds Eurimag...,2015-05-14,211,5.7,2015,11039995,0,a fantasy film with horror elements the tale o...,"[fantasy, film, horror, elements, tale, tales,...",[author]
155,1.01,11000000,27391084,Irrational Man,Emma Stone|Joaquin Phoenix|Jamie Blackley|Park...,Woody Allen,,95,Mystery|Drama,Sony Pictures|Gravier Productions,2015-07-17,319,6.1,2015,10119996,25199786,on a small town college campus a philosophy pr...,"[small, town, college, campus, philosophy, pro...",
160,0.95,12000000,60273173,The Perfect Guy,Michael Ealy|Sanaa Lathan|Morris Chestnut|Kath...,David M. Rosenthal,,100,Drama|Thriller,Screen Gems,2015-09-11,122,5.6,2015,11039995,55451295,leah vaughn appears to have the ideal life she...,"[leah, vaughn, appears, ideal, life, enjoys, c...","[cook, moving, liberation, friends, well, love..."
164,0.92,0,0,Kidnapping Mr. Heineken,Anthony Hopkins|Jim Sturgess|Sam Worthington|R...,Daniel Alfredson,,95,Drama|Action|Crime|Thriller,Umedia|Informant Europe SPRL|European Film Com...,2015-03-12,131,5.8,2015,0,0,the true story of the kidnapping of freddy hei...,"[true, story, kidnapping, freddy, heineken, gr...",[kidnapping]
178,0.84,0,3002884,The End of the Tour,Jason Segel|Jesse Eisenberg|Anna Chlumsky|Mami...,James Ponsoldt,,106,Drama,A24,2015-07-31,126,7.3,2015,0,2762652,the story of the fiveday interview between rol...,"[story, fiveday, interview, rolling, stone, re...",
180,0.83,20000000,6420319,Little Boy,Jakob Salvati|Emily Watson|Cary-Hiroyuki Tagaw...,Alejandro Monteverde,,106,Comedy|Drama|War,Metanoia Films,2015-04-23,113,7.1,2015,18399992,5906691,an eightyearold boy is willing to do whatever ...,"[eightyearold, boy, willing, whatever, takes, ...",
182,0.82,0,1986615,Remember,Christopher Plummer|Martin Landau|Dean Norris|...,Atom Egoyan,,94,Drama|Thriller,Serendipity Point Films|Egoli Tossell Film AG,2015-10-23,75,7.7,2015,0,1827685,with the aid of a fellow auschwitz survivor an...,"[aid, fellow, auschwitz, survivor, handwritten...",[hunting]
189,0.78,0,0,The Benefactor,Dakota Fanning|Theo James|Richard Gere|Clarke ...,Andrew Renzi,,90,Drama,KSM,2015-04-17,55,4.5,2015,0,0,a newly married couple are forced to navigate ...,"[newly, married, couple, forced, navigate, all...",[occult]


In [46]:
df_missing_keywords['keywords'] = df_missing_keywords['new_keywords']
df_missing_keywords = df_missing_keywords.drop(columns=['overview','updated_col', 'new_keywords'])
df_missing_keywords

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
130,1.28,0,4719695,True Story,Jonah Hill|James Franco|Felicity Jones|Maria D...,Rupert Goold,"[christian, murder, intern]",100,Crime|Drama|Mystery,Plan B Entertainment|Regency Enterprises|New R...,2015-04-17,354,6.0,2015,0,4342117
149,1.04,20000000,0,Momentum,Olga Kurylenko|Morgan Freeman|James Purefoy|Je...,Stephen S. Campanelli,"[government, games, assassin, friends]",96,Thriller|Action,Thaba Media|Azari Media,2015-08-01,100,5.8,2015,18399992,0
154,1.01,12000000,0,Il racconto dei racconti,Salma Hayek|Vincent Cassel|John C. Reilly|Toby...,Matteo Garrone,[author],125,Romance|Fantasy|Horror,HanWay Films|Rai Cinema|Le Pacte|Fonds Eurimag...,2015-05-14,211,5.7,2015,11039995,0
155,1.01,11000000,27391084,Irrational Man,Emma Stone|Joaquin Phoenix|Jamie Blackley|Park...,Woody Allen,,95,Mystery|Drama,Sony Pictures|Gravier Productions,2015-07-17,319,6.1,2015,10119996,25199786
160,0.95,12000000,60273173,The Perfect Guy,Michael Ealy|Sanaa Lathan|Morris Chestnut|Kath...,David M. Rosenthal,"[cook, moving, liberation, friends, well, love...",100,Drama|Thriller,Screen Gems,2015-09-11,122,5.6,2015,11039995,55451295
164,0.92,0,0,Kidnapping Mr. Heineken,Anthony Hopkins|Jim Sturgess|Sam Worthington|R...,Daniel Alfredson,[kidnapping],95,Drama|Action|Crime|Thriller,Umedia|Informant Europe SPRL|European Film Com...,2015-03-12,131,5.8,2015,0,0
178,0.84,0,3002884,The End of the Tour,Jason Segel|Jesse Eisenberg|Anna Chlumsky|Mami...,James Ponsoldt,,106,Drama,A24,2015-07-31,126,7.3,2015,0,2762652
180,0.83,20000000,6420319,Little Boy,Jakob Salvati|Emily Watson|Cary-Hiroyuki Tagaw...,Alejandro Monteverde,,106,Comedy|Drama|War,Metanoia Films,2015-04-23,113,7.1,2015,18399992,5906691
182,0.82,0,1986615,Remember,Christopher Plummer|Martin Landau|Dean Norris|...,Atom Egoyan,[hunting],94,Drama|Thriller,Serendipity Point Films|Egoli Tossell Film AG,2015-10-23,75,7.7,2015,0,1827685
189,0.78,0,0,The Benefactor,Dakota Fanning|Theo James|Richard Gere|Clarke ...,Andrew Renzi,[occult],90,Drama,KSM,2015-04-17,55,4.5,2015,0,0


In [50]:
df['keywords'] = df['keywords'].fillna(df_missing_keywords['keywords'])
df

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
0,32.99,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,"[monster, dna, tyrannosaurus rex, velociraptor...",124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,5562,6.5,2015,137999939,1392445893,Twenty-two years after the events of Jurassic ...
1,28.42,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,"[future, chase, post-apocalyptic, dystopia, au...",120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,6185,7.1,2015,137999939,348161292,An apocalyptic story set in the furthest reach...
2,13.11,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,"[based on novel, revolution, dystopia, sequel,...",119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,2015-03-18,2480,6.3,2015,101199955,271619025,Beatrice Prior must confront her inner demons ...
3,11.17,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,"[android, spaceship, jedi, space opera, 3d]",136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,2015-12-15,5292,7.5,2015,183999919,1902723130,Thirty years after defeating the Galactic Empi...
4,9.34,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,"[car race, racing, revenge, suspense, car]",137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,2015-04-01,2947,7.3,2015,174799923,1385748801,Deckard Shaw seeks revenge against Dominic Tor...
5,9.11,135000000,532950503,The Revenant,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,"[father-son relationship, rape, based on novel...",156,Western|Drama|Adventure|Thriller,Regency Enterprises|Appian Way|CatchPlay|Anony...,2015-12-25,3929,7.2,2015,124199945,490314247,"In the 1820s, a frontiersman, Hugh Glass, sets..."
6,8.65,155000000,440603537,Terminator Genisys,Arnold Schwarzenegger|Jason Clarke|Emilia Clar...,Alan Taylor,"[saving the world, artificial intelligence, cy...",125,Science Fiction|Action|Thriller|Adventure,Paramount Pictures|Skydance Productions,2015-06-23,2598,5.8,2015,142599937,405355076,"The year is 2029. John Connor, leader of the r..."
7,7.67,108000000,595380321,The Martian,Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...,Ridley Scott,"[based on novel, mars, nasa, isolation, botanist]",141,Drama|Adventure|Science Fiction,Twentieth Century Fox Film Corporation|Scott F...,2015-09-30,4572,7.6,2015,99359956,547749654,"During a manned mission to Mars, Astronaut Mar..."
8,7.40,74000000,1156730962,Minions,Sandra Bullock|Jon Hamm|Michael Keaton|Allison...,Kyle Balda|Pierre Coffin,"[assistant, aftercreditsstinger, duringcredits...",91,Family|Animation|Adventure|Comedy,Universal Pictures|Illumination Entertainment,2015-06-17,2893,6.5,2015,68079970,1064192017,"Minions Stuart, Kevin and Bob are recruited by..."
9,6.33,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,"[dream, cartoon, imaginary friend, animation, ...",94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,2015-06-09,3935,8.0,2015,160999929,785411575,"Growing up can be a bumpy road, and it's no ex..."


In [53]:
df.isnull().sum()

popularity                 0
budget                     0
revenue                    0
original_title             0
cast                      76
director                  44
keywords                 301
runtime                    0
genres                    23
production_companies    1030
release_date               0
vote_count                 0
vote_average               0
release_year               0
budget_adj                 0
revenue_adj                0
overview                   4
dtype: int64

In [54]:
df[df['keywords'].isnull() == True]

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,overview
155,1.01,11000000,27391084,Irrational Man,Emma Stone|Joaquin Phoenix|Jamie Blackley|Park...,Woody Allen,,95,Mystery|Drama,Sony Pictures|Gravier Productions,2015-07-17,319,6.1,2015,10119996,25199786,"On a small town college campus, a philosophy p..."
178,0.84,0,3002884,The End of the Tour,Jason Segel|Jesse Eisenberg|Anna Chlumsky|Mami...,James Ponsoldt,,106,Drama,A24,2015-07-31,126,7.3,2015,0,2762652,The story of the five-day interview between Ro...
180,0.83,20000000,6420319,Little Boy,Jakob Salvati|Emily Watson|Cary-Hiroyuki Tagaw...,Alejandro Monteverde,,106,Comedy|Drama|War,Metanoia Films,2015-04-23,113,7.1,2015,18399992,5906691,An eight-year-old boy is willing to do whateve...
217,0.64,0,0,Air,Norman Reedus|Djimon Hounsou|Sandrine Holt|Pet...,Christian Cantamessa,,94,Thriller|Science Fiction,Automatik Entertainment|Circle of Confusion|Sk...,2015-08-14,135,4.7,2015,0,0,"In the near future, breathable air is nonexist..."
231,0.57,0,0,Cobain: Montage of Heck,Kurt Cobain|Dave Grohl|Courtney Love|Krist Nov...,Brett Morgen,,132,Music|Documentary,HBO Documentary Films|Public Road Productions,2015-03-23,197,7.5,2015,0,0,The authorized documentary on late Guitar/lead...
232,0.56,0,0,Hyena Road,Paul Gross|Rossif Sutherland|Clark Johnson|All...,Paul Gross,,120,War|Drama,Rhombus Media|International Traders|Buffalo Ga...,2015-10-09,29,5.7,2015,0,0,"Three different men, three different worlds, t..."
240,0.53,0,0,The Unspoken,Jodelle Ferland|Sunny Suljic|Neal McDonough|Ma...,Sheldon Wilson,,90,Thriller|Horror,Lighthouse Pictures|Sapphire Fire Limited,2015-10-24,10,4.1,2015,0,0,In 1997 the close-knit Anderson family vanishe...
278,0.44,0,0,The Boy,David Morse|Rainn Wilson|Jared Breeze|Mike Vog...,Craig Macneill,,110,Thriller|Drama|Horror,Chiller Films|SpectreVision,2015-08-14,22,6.1,2015,0,0,An intimate portrait of a 9 year old sociopath...
305,0.25,0,0,The Lion Guard: Return of the Roar,Max Charles|Jeff Bennett|Dusan Brown|Sarah Hyl...,Howy Parkins,,44,Family|TV Movie|Animation,Walt Disney Television Animation,2015-11-22,48,5.9,2015,0,0,"Set in the African savannah, the film follows ..."
309,0.39,1950000,0,The Marine 4: Moving Target,Mike Mizanin|Melissa Roxburgh|Josh Blacker|Mat...,William Kaufman,,90,Thriller|Action,WWE Studios,2015-04-10,35,5.7,2015,1793999,0,"WWE Superstar Mike ""The Miz"" Mizanin returns a..."


In [55]:
(df.isnull().sum())/df.shape[0]*100

popularity              0.000000
budget                  0.000000
revenue                 0.000000
original_title          0.000000
cast                    0.699494
director                0.404970
keywords                2.770364
runtime                 0.000000
genres                  0.211689
production_companies    9.479982
release_date            0.000000
vote_count              0.000000
vote_average            0.000000
release_year            0.000000
budget_adj              0.000000
revenue_adj             0.000000
overview                0.036815
dtype: float64