In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as py
import seaborn as sbn

%matplotlib inline

# Evaluate and Clean Data

Steps taken to evaluate and clean data:

1. Drop ID, homepage and tagline columns;
2. Move the "Overview" column to a different table - it would be interesting to see if there is a correspondence between the plot listed, and the keywords selected;
3. Evaluate and drop duplicates;
4. Evaluate the amount of nulls, and the amount of rows nulls represent per column;
5. Round the results in the adjusted budget and revenue columns to zero decimals, and convert to int;
6. Round the results in the popularity score and vote average columns to 2 decimals;
7. For columns that contained multiple results separated by a pipe, moved results into a list;

In [2]:
df = pd.read_csv('D:\\ProjectData\\TMDB\\tmdb-movies.csv')

In [3]:
df.shape

(10866, 21)

In [4]:
df.head()

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999900.0,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/15,2480,6.3,2015,101200000.0,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/15,5292,7.5,2015,183999900.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/15,2947,7.3,2015,174799900.0,1385749000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
id                      10866 non-null int64
imdb_id                 10856 non-null object
popularity              10866 non-null float64
budget                  10866 non-null int64
revenue                 10866 non-null int64
original_title          10866 non-null object
cast                    10790 non-null object
homepage                2936 non-null object
director                10822 non-null object
tagline                 8042 non-null object
keywords                9373 non-null object
overview                10862 non-null object
runtime                 10866 non-null int64
genres                  10843 non-null object
production_companies    9836 non-null object
release_date            10866 non-null object
vote_count              10866 non-null int64
vote_average            10866 non-null float64
release_year            10866 non-null int64
budget_adj              1

In [6]:
sum(df.duplicated())

1

In [7]:
duplicated_entry = df[df.duplicated() == True]
duplicated_entry

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
2090,42194,tt0411951,0.59643,30000000,967000,TEKKEN,Jon Foo|Kelly Overton|Cary-Hiroyuki Tagawa|Ian...,,Dwight H. Little,Survival is no game,...,"In the year of 2039, after World Wars destroy ...",92,Crime|Drama|Action|Thriller|Science Fiction,Namco|Light Song Films,3/20/10,110,5.0,2010,30000000.0,967000.0


In [8]:
df.drop_duplicates(inplace = True)

In [9]:
cols_list = list(df.columns.values)
cols_list

['id',
 'imdb_id',
 'popularity',
 'budget',
 'revenue',
 'original_title',
 'cast',
 'homepage',
 'director',
 'tagline',
 'keywords',
 'overview',
 'runtime',
 'genres',
 'production_companies',
 'release_date',
 'vote_count',
 'vote_average',
 'release_year',
 'budget_adj',
 'revenue_adj']

In [10]:
#remove ID, homepage and tagline columns

cols_keep = [
 'popularity',
 'budget',
 'revenue',
 'original_title',
 'cast',
 'director',
 'keywords',
 'overview',
 'runtime',
 'genres',
 'production_companies',
 'release_date',
 'vote_count',
 'vote_average',
 'release_year',
 'budget_adj',
 'revenue_adj']
df = df[cols_keep]

In [11]:
#move overview to a different dataframe

overview_df = df.filter(['original_title', 'overview'], axis=1)
overview_df.head()

Unnamed: 0,original_title,overview
0,Jurassic World,Twenty-two years after the events of Jurassic ...
1,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...
2,Insurgent,Beatrice Prior must confront her inner demons ...
3,Star Wars: The Force Awakens,Thirty years after defeating the Galactic Empi...
4,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...


In [12]:
#drop overview from main dataframe
df.drop('overview', axis=1, inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10865 entries, 0 to 10865
Data columns (total 16 columns):
popularity              10865 non-null float64
budget                  10865 non-null int64
revenue                 10865 non-null int64
original_title          10865 non-null object
cast                    10789 non-null object
director                10821 non-null object
keywords                9372 non-null object
runtime                 10865 non-null int64
genres                  10842 non-null object
production_companies    9835 non-null object
release_date            10865 non-null object
vote_count              10865 non-null int64
vote_average            10865 non-null float64
release_year            10865 non-null int64
budget_adj              10865 non-null float64
revenue_adj             10865 non-null float64
dtypes: float64(4), int64(5), object(7)
memory usage: 1.4+ MB


In [14]:
#Check to see amount and percentage of null values

sum_null_values = df.isnull().sum() #sum of null values
sum_null_values

popularity                 0
budget                     0
revenue                    0
original_title             0
cast                      76
director                  44
keywords                1493
runtime                    0
genres                    23
production_companies    1030
release_date               0
vote_count                 0
vote_average               0
release_year               0
budget_adj                 0
revenue_adj                0
dtype: int64

In [15]:
perc_null_values = (df.isnull().sum())/df.shape[0]*100 #percentage of null values
perc_null_values

popularity               0.000000
budget                   0.000000
revenue                  0.000000
original_title           0.000000
cast                     0.699494
director                 0.404970
keywords                13.741371
runtime                  0.000000
genres                   0.211689
production_companies     9.479982
release_date             0.000000
vote_count               0.000000
vote_average             0.000000
release_year             0.000000
budget_adj               0.000000
revenue_adj              0.000000
dtype: float64

In [16]:
df.describe()

Unnamed: 0,popularity,budget,revenue,runtime,vote_count,vote_average,release_year,budget_adj,revenue_adj
count,10865.0,10865.0,10865.0,10865.0,10865.0,10865.0,10865.0,10865.0,10865.0
mean,0.646446,14624290.0,39826900.0,102.07179,217.399632,5.975012,2001.321859,17549890.0,51369000.0
std,1.000231,30914280.0,117008300.0,31.382701,575.644627,0.935138,12.81326,34307530.0,144638300.0
min,6.5e-05,0.0,0.0,0.0,10.0,1.5,1960.0,0.0,0.0
25%,0.207575,0.0,0.0,90.0,17.0,5.4,1995.0,0.0,0.0
50%,0.383831,0.0,0.0,99.0,38.0,6.0,2006.0,0.0,0.0
75%,0.713857,15000000.0,24000000.0,111.0,146.0,6.6,2011.0,20853250.0,33701730.0
max,32.985763,425000000.0,2781506000.0,900.0,9767.0,9.2,2015.0,425000000.0,2827124000.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10865 entries, 0 to 10865
Data columns (total 16 columns):
popularity              10865 non-null float64
budget                  10865 non-null int64
revenue                 10865 non-null int64
original_title          10865 non-null object
cast                    10789 non-null object
director                10821 non-null object
keywords                9372 non-null object
runtime                 10865 non-null int64
genres                  10842 non-null object
production_companies    9835 non-null object
release_date            10865 non-null object
vote_count              10865 non-null int64
vote_average            10865 non-null float64
release_year            10865 non-null int64
budget_adj              10865 non-null float64
revenue_adj             10865 non-null float64
dtypes: float64(4), int64(5), object(7)
memory usage: 1.4+ MB


In [18]:
df = df.round({'popularity' : 2, 'vote_average' : 2, 'budget_adj' : 0, 'revenue_adj' : 0})

In [19]:
int_columns = ['budget_adj', 'revenue_adj']

df[int_columns] = df[int_columns].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10865 entries, 0 to 10865
Data columns (total 16 columns):
popularity              10865 non-null float64
budget                  10865 non-null int64
revenue                 10865 non-null int64
original_title          10865 non-null object
cast                    10789 non-null object
director                10821 non-null object
keywords                9372 non-null object
runtime                 10865 non-null int64
genres                  10842 non-null object
production_companies    9835 non-null object
release_date            10865 non-null object
vote_count              10865 non-null int64
vote_average            10865 non-null float64
release_year            10865 non-null int64
budget_adj              10865 non-null int32
revenue_adj             10865 non-null int32
dtypes: float64(2), int32(2), int64(5), object(7)
memory usage: 1.3+ MB


In [20]:
df.head()

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,32.99,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,monster|dna|tyrannosaurus rex|velociraptor|island,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999939,1392445893
1,28.42,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,future|chase|post-apocalyptic|dystopia|australia,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999939,348161292
2,13.11,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,based on novel|revolution|dystopia|sequel|dyst...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/15,2480,6.3,2015,101199955,271619025
3,11.17,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,android|spaceship|jedi|space opera|3d,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/15,5292,7.5,2015,183999919,1902723130
4,9.34,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,car race|speed|revenge|suspense|car,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/15,2947,7.3,2015,174799923,1385748801


In [21]:
df['release_date'] = pd.to_datetime(df['release_date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10865 entries, 0 to 10865
Data columns (total 16 columns):
popularity              10865 non-null float64
budget                  10865 non-null int64
revenue                 10865 non-null int64
original_title          10865 non-null object
cast                    10789 non-null object
director                10821 non-null object
keywords                9372 non-null object
runtime                 10865 non-null int64
genres                  10842 non-null object
production_companies    9835 non-null object
release_date            10865 non-null datetime64[ns]
vote_count              10865 non-null int64
vote_average            10865 non-null float64
release_year            10865 non-null int64
budget_adj              10865 non-null int32
revenue_adj             10865 non-null int32
dtypes: datetime64[ns](1), float64(2), int32(2), int64(5), object(6)
memory usage: 1.3+ MB


In [22]:
df.head()

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,32.99,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,monster|dna|tyrannosaurus rex|velociraptor|island,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,5562,6.5,2015,137999939,1392445893
1,28.42,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,future|chase|post-apocalyptic|dystopia|australia,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,6185,7.1,2015,137999939,348161292
2,13.11,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,based on novel|revolution|dystopia|sequel|dyst...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,2015-03-18,2480,6.3,2015,101199955,271619025
3,11.17,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,android|spaceship|jedi|space opera|3d,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,2015-12-15,5292,7.5,2015,183999919,1902723130
4,9.34,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,car race|speed|revenge|suspense|car,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,2015-04-01,2947,7.3,2015,174799923,1385748801


In [23]:
def pipe_values_to_list(df, col):
    '''
    Function takes all values for a row found within a single column, and adds them to their own list.
    Takes pandas dataframe (df) and the dataframe column (col) as input.
    '''
    df[col] = df[col].str.split('|').values
    return df[col]

In [24]:
pipe_values_to_list(df,'keywords')
pipe_values_to_list(df,'genres')
pipe_values_to_list(df,'production_companies')
pipe_values_to_list(df,'cast')

0        [Chris Pratt, Bryce Dallas Howard, Irrfan Khan...
1        [Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...
2        [Shailene Woodley, Theo James, Kate Winslet, A...
3        [Harrison Ford, Mark Hamill, Carrie Fisher, Ad...
4        [Vin Diesel, Paul Walker, Jason Statham, Miche...
5        [Leonardo DiCaprio, Tom Hardy, Will Poulter, D...
6        [Arnold Schwarzenegger, Jason Clarke, Emilia C...
7        [Matt Damon, Jessica Chastain, Kristen Wiig, J...
8        [Sandra Bullock, Jon Hamm, Michael Keaton, All...
9        [Amy Poehler, Phyllis Smith, Richard Kind, Bil...
10       [Daniel Craig, Christoph Waltz, LÃ©a Seydoux, ...
11       [Mila Kunis, Channing Tatum, Sean Bean, Eddie ...
12       [Domhnall Gleeson, Alicia Vikander, Oscar Isaa...
13       [Adam Sandler, Michelle Monaghan, Peter Dinkla...
14       [Robert Downey Jr., Chris Hemsworth, Mark Ruff...
15       [Samuel L. Jackson, Kurt Russell, Jennifer Jas...
16       [Liam Neeson, Forest Whitaker, Maggie Grace, F.

In [25]:
df.head()

Unnamed: 0,popularity,budget,revenue,original_title,cast,director,keywords,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,32.99,150000000,1513528810,Jurassic World,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",Colin Trevorrow,"[monster, dna, tyrannosaurus rex, velociraptor...",124,"[Action, Adventure, Science Fiction, Thriller]","[Universal Studios, Amblin Entertainment, Lege...",2015-06-09,5562,6.5,2015,137999939,1392445893
1,28.42,150000000,378436354,Mad Max: Fury Road,"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...",George Miller,"[future, chase, post-apocalyptic, dystopia, au...",120,"[Action, Adventure, Science Fiction, Thriller]","[Village Roadshow Pictures, Kennedy Miller Pro...",2015-05-13,6185,7.1,2015,137999939,348161292
2,13.11,110000000,295238201,Insurgent,"[Shailene Woodley, Theo James, Kate Winslet, A...",Robert Schwentke,"[based on novel, revolution, dystopia, sequel,...",119,"[Adventure, Science Fiction, Thriller]","[Summit Entertainment, Mandeville Films, Red W...",2015-03-18,2480,6.3,2015,101199955,271619025
3,11.17,200000000,2068178225,Star Wars: The Force Awakens,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",J.J. Abrams,"[android, spaceship, jedi, space opera, 3d]",136,"[Action, Adventure, Science Fiction, Fantasy]","[Lucasfilm, Truenorth Productions, Bad Robot]",2015-12-15,5292,7.5,2015,183999919,1902723130
4,9.34,190000000,1506249360,Furious 7,"[Vin Diesel, Paul Walker, Jason Statham, Miche...",James Wan,"[car race, speed, revenge, suspense, car]",137,"[Action, Crime, Thriller]","[Universal Pictures, Original Film, Media Righ...",2015-04-01,2947,7.3,2015,174799923,1385748801


## Missing Values

1. cast;
2. director;
3. keywords;
4. genres;
5. production_companies


### Keywords

In [38]:
keywords_list = df['keywords'].tolist()
keywords_list

[['monster', 'dna', 'tyrannosaurus rex', 'velociraptor', 'island'],
 ['future', 'chase', 'post-apocalyptic', 'dystopia', 'australia'],
 ['based on novel', 'revolution', 'dystopia', 'sequel', 'dystopic future'],
 ['android', 'spaceship', 'jedi', 'space opera', '3d'],
 ['car race', 'speed', 'revenge', 'suspense', 'car'],
 ['father-son relationship', 'rape', 'based on novel', 'mountains', 'winter'],
 ['saving the world',
  'artificial intelligence',
  'cyborg',
  'killer robot',
  'future'],
 ['based on novel', 'mars', 'nasa', 'isolation', 'botanist'],
 ['assistant',
  'aftercreditsstinger',
  'duringcreditsstinger',
  'evil mastermind',
  'minions'],
 ['dream', 'cartoon', 'imaginary friend', 'animation', 'kid'],
 ['spy', 'based on novel', 'secret agent', 'sequel', 'james bond'],
 ['jupiter', 'space', 'woman director', '3d', 'interspecies romance'],
 ['dancing', 'artificial intelligence', 'helicopter', 'distrust', 'isolation'],
 ['video game', 'nerd', 'alien attack', '3d', 'pixels'],
 ['m

In [43]:
flattened_keyword_list = []

for sublist in keywords_list:
    if type(sublist) == float:
        continue
    else:
        for item in sublist:
            flattened_keyword_list.append(item)
    
flattened_keyword_list
        

['monster',
 'dna',
 'tyrannosaurus rex',
 'velociraptor',
 'island',
 'future',
 'chase',
 'post-apocalyptic',
 'dystopia',
 'australia',
 'based on novel',
 'revolution',
 'dystopia',
 'sequel',
 'dystopic future',
 'android',
 'spaceship',
 'jedi',
 'space opera',
 '3d',
 'car race',
 'speed',
 'revenge',
 'suspense',
 'car',
 'father-son relationship',
 'rape',
 'based on novel',
 'mountains',
 'winter',
 'saving the world',
 'artificial intelligence',
 'cyborg',
 'killer robot',
 'future',
 'based on novel',
 'mars',
 'nasa',
 'isolation',
 'botanist',
 'assistant',
 'aftercreditsstinger',
 'duringcreditsstinger',
 'evil mastermind',
 'minions',
 'dream',
 'cartoon',
 'imaginary friend',
 'animation',
 'kid',
 'spy',
 'based on novel',
 'secret agent',
 'sequel',
 'james bond',
 'jupiter',
 'space',
 'woman director',
 '3d',
 'interspecies romance',
 'dancing',
 'artificial intelligence',
 'helicopter',
 'distrust',
 'isolation',
 'video game',
 'nerd',
 'alien attack',
 '3d',
 'p

In [46]:
word_counts = {}

for i in range(len(flattened_keyword_list)):
    if flattened_keyword_list[i] not in word_counts:
        word_counts[flattened_keyword_list[i]] = 1
    else:
        word_counts[flattened_keyword_list[i]] += 1
        
word_counts

{'monster': 83,
 'dna': 7,
 'tyrannosaurus rex': 11,
 'velociraptor': 3,
 'island': 62,
 'future': 63,
 'chase': 35,
 'post-apocalyptic': 47,
 'dystopia': 136,
 'australia': 41,
 'based on novel': 278,
 'revolution': 13,
 'sequel': 130,
 'dystopic future': 10,
 'android': 25,
 'spaceship': 15,
 'jedi': 6,
 'space opera': 10,
 '3d': 31,
 'car race': 24,
 'speed': 4,
 'revenge': 137,
 'suspense': 159,
 'car': 17,
 'father-son relationship': 90,
 'rape': 96,
 'mountains': 15,
 'winter': 19,
 'saving the world': 40,
 'artificial intelligence': 32,
 'cyborg': 26,
 'killer robot': 11,
 'mars': 8,
 'nasa': 18,
 'isolation': 21,
 'botanist': 4,
 'assistant': 5,
 'aftercreditsstinger': 71,
 'duringcreditsstinger': 152,
 'evil mastermind': 1,
 'minions': 2,
 'dream': 56,
 'cartoon': 7,
 'imaginary friend': 8,
 'animation': 43,
 'kid': 4,
 'spy': 53,
 'secret agent': 26,
 'james bond': 6,
 'jupiter': 5,
 'space': 33,
 'woman director': 413,
 'interspecies romance': 3,
 'dancing': 44,
 'helicopter

In [47]:
counts = []

for k,v in word_counts.items():
    counts.append([k,v])
    

counts

[['monster', 83],
 ['dna', 7],
 ['tyrannosaurus rex', 11],
 ['velociraptor', 3],
 ['island', 62],
 ['future', 63],
 ['chase', 35],
 ['post-apocalyptic', 47],
 ['dystopia', 136],
 ['australia', 41],
 ['based on novel', 278],
 ['revolution', 13],
 ['sequel', 130],
 ['dystopic future', 10],
 ['android', 25],
 ['spaceship', 15],
 ['jedi', 6],
 ['space opera', 10],
 ['3d', 31],
 ['car race', 24],
 ['speed', 4],
 ['revenge', 137],
 ['suspense', 159],
 ['car', 17],
 ['father-son relationship', 90],
 ['rape', 96],
 ['mountains', 15],
 ['winter', 19],
 ['saving the world', 40],
 ['artificial intelligence', 32],
 ['cyborg', 26],
 ['killer robot', 11],
 ['mars', 8],
 ['nasa', 18],
 ['isolation', 21],
 ['botanist', 4],
 ['assistant', 5],
 ['aftercreditsstinger', 71],
 ['duringcreditsstinger', 152],
 ['evil mastermind', 1],
 ['minions', 2],
 ['dream', 56],
 ['cartoon', 7],
 ['imaginary friend', 8],
 ['animation', 43],
 ['kid', 4],
 ['spy', 53],
 ['secret agent', 26],
 ['james bond', 6],
 ['jupiter'

In [50]:
counts.sort(key = lambda x:x[1], reverse=True)
counts

[['woman director', 413],
 ['independent film', 396],
 ['based on novel', 278],
 ['sex', 272],
 ['sport', 216],
 ['murder', 204],
 ['biography', 169],
 ['musical', 169],
 ['new york', 163],
 ['suspense', 159],
 ['nudity', 154],
 ['duringcreditsstinger', 152],
 ['female nudity', 149],
 ['prison', 140],
 ['revenge', 137],
 ['dystopia', 136],
 ['high school', 135],
 ['sequel', 130],
 ['london', 123],
 ['suicide', 115],
 ['friendship', 111],
 ['police', 103],
 ['holiday', 100],
 ['rape', 96],
 ['love', 96],
 ['detective', 93],
 ['world war ii', 91],
 ['father-son relationship', 90],
 ['gay', 88],
 ['teenager', 87],
 ['brother brother relationship', 85],
 ['robbery', 85],
 ['monster', 83],
 ['prostitute', 82],
 ['paris', 81],
 ['corruption', 81],
 ['serial killer', 80],
 ['secret', 79],
 ['vampire', 77],
 ['alien', 76],
 ['fight', 75],
 ['martial arts', 74],
 ['superhero', 73],
 ['england', 73],
 ['journalist', 73],
 ['aftercreditsstinger', 71],
 ['alcohol', 71],
 ['cia', 70],
 ['witch', 68

In [None]:
def keywordset(df, column_name):

    keywords_set = set()

    for list_keywords in df[column_name]:
        if isinstance(list_keywords, float):
            continue
        else:
            keywords_set = keywords_set.union(list_keywords)
            
    return keywords_set

keyword_list = list(keywordset(df, 'keywords'))
keyword_list

In [None]:
def wordcount (column_name):
    
    keyword_list = df[column_name].values
    
    word_counts = {}
    
    for i in range(len(keyword_list)):
        if keyword_list[i] not in word_counts:
            word_counts[keyword_list[i]] = 1
        else:
            word_counts[keyword_list[i]] += 1
    
    counts = []
    
    for k,v in word_counts.items():
        counts.append([k,v])
        
    counts.sort(key = lambda x:x[1], reverse = True)
    
    return counts

wordcount(keyword_list)