# Data cleaning for 'movie_metadata.csv'




In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw movie metadata table from movie_metadata.csv.
data = pd.read_csv('movie_metadata.csv')

In [3]:
# 資料描述

# 變數名稱：描述
# movie_title                Title of the Movie, 電影名稱                       
# duration                   Duration in minutes, 電影長度
# director_name              Name of the Director of the Movie, 導演名
# director_facebook_likes    Number of likes of the Director on his Facebook Page, 導演粉絲專業讚數
# actor_1_name               Primary actor starring in the movie, 主要演員名
# actor_1_facebook_likes     Number of likes of the Actor_1 on his/her Facebook Page, 主要演員粉絲專業讚數
# actor_2_name               Other actor starring in the movie, 主要演員名
# actor_2_facebook_likes     Number of likes of the Actor_2 on his/her Facebook Page, 主要演員粉絲專業讚數
# actor_3_name               Other actor starring in the movie, 主要演員名
# actor_3_facebook_likes     Number of likes of the Actor_3 on his/her Facebook Page, 主要演員粉絲專業讚數
# num_user_for_reviews       Number of users who gave a review, 使用者給予回饋數
# num_critic_for_reviews     Number of critical reviews on imdb, 在IMDB上的評論數
# num_voted_users            Number of people who voted for the movie, 投票數
# cast_total_facebook_likes  Total number of facebook likes of the entire cast of the movie, 整部電影的讚數
# movie_facebook_likes       Number of Facebook likes in the movie page, 電影粉絲專業讚數
# plot_keywords              Keywords describing the movie plot, 電影關鍵詞
# facenumber_in_poster       Number of the actor who featured in the movie poster, 電影海報出現演員數
# color                      Film colorization. ‘Black and White’ or ‘Color’, 黑白電影/彩色電影
# genres                     Film categorization like ‘Animation’, ‘Comedy’, ‘Romance’, ‘Horror’, ‘Sci-Fi’, ‘Action’, ‘Family’, 電影類別
# title_year                 The year in which the movie is released (1916:2016), 電影出版年
# language                   English, Arabic, Chinese, French, German, Danish, Italian, Japanese etc, 語言
# country                    Country where the movie is produced, 出版國家
# content_rating             Content rating of the movie, 電影分級
# aspect_ratio               Aspect ratio the movie was made in, 電影長寬比
# movie_imdb_link            IMDB link of the movie, 電影imdb連結
# gross                      Gross earnings of the movie in Dollars, 電影總收益
# budget                     Budget of the movie in Dollars, 電影預算
# imdb_score                 IMDB Score of the movie on IMDB, IMDB評分

In [4]:
data.shape

(5043, 28)

In [5]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [6]:
import matplotlib
matplotlib.__version__

'3.5.1'

In [7]:
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [8]:
# we have movies only upto 2016
import matplotlib.pyplot as plt
%matplotlib inline  

data.title_year.value_counts(dropna=False).sort_index()
plt.show()


# data.title_year.value_counts().sort_index()


In [9]:
# Only these columns feed the recommendation; every other column is dropped.
feature_columns = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'genres',
    'movie_title',
]
data = data.loc[:, feature_columns]

In [10]:
data.head(10)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action|Adventure|Fantasy|Sci-Fi,Avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action|Adventure|Fantasy,Pirates of the Caribbean: At World's End
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action|Adventure|Thriller,Spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action|Thriller,The Dark Knight Rises
4,Doug Walker,Doug Walker,Rob Walker,,Documentary,Star Wars: Episode VII - The Force Awakens ...
5,Andrew Stanton,Daryl Sabara,Samantha Morton,Polly Walker,Action|Adventure|Sci-Fi,John Carter
6,Sam Raimi,J.K. Simmons,James Franco,Kirsten Dunst,Action|Adventure|Romance,Spider-Man 3
7,Nathan Greno,Brad Garrett,Donna Murphy,M.C. Gainey,Adventure|Animation|Comedy|Family|Fantasy|Musi...,Tangled
8,Joss Whedon,Chris Hemsworth,Robert Downey Jr.,Scarlett Johansson,Action|Adventure|Sci-Fi,Avengers: Age of Ultron
9,David Yates,Alan Rickman,Daniel Radcliffe,Rupert Grint,Adventure|Family|Fantasy|Mystery,Harry Potter and the Half-Blood Prince


In [11]:
# Missing actor/director names become the placeholder string 'unknown'.
for person_column in ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name'):
    data[person_column] = data[person_column].replace(np.nan, 'unknown')

In [12]:
# Genres are '|'-separated; turn the separators into spaces.
# regex=False makes '|' a literal character. As a regular expression, '|' is an
# empty alternation that matches at every position, so on pandas >= 2.0 (which
# no longer treats single-character patterns as literals) regex=True would
# insert a space between all characters of the genre string.
data['genres'] = data['genres'].str.replace('|', ' ', regex=False)

In [13]:
# Lowercase every title, then display the column to inspect the result.
data['movie_title'] = data['movie_title'].str.lower()
data['movie_title']

0                                                 avatar 
1               pirates of the caribbean: at world's end 
2                                                spectre 
3                                  the dark knight rises 
4       star wars: episode vii - the force awakens    ...
                              ...                        
5038                             signed sealed delivered 
5039                           the following             
5040                                a plague so pleasant 
5041                                    shanghai calling 
5042                                   my date with drew 
Name: movie_title, Length: 5043, dtype: object

In [14]:
# Inspect one raw title: it ends with '\xa0', which is a non-breaking space
# (not a null terminator).
data['movie_title'][1]


"pirates of the caribbean: at world's end\xa0"

In [15]:
# Remove the trailing '\xa0' (non-breaking space) from the titles.
# .str.strip() removes all surrounding whitespace, including '\xa0', so titles
# padded with several trailing spaces are fully cleaned, titles without any
# padding are not truncated, and missing titles stay NaN instead of raising.
data['movie_title'] = data['movie_title'].str.strip()

In [16]:
data['movie_title'][1]

"pirates of the caribbean: at world's end"

In [17]:
# Save the cleaned feature table to data.csv; index=False omits the row index.
data.to_csv('data.csv',index=False)

# Data cleaning for 'credits.csv' and 'movies_metadata.csv'


In [18]:
# Load the cast/crew records; each row is keyed by a movie 'id'.
credits = pd.read_csv('credits.csv')

In [19]:
credits

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [20]:
# low_memory=False makes pandas read the file in one pass rather than in
# chunks, so each column's dtype is inferred from the whole column.
meta = pd.read_csv('movies_metadata.csv', low_memory=False)


In [21]:
meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
meta['release_date']

0        1995-10-30
1        1995-12-15
2        1995-12-22
3        1995-12-22
4        1995-02-10
            ...    
45461           NaN
45462    2011-11-17
45463    2003-08-01
45464    1917-10-21
45465    2017-06-09
Name: release_date, Length: 45466, dtype: object

In [23]:
# Parse release_date into datetimes. errors='coerce' turns values that cannot
# be parsed as dates into NaT instead of raising.
meta['release_date'] = pd.to_datetime(meta['release_date'], errors='coerce')

In [24]:
meta['release_date']

0       1995-10-30
1       1995-12-15
2       1995-12-22
3       1995-12-22
4       1995-02-10
           ...    
45461          NaT
45462   2011-11-17
45463   2003-08-01
45464   1917-10-21
45465   2017-06-09
Name: release_date, Length: 45466, dtype: datetime64[ns]

In [25]:
# Extract the release year. Rows whose date is NaT get NaN, which is why the
# resulting column is float rather than int.
meta['year'] = meta['release_date'].dt.year

In [26]:
meta['year']

0        1995.0
1        1995.0
2        1995.0
3        1995.0
4        1995.0
          ...  
45461       NaN
45462    2011.0
45463    2003.0
45464    1917.0
45465    2017.0
Name: year, Length: 45466, dtype: float64

In [27]:
meta['year'].value_counts().sort_index()

1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: year, Length: 135, dtype: int64

In [28]:
# meta[meta['year'] >=1980]['year'].value_counts().sort_index()

In [29]:
# meta[meta.year==2013]

In [30]:
# Keep only movies released up to 2017; there is not enough data for 2018,
# 2019 and 2020. Those years are handled in the later preprocessing files.
# .copy() makes new_meta an independent frame, so assigning to its columns
# afterwards does not raise SettingWithCopyWarning on the filtered slice.
new_meta = meta.loc[meta.year <= 2017, ['genres', 'id', 'title', 'year']].copy()

In [31]:
new_meta

Unnamed: 0,genres,id,title,year
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story,1995.0
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji,1995.0
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men,1995.0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale,1995.0
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,1995.0
...,...,...,...,...
45460,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,Robin Hood,1991.0
45462,"[{'id': 18, 'name': 'Drama'}]",111109,Century of Birthing,2011.0
45463,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,Betrayal,2003.0
45464,[],227506,Satan Triumphant,1917.0


In [32]:
# Convert the 'id' column to integers so it can be joined against credits['id'].
new_meta = new_meta.astype({'id': int})

In [33]:
# Attach each movie's cast and crew records by joining on the shared 'id' column.
data = new_meta.merge(credits, on='id')

In [34]:
data

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story,1995.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji,1995.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men,1995.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale,1995.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,1995.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
...,...,...,...,...,...,...
45440,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,Robin Hood,1991.0,"[{'cast_id': 1, 'character': 'Sir Robert Hode'...","[{'credit_id': '52fe44439251416c9100a899', 'de..."
45441,"[{'id': 18, 'name': 'Drama'}]",111109,Century of Birthing,2011.0,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de..."
45442,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,Betrayal,2003.0,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de..."
45443,[],227506,Satan Triumphant,1917.0,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de..."


In [35]:
# Show up to 75 characters per cell so more of each nested record is visible.
pd.set_option('display.max_colwidth', 75)
data

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ...",862,Toy Story,1995.0,"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3...","[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':...",8844,Jumanji,1995.0,"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",15602,Grumpier Old Men,1995.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514...","[{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074...",31357,Waiting to Exhale,1995.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah' Jackson"", 'credit_id': ...","[{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,1995.0,"[{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251...","[{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend..."
...,...,...,...,...,...,...
45440,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 1074...",30840,Robin Hood,1991.0,"[{'cast_id': 1, 'character': 'Sir Robert Hode', 'credit_id': '52fe44439...","[{'credit_id': '52fe44439251416c9100a899', 'department': 'Directing', '..."
45441,"[{'id': 18, 'name': 'Drama'}]",111109,Century of Birthing,2011.0,"[{'cast_id': 1002, 'character': 'Sister Angela', 'credit_id': '52fe4af1...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'department': 'Directing', '..."
45442,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, ...",67758,Betrayal,2003.0,"[{'cast_id': 6, 'character': 'Emily Shaw', 'credit_id': '52fe4776c3a368...","[{'credit_id': '52fe4776c3a368484e0c8387', 'department': 'Directing', '..."
45443,[],227506,Satan Triumphant,1917.0,"[{'cast_id': 2, 'character': '', 'credit_id': '52fe4ea59251416c7515d7d5...","[{'credit_id': '533bccebc3a36844cf0011a7', 'department': 'Directing', '..."


In [36]:
# ast.literal_eval evaluates a string containing a Python literal or container
# display, turning the stringified lists of dicts in these columns back into
# real Python objects.
import ast

for column in ('genres', 'cast', 'crew'):
    data[column] = data[column].map(ast.literal_eval)

In [37]:
data

Unnamed: 0,genres,id,title,year,cast,crew
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': ...",862,Toy Story,1995.0,"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3...","[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id':...",8844,Jumanji,1995.0,"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a3...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'department': 'Production', ..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",15602,Grumpier Old Men,1995.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'credit_id': '52fe466a92514...","[{'credit_id': '52fe466a9251416c75077a89', 'department': 'Directing', '..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 1074...",31357,Waiting to Exhale,1995.0,"[{'cast_id': 1, 'character': 'Savannah 'Vannah' Jackson', 'credit_id': ...","[{'credit_id': '52fe44779251416c91011acb', 'department': 'Directing', '..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,1995.0,"[{'cast_id': 1, 'character': 'George Banks', 'credit_id': '52fe44959251...","[{'credit_id': '52fe44959251416c75039ed7', 'department': 'Sound', 'gend..."
...,...,...,...,...,...,...
45440,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 1074...",30840,Robin Hood,1991.0,"[{'cast_id': 1, 'character': 'Sir Robert Hode', 'credit_id': '52fe44439...","[{'credit_id': '52fe44439251416c9100a899', 'department': 'Directing', '..."
45441,"[{'id': 18, 'name': 'Drama'}]",111109,Century of Birthing,2011.0,"[{'cast_id': 1002, 'character': 'Sister Angela', 'credit_id': '52fe4af1...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'department': 'Directing', '..."
45442,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, ...",67758,Betrayal,2003.0,"[{'cast_id': 6, 'character': 'Emily Shaw', 'credit_id': '52fe4776c3a368...","[{'credit_id': '52fe4776c3a368484e0c8387', 'department': 'Directing', '..."
45443,[],227506,Satan Triumphant,1917.0,"[{'cast_id': 2, 'character': '', 'credit_id': '52fe4ea59251416c7515d7d5...","[{'credit_id': '533bccebc3a36844cf0011a7', 'department': 'Directing', '..."


In [38]:
def make_genresList(x):
    """Join the genre names of one movie into a single space-separated string.

    Parameters
    ----------
    x : iterable of dict
        Parsed genre records; only each record's 'name' entry is read.

    Returns
    -------
    str or float
        The genre names in their original order separated by single spaces,
        with 'Science Fiction' rewritten as 'Sci-Fi' (the label used in the
        movie_metadata.csv genres). ``np.nan`` when there are no records.
    """
    names = [
        'Sci-Fi' if genre.get('name') == 'Science Fiction' else genre.get('name')
        for genre in x
    ]
    if not names:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return ' '.join(names)

In [39]:
# Flatten each movie's genre records into one space-separated string.
data['genres_list'] = data['genres'].map(make_genresList)

In [40]:
data['genres_list']


0         Animation Comedy Family
1        Adventure Fantasy Family
2                  Romance Comedy
3            Comedy Drama Romance
4                          Comedy
                   ...           
45440        Drama Action Romance
45441                       Drama
45442       Action Drama Thriller
45443                         NaN
45444                         NaN
Name: genres_list, Length: 45445, dtype: object

In [41]:
def get_actor1(x):
    """Return the name of the first cast member of one movie.

    Parameters
    ----------
    x : iterable of dict
        Parsed cast records; only each record's 'name' entry is read.

    Returns
    -------
    str or float
        The first record's name, or ``np.nan`` when there are no cast records.
    """
    names = [member.get('name') for member in x]
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return names[0] if names else np.nan

In [42]:
# First-billed cast member of each movie.
data['actor_1_name'] = data['cast'].map(get_actor1)

In [43]:
data['actor_1_name']

0               Tom Hanks
1          Robin Williams
2          Walter Matthau
3         Whitney Houston
4            Steve Martin
               ...       
45440      Patrick Bergin
45441        Angel Aquino
45442       Erika Eleniak
45443    Iwan Mosschuchin
45444                 NaN
Name: actor_1_name, Length: 45445, dtype: object

In [44]:
def get_actor2(x):
    """Return the name of the second cast member of one movie.

    Parameters
    ----------
    x : iterable of dict
        Parsed cast records; only each record's 'name' entry is read.

    Returns
    -------
    str or float
        The second record's name, or ``np.nan`` when there are fewer than two
        cast records.
    """
    names = [member.get('name') for member in x]
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return names[1] if len(names) > 1 else np.nan

In [45]:
# Second-billed cast member of each movie.
data['actor_2_name'] = data['cast'].map(get_actor2)

In [46]:
data['actor_2_name']

0                Tim Allen
1            Jonathan Hyde
2              Jack Lemmon
3           Angela Bassett
4             Diane Keaton
               ...        
45440          Uma Thurman
45441          Perry Dizon
45442         Adam Baldwin
45443    Nathalie Lissenko
45444                  NaN
Name: actor_2_name, Length: 45445, dtype: object

In [47]:
def get_actor3(x):
    """Return the name of the third cast member of one movie.

    Parameters
    ----------
    x : iterable of dict
        Parsed cast records; only each record's 'name' entry is read.

    Returns
    -------
    str or float
        The third record's name, or ``np.nan`` when there are fewer than three
        cast records.
    """
    names = [member.get('name') for member in x]
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return names[2] if len(names) > 2 else np.nan

In [48]:
# Third-billed cast member of each movie.
data['actor_3_name'] = data['cast'].map(get_actor3)


In [49]:
def get_directors(x):
    """Join the names of every crew member whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Parsed crew records; each record's 'job' and 'name' entries are read.

    Returns
    -------
    str or float
        The director names in their original order separated by single spaces
        (several directors end up in one string), or ``np.nan`` when no crew
        record has the job 'Director'.
    """
    directors = [member.get('name') for member in x if member.get('job') == 'Director']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return ' '.join(directors) if directors else np.nan

In [50]:
# Director name(s) of each movie, taken from the crew records.
data['director_name'] = data['crew'].map(get_directors)

In [51]:
# Keep only the derived name/genre columns and the title.
movie_columns = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'genres_list',
    'title',
]
movie = data.loc[:, movie_columns]

In [52]:
movie

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres_list,title
0,John Lasseter,Tom Hanks,Tim Allen,Don Rickles,Animation Comedy Family,Toy Story
1,Joe Johnston,Robin Williams,Jonathan Hyde,Kirsten Dunst,Adventure Fantasy Family,Jumanji
2,Howard Deutch,Walter Matthau,Jack Lemmon,Ann-Margret,Romance Comedy,Grumpier Old Men
3,Forest Whitaker,Whitney Houston,Angela Bassett,Loretta Devine,Comedy Drama Romance,Waiting to Exhale
4,Charles Shyer,Steve Martin,Diane Keaton,Martin Short,Comedy,Father of the Bride Part II
...,...,...,...,...,...,...
45440,John Irvin,Patrick Bergin,Uma Thurman,David Morrissey,Drama Action Romance,Robin Hood
45441,Lav Diaz,Angel Aquino,Perry Dizon,Hazel Orencio,Drama,Century of Birthing
45442,Mark L. Lester,Erika Eleniak,Adam Baldwin,Julie du Page,Action Drama Thriller,Betrayal
45443,Yakov Protazanov,Iwan Mosschuchin,Nathalie Lissenko,Pavel Pavlov,,Satan Triumphant


In [53]:
movie.isna().sum()

director_name     835
actor_1_name     2354
actor_2_name     3683
actor_3_name     4593
genres_list      2384
title               0
dtype: int64

In [54]:
movie.shape

(45445, 6)

In [55]:
# Discard every row that has a missing value in any column
# (dropna's default behaviour is how='any').
movie = movie.dropna()

In [56]:
movie.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres_list      0
title            0
dtype: int64

In [57]:
movie

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres_list,title
0,John Lasseter,Tom Hanks,Tim Allen,Don Rickles,Animation Comedy Family,Toy Story
1,Joe Johnston,Robin Williams,Jonathan Hyde,Kirsten Dunst,Adventure Fantasy Family,Jumanji
2,Howard Deutch,Walter Matthau,Jack Lemmon,Ann-Margret,Romance Comedy,Grumpier Old Men
3,Forest Whitaker,Whitney Houston,Angela Bassett,Loretta Devine,Comedy Drama Romance,Waiting to Exhale
4,Charles Shyer,Steve Martin,Diane Keaton,Martin Short,Comedy,Father of the Bride Part II
...,...,...,...,...,...,...
45438,Ben Rock,Monty Bane,Lucy Butler,David Grammer,Horror,The Burkittsville 7
45439,Aaron Osborne,Lisa Boyle,Kena Land,Zaneta Polard,Sci-Fi,Caged Heat 3000
45440,John Irvin,Patrick Bergin,Uma Thurman,David Morrissey,Drama Action Romance,Robin Hood
45441,Lav Diaz,Angel Aquino,Perry Dizon,Hazel Orencio,Drama,Century of Birthing


In [58]:
# Align the column names with the ones used in data.csv.
movie = movie.rename(columns={'genres_list': 'genres', 'title': 'movie_title'})

In [59]:
# Lowercase the titles, matching the lowercasing applied to the titles saved in data.csv.
movie['movie_title'] = movie['movie_title'].str.lower()

In [60]:
# 'comb' is one space-separated string per movie: the three actors, then the
# director(s), then the genres. A row with any missing part yields NaN.
movie['comb'] = movie['actor_1_name'].str.cat(
    [
        movie['actor_2_name'],
        movie['actor_3_name'],
        movie['director_name'],
        movie['genres'],
    ],
    sep=' ',
)

In [61]:
movie['comb']

0             Tom Hanks Tim Allen Don Rickles John Lasseter Animation Comedy Family
1        Robin Williams Jonathan Hyde Kirsten Dunst Joe Johnston Adventure Fanta...
2               Walter Matthau Jack Lemmon Ann-Margret Howard Deutch Romance Comedy
3        Whitney Houston Angela Bassett Loretta Devine Forest Whitaker Comedy Dr...
4                       Steve Martin Diane Keaton Martin Short Charles Shyer Comedy
                                            ...                                    
45438                          Monty Bane Lucy Butler David Grammer Ben Rock Horror
45439                       Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi
45440    Patrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance
45441                         Angel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama
45442    Erika Eleniak Adam Baldwin Julie du Page Mark L. Lester Action Drama Th...
Name: comb, Length: 39201, dtype: object

In [62]:
movie

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,John Lasseter,Tom Hanks,Tim Allen,Don Rickles,Animation Comedy Family,toy story,Tom Hanks Tim Allen Don Rickles John Lasseter Animation Comedy Family
1,Joe Johnston,Robin Williams,Jonathan Hyde,Kirsten Dunst,Adventure Fantasy Family,jumanji,Robin Williams Jonathan Hyde Kirsten Dunst Joe Johnston Adventure Fanta...
2,Howard Deutch,Walter Matthau,Jack Lemmon,Ann-Margret,Romance Comedy,grumpier old men,Walter Matthau Jack Lemmon Ann-Margret Howard Deutch Romance Comedy
3,Forest Whitaker,Whitney Houston,Angela Bassett,Loretta Devine,Comedy Drama Romance,waiting to exhale,Whitney Houston Angela Bassett Loretta Devine Forest Whitaker Comedy Dr...
4,Charles Shyer,Steve Martin,Diane Keaton,Martin Short,Comedy,father of the bride part ii,Steve Martin Diane Keaton Martin Short Charles Shyer Comedy
...,...,...,...,...,...,...,...
45438,Ben Rock,Monty Bane,Lucy Butler,David Grammer,Horror,the burkittsville 7,Monty Bane Lucy Butler David Grammer Ben Rock Horror
45439,Aaron Osborne,Lisa Boyle,Kena Land,Zaneta Polard,Sci-Fi,caged heat 3000,Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi
45440,John Irvin,Patrick Bergin,Uma Thurman,David Morrissey,Drama Action Romance,robin hood,Patrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance
45441,Lav Diaz,Angel Aquino,Perry Dizon,Hazel Orencio,Drama,century of birthing,Angel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama


In [63]:
# Reload the cleaned movie_metadata.csv features written to data.csv earlier in this notebook.
old = pd.read_csv('data.csv')

In [64]:
old

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens
...,...,...,...,...,...,...
5038,Scott Smith,Eric Mabius,Daphne Zuniga,Crystal Lowe,Comedy Drama,signed sealed delivered
5039,unknown,Natalie Zea,Valorie Curry,Sam Underwood,Crime Drama Mystery Thriller,the following
5040,Benjamin Roberds,Eva Boehnke,Maxwell Moody,David Chandler,Drama Horror Thriller,a plague so pleasant
5041,Daniel Hsia,Alan Ruck,Daniel Henney,Eliza Coupe,Comedy Drama Romance,shanghai calling


In [65]:
# 'comb' is one space-separated string per movie: the three actors, then the
# director, then the genres. A row with any missing part yields NaN.
old['comb'] = old['actor_1_name'].str.cat(
    [
        old['actor_2_name'],
        old['actor_3_name'],
        old['director_name'],
        old['genres'],
    ],
    sep=' ',
)

In [66]:
old

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James Cameron Action Adventure F...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore Verbinski Action Adventur...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman Sam Mendes Action Adventu...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt Christopher Nolan Action ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens,Doug Walker Rob Walker unknown Doug Walker Documentary
...,...,...,...,...,...,...,...
5038,Scott Smith,Eric Mabius,Daphne Zuniga,Crystal Lowe,Comedy Drama,signed sealed delivered,Eric Mabius Daphne Zuniga Crystal Lowe Scott Smith Comedy Drama
5039,unknown,Natalie Zea,Valorie Curry,Sam Underwood,Crime Drama Mystery Thriller,the following,Natalie Zea Valorie Curry Sam Underwood unknown Crime Drama Mystery Thr...
5040,Benjamin Roberds,Eva Boehnke,Maxwell Moody,David Chandler,Drama Horror Thriller,a plague so pleasant,Eva Boehnke Maxwell Moody David Chandler Benjamin Roberds Drama Horror ...
5041,Daniel Hsia,Alan Ruck,Daniel Henney,Eliza Coupe,Comedy Drama Romance,shanghai calling,Alan Ruck Daniel Henney Eliza Coupe Daniel Hsia Comedy Drama Romance


In [67]:
# Stack the two feature tables, `old` first and `movie` after it.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# supported equivalent and, like append, keeps each frame's own index labels.
new = pd.concat([old, movie])

  new = old.append(movie)


In [68]:
new

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James Cameron Action Adventure F...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore Verbinski Action Adventur...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman Sam Mendes Action Adventu...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt Christopher Nolan Action ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens,Doug Walker Rob Walker unknown Doug Walker Documentary
...,...,...,...,...,...,...,...
45438,Ben Rock,Monty Bane,Lucy Butler,David Grammer,Horror,the burkittsville 7,Monty Bane Lucy Butler David Grammer Ben Rock Horror
45439,Aaron Osborne,Lisa Boyle,Kena Land,Zaneta Polard,Sci-Fi,caged heat 3000,Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi
45440,John Irvin,Patrick Bergin,Uma Thurman,David Morrissey,Drama Action Romance,robin hood,Patrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance
45441,Lav Diaz,Angel Aquino,Perry Dizon,Hazel Orencio,Drama,century of birthing,Angel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama


In [69]:
# When a title occurs more than once, keep only its last occurrence.
new = new.drop_duplicates(subset='movie_title', keep='last')

In [70]:
new

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens,Doug Walker Rob Walker unknown Doug Walker Documentary
68,Rob Letterman,Amy Poehler,Rainn Wilson,Stephen Colbert,Action Adventure Animation Comedy Family Sci-Fi,monsters vs. aliens,Amy Poehler Rainn Wilson Stephen Colbert Rob Letterman Action Adventure...
175,George Miller,Robin Williams,Brad Pitt,Common,Animation Comedy Family Musical,happy feet 2,Robin Williams Brad Pitt Common George Miller Animation Comedy Family M...
177,unknown,Don Johnson,Philip Michael Thomas,John Diehl,Action Crime Drama Mystery Thriller,miami vice,Don Johnson Philip Michael Thomas John Diehl unknown Action Crime Drama...
191,Chris Miller,Salma Hayek,Constance Marie,Amy Sedaris,Action Adventure Animation Comedy Family Fantasy,puss in boots,Salma Hayek Constance Marie Amy Sedaris Chris Miller Action Adventure A...
...,...,...,...,...,...,...,...
45438,Ben Rock,Monty Bane,Lucy Butler,David Grammer,Horror,the burkittsville 7,Monty Bane Lucy Butler David Grammer Ben Rock Horror
45439,Aaron Osborne,Lisa Boyle,Kena Land,Zaneta Polard,Sci-Fi,caged heat 3000,Lisa Boyle Kena Land Zaneta Polard Aaron Osborne Sci-Fi
45440,John Irvin,Patrick Bergin,Uma Thurman,David Morrissey,Drama Action Romance,robin hood,Patrick Bergin Uma Thurman David Morrissey John Irvin Drama Action Romance
45441,Lav Diaz,Angel Aquino,Perry Dizon,Hazel Orencio,Drama,century of birthing,Angel Aquino Perry Dizon Hazel Orencio Lav Diaz Drama


In [71]:
# Write the combined, de-duplicated table to new_data.csv; index=False omits the row index.
new.to_csv('new_data.csv',index=False)