# IMDB 5000 Movie Dataset

https://www.kaggle.com/datasets/carolzhangdc/imdb-5000-movie-dataset?datasetId=7181&sortBy=voteCount&language=Python

一部商業上成功的電影不僅能娛樂觀眾，還能讓電影公司獲得巨大的利潤。 好的導演、有經驗的演員等很多因素對於拍出好的電影來說都是相當重要的。 但是，著名導演和演員總能帶來預期的票房收入，但不能保證imdb評分很高。

數據集來自 Kaggle 網站。 它包含 5043 部電影的 28 個變量，跨越 66 個國家/地區的。 有 2399 個獨特的導演姓名，以及數千名演員/女演員。 “imdb_score”是響應變量，而其他 27 個變量是可能的預測變量。

In [1]:
# 資料描述

# 變數名稱：描述
# movie_title                Title of the Movie, 電影名稱                       
# duration                   Duration in minutes, 電影長度
# director_name              Name of the Director of the Movie, 導演名
# director_facebook_likes    Number of likes of the Director on his Facebook Page, 導演粉絲專業讚數
# actor_1_name               Primary actor starring in the movie, 主要演員名
# actor_1_facebook_likes     Number of likes of the Actor_1 on his/her Facebook Page, 主要演員粉絲專業讚數
# actor_2_name               Other actor starring in the movie, 主要演員名
# actor_2_facebook_likes     Number of likes of the Actor_2 on his/her Facebook Page, 主要演員粉絲專業讚數
# actor_3_name               Other actor starring in the movie, 主要演員名
# actor_3_facebook_likes     Number of likes of the Actor_3 on his/her Facebook Page, 主要演員粉絲專業讚數
# num_user_for_reviews       Number of users who gave a review, 使用者給予回饋數
# num_critic_for_reviews     Number of critical reviews on imdb, 在IMDB上的評論數
# num_voted_users            Number of people who voted for the movie, 投票數
# cast_total_facebook_likes  Total number of facebook likes of the entire cast of the movie, 整部電影的讚數
# movie_facebook_likes       Number of Facebook likes in the movie page, 電影粉絲專業讚數
# plot_keywords              Keywords describing the movie plot, 電影關鍵詞
# facenumber_in_poster       Number of the actor who featured in the movie poster, 電影海報出現演員數
# color                      Film colorization. ‘Black and White’ or ‘Color’, 黑白電影/彩色電影
# genres                     Film categorization like ‘Animation’, ‘Comedy’, ‘Romance’, ‘Horror’, ‘Sci-Fi’, ‘Action’, ‘Family’, 電影類別
# title_year                 The year in which the movie is released (1916:2016), 電影出版年
# language                   English, Arabic, Chinese, French, German, Danish, Italian, Japanese etc, 語言
# country                    Country where the movie is produced, 出版國家
# content_rating             Content rating of the movie, 電影分級
# aspect_ratio               Aspect ratio the movie was made in, 電影長寬比
# movie_imdb_link            IMDB link of the movie, 電影imdb連結
# gross	Gross                earnings of the movie in Dollars, 電影總收益
# budget                     Budget of the movie in Dollars, 電影預算
# imdb_score                 IMDB Score of the movie on IMDB, IMDB評分

## Data Loading

In [2]:
#!pip install seaborn
#!pip install pandas-profiling
#!pip3 install ipywidgets

In [3]:
#importing the libraries that we use
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp

In [4]:
dataset = pd.read_csv('./movie_metadata.csv')
dataset.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [5]:
print(dataset.shape)
print(dataset.columns)
dataset.describe()

(5043, 28)
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')


Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


In [6]:
#dataset.profile_report()

## Data Cleaning

In [56]:
print(dataset.shape)
dataset.drop_duplicates(inplace = True)
print(dataset.shape)

(3851, 27)
(3817, 27)


In [7]:
numerical_cols = [col for col in dataset.columns if dataset[col].dtype != 'object']
categorical_cols = [col for col in dataset.columns if dataset[col].dtype == 'object']

In [8]:
dataset[numerical_cols].describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


In [9]:
dataset[categorical_cols].describe()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
count,5024,4939,5030,5043,5036,5043,5020,4890,5043,5031,5038,4740
unique,2,2398,3032,914,2097,4917,3521,4760,4919,47,65,18
top,Color,Steven Spielberg,Morgan Freeman,Drama,Robert De Niro,Ben-Hur,John Heard,based on novel,http://www.imdb.com/title/tt0232500/?ref_=fn_t...,English,USA,R
freq,4815,26,20,236,49,3,8,4,3,4704,3807,2118


In [10]:
#查看各變數的缺失值
dataset.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [11]:
dataset.color.unique()

array(['Color', nan, ' Black and White'], dtype=object)

In [12]:
#將color的19個缺失值改為'color', mode():眾數
print(dataset.color.isnull().sum())
color_mode = dataset['color'].mode().iloc[0]
dataset.color.fillna(color_mode, inplace = True)
print(dataset.color.isnull().sum())

19
0


In [14]:
#沒有導演的資料 直接刪除
print(dataset.director_name.nunique(), dataset.director_name.isnull().sum())
dataset = dataset.dropna(axis = 0, subset = ['director_name'] )

In [15]:
#評論數的缺失值由中位數替換
print(dataset.num_critic_for_reviews.describe())
print("-------------------------------------------")
print(dataset.num_critic_for_reviews.isnull().sum())
num_critic_for_reviews_median = dataset['num_critic_for_reviews'].median()
dataset.num_critic_for_reviews.fillna(num_critic_for_reviews_median, inplace = True)
print(dataset.num_critic_for_reviews.isnull().sum())

count    4894.000000
mean      142.552309
std       121.627905
min         1.000000
25%        53.000000
50%       112.000000
75%       197.000000
max       813.000000
Name: num_critic_for_reviews, dtype: float64
-------------------------------------------
45
0


In [16]:
#電影時長的缺失值由中位數替換
print(dataset.duration.describe())
print("-------------------------------------------")
print(dataset.duration.isnull().sum())
duration_median = dataset.duration.median()
dataset.duration.fillna(duration_median, inplace = True)
print(dataset.duration.isnull().sum())

count    4926.000000
mean      108.137028
std        22.577966
min         7.000000
25%        94.000000
50%       104.000000
75%       118.000000
max       330.000000
Name: duration, dtype: float64
-------------------------------------------
13
0


In [17]:
#或許需要修正
#導演facebook案讚由平均數取代
print(dataset.director_facebook_likes.describe())
print("-------------------------------------------")
print(dataset.director_facebook_likes.isnull().sum())
director_facebook_likes_mean = dataset.director_facebook_likes.mean()
dataset.director_facebook_likes.fillna(director_facebook_likes_mean, inplace = True)
print(dataset.director_facebook_likes.isnull().sum())

count     4939.000000
mean       686.509212
std       2813.328607
min          0.000000
25%          7.000000
50%         49.000000
75%        194.500000
max      23000.000000
Name: director_facebook_likes, dtype: float64
-------------------------------------------
0
0


In [18]:
#演員3按讚數的缺失值由平均數取代
print(dataset.actor_3_facebook_likes.describe())
print("-------------------------------------------")
print(dataset.actor_3_facebook_likes.isnull().sum())
actor_3_facebook_likes_mean = dataset.actor_3_facebook_likes.mean()
dataset.actor_3_facebook_likes.fillna(actor_3_facebook_likes_mean, inplace = True)
print(dataset.actor_3_facebook_likes.isnull().sum())

count     4919.00000
mean       651.20553
std       1681.08616
min          0.00000
25%        133.00000
50%        372.00000
75%        637.00000
max      23000.00000
Name: actor_3_facebook_likes, dtype: float64
-------------------------------------------
20
0


In [19]:
#演員2按讚數的缺失值由平均數取代
print(dataset.actor_2_facebook_likes.describe())
print("-------------------------------------------")
print(dataset.actor_2_facebook_likes.isnull().sum())
actor_2_facebook_likes_mean = dataset.actor_2_facebook_likes.mean()
dataset.actor_2_facebook_likes.fillna(actor_2_facebook_likes_mean, inplace = True)
print(dataset.actor_2_facebook_likes.isnull().sum())

count      4928.000000
mean       1675.850852
std        4080.170643
min           0.000000
25%         281.750000
50%         598.000000
75%         920.000000
max      137000.000000
Name: actor_2_facebook_likes, dtype: float64
-------------------------------------------
11
0


In [20]:
#演員1按讚數的缺失值由平均數取代
print(dataset.actor_1_facebook_likes.describe())
print("-------------------------------------------")
print(dataset.actor_1_facebook_likes.isnull().sum())
actor_1_facebook_likes_mean = dataset.actor_1_facebook_likes.mean()
dataset.actor_1_facebook_likes.fillna(actor_1_facebook_likes_mean, inplace = True)
print(dataset.actor_1_facebook_likes.isnull().sum())

count      4932.000000
mean       6668.287713
std       15150.446402
min           0.000000
25%         618.000000
50%         991.500000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64
-------------------------------------------
7
0


In [21]:
#收益有缺失值的資料直接刪去
print(dataset.shape)
print(dataset.gross.describe())
print(dataset.gross.isnull().sum())
dataset = dataset.dropna(axis = 0, subset = ['gross'])
dataset.gross.isnull().sum()
print(dataset.shape)

(4939, 28)
count    4.156000e+03
mean     4.850314e+07
std      6.846548e+07
min      1.620000e+02
25%      5.355042e+06
50%      2.552969e+07
75%      6.231942e+07
max      7.605058e+08
Name: gross, dtype: float64
783
(4156, 28)


In [22]:
dataset.isnull().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   6
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   4
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  13
facenumber_in_poster           7
plot_keywords                 40
movie_imdb_link                0
num_user_for_reviews           1
language                       3
country                        0
content_rating                64
budget                       265
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 105
movie_facebook_likes           0
dtype: int64

In [23]:
#把預算有缺失資料直接刪除
print(dataset.budget.isnull().sum())
dataset = dataset.dropna(axis = 0, subset = ['budget'])
print(dataset.budget.isnull().sum())

265
0


In [24]:
#把缺少演員3名子的資料直接刪除
print(dataset.actor_3_name.isnull().sum())
dataset = dataset.dropna(axis = 0, subset = ['actor_3_name'])
print(dataset.actor_3_name.isnull().sum())

10
0


In [25]:
#把電影海報出現人數的缺失資料填入中位數
print(dataset.facenumber_in_poster.describe())
print(dataset.facenumber_in_poster.isnull().sum())
facenumber_in_poster_median = dataset.facenumber_in_poster.median()
dataset.facenumber_in_poster.fillna(facenumber_in_poster_median, inplace = True)
print(dataset.facenumber_in_poster.isnull().sum())

count    3875.000000
mean        1.383226
std         2.056878
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max        43.000000
Name: facenumber_in_poster, dtype: float64
6
0


In [26]:
dataset.plot_keywords.unique()

array(['avatar|future|marine|native|paraplegic',
       'goddess|marriage ceremony|marriage proposal|pirate|singapore',
       'bomb|espionage|sequel|spy|terrorist', ...,
       'assassin|death|guitar|gun|mariachi',
       'written and directed by cast member',
       'actress name in title|crush|date|four word title|video camera'],
      dtype=object)

In [27]:
dataset.plot_keywords.value_counts()

assistant|experiment|frankenstein|medical student|scientist                          3
animal name in title|ape abducts a woman|gorilla|island|king kong                    3
1940s|child hero|fantasy world|orphan|reference to peter pan                         3
alien friendship|alien invasion|australia|flying car|mother daughter relationship    3
eighteen wheeler|illegal street racing|truck|trucker|undercover cop                  3
                                                                                    ..
drifter|past life regression|psychological testing|right hand man|scientology        1
african american|coach|football movie|nonlinear timeline|syracuse university         1
alien|attack|based on novel|based on young adult novel|fear                          1
boxing|boxing match|fight|montage|philadelphia                                       1
actress name in title|crush|date|four word title|video camera                        1
Name: plot_keywords, Length: 3750, dtype: i

In [28]:
dataset.language.value_counts()

English       3698
French          37
Spanish         26
Mandarin        15
German          13
Japanese        12
Hindi           10
Cantonese        8
Italian          7
Portuguese       5
Korean           5
Norwegian        4
Dutch            3
Persian          3
Thai             3
Danish           3
Aboriginal       2
Dari             2
Indonesian       2
Hebrew           2
Russian          1
Romanian         1
Vietnamese       1
Arabic           1
Dzongkha         1
Zulu             1
None             1
Aramaic          1
Czech            1
Telugu           1
Icelandic        1
Filipino         1
Hungarian        1
Maya             1
Bosnian          1
Mongolian        1
Kazakh           1
Swedish          1
Name: language, dtype: int64

In [29]:
#把語言缺失資料設為 'English'
print(dataset.language.isnull().sum())
language_mode = dataset.language.mode().iloc[0]
# print(language_mode)
dataset.language.fillna(language_mode, inplace = True)
print(dataset.language.isnull().sum())

3
0


In [30]:
#把缺乏關鍵字的資料直接刪除
print(dataset.plot_keywords.isnull().sum())
dataset = dataset.dropna(axis = 0, subset = ['plot_keywords'])
print(dataset.plot_keywords.isnull().sum())

30
0


In [31]:
dataset.content_rating.unique()

array(['PG-13', 'PG', 'G', 'R', 'Approved', 'NC-17', nan, 'X',
       'Not Rated', 'Unrated', 'M', 'GP', 'Passed'], dtype=object)

In [32]:
#把沒有被分級的電影，新增類別 "Not Rated"
print(dataset.content_rating.isnull().sum())
dataset.content_rating.fillna('Not Rated', inplace = True)
print(dataset.content_rating.isnull().sum())

38
0


In [33]:
print(dataset.aspect_ratio.describe())
print("-------------------------------------------")
print(dataset.aspect_ratio.isnull().sum())
dataset.aspect_ratio.unique()
aspect_ratio_mode = dataset.aspect_ratio.mode().iloc[0]
dataset.aspect_ratio.fillna(aspect_ratio_mode, inplace = True)
print(dataset.aspect_ratio.isnull().sum())

count    3790.000000
mean        2.110164
std         0.352579
min         1.180000
25%         1.850000
50%         2.350000
75%         2.350000
max        16.000000
Name: aspect_ratio, dtype: float64
-------------------------------------------
61
0


In [34]:
dataset.isnull().sum()

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [35]:
dataset.reset_index(inplace = True, drop = True)

In [36]:
# dataset.profile_report()

In [37]:
numerical_cols, categorical_cols

(['num_critic_for_reviews',
  'duration',
  'director_facebook_likes',
  'actor_3_facebook_likes',
  'actor_1_facebook_likes',
  'gross',
  'num_voted_users',
  'cast_total_facebook_likes',
  'facenumber_in_poster',
  'num_user_for_reviews',
  'budget',
  'title_year',
  'actor_2_facebook_likes',
  'imdb_score',
  'aspect_ratio',
  'movie_facebook_likes'],
 ['color',
  'director_name',
  'actor_2_name',
  'genres',
  'actor_1_name',
  'movie_title',
  'actor_3_name',
  'plot_keywords',
  'movie_imdb_link',
  'language',
  'country',
  'content_rating'])

In [38]:
dataset['color'] = dataset.color.map({'Color' : 1 , ' Black and White' : 0})

In [39]:
#把導演在該資料所有的作品個數新增為一行，並且刪除導演的名稱
# dataset.director_name.unique(), dataset.director_name.nunique()
director_name_value_counts = dataset.director_name.value_counts()
director_name_value_counts  = pd.DataFrame(director_name_value_counts).reset_index().rename(columns = {'index': 'director_name', 'director_name':'director_name_value_counts'})
dataset = pd.merge(dataset, director_name_value_counts,left_on = 'director_name', right_on = 'director_name', how = 'left')
dataset = dataset.drop(columns = 'director_name')
print(dataset)

      color  num_critic_for_reviews  duration  director_facebook_likes  \
0         1                   723.0     178.0                      0.0   
1         1                   302.0     169.0                    563.0   
2         1                   602.0     148.0                      0.0   
3         1                   813.0     164.0                  22000.0   
4         1                   462.0     132.0                    475.0   
...     ...                     ...       ...                      ...   
3846      1                   143.0      77.0                    291.0   
3847      1                    35.0      80.0                      0.0   
3848      1                    56.0      81.0                      0.0   
3849      1                    14.0      95.0                      0.0   
3850      1                    43.0      90.0                     16.0   

      actor_3_facebook_likes        actor_2_name  actor_1_facebook_likes  \
0                      855.0    Joe

In [40]:
#把演員1在該資料所有的作品個數新增為一行，並且刪除演員一的名稱
#dataset.actor_1_name.unique(), dataset.actor_1_name.nunique()
actor_1_name_value_counts = dataset.actor_1_name.value_counts()
actor_1_name_value_counts  = pd.DataFrame(actor_1_name_value_counts).reset_index().rename(columns = {'index': 'actor_1_name', 'actor_1_name':'actor_1_name_value_counts'})
dataset = pd.merge(dataset, actor_1_name_value_counts,left_on = 'actor_1_name', right_on = 'actor_1_name', how = 'left')
dataset = dataset.drop(columns = 'actor_1_name')

In [41]:
#把演員2在該資料所有的作品個數新增為一行，並且刪除演員二的名稱
#print(dataset.actor_2_name.unique(), dataset.actor_2_name.nunique())
actor_2_name_value_counts = dataset.actor_2_name.value_counts()
actor_2_name_value_counts  = pd.DataFrame(actor_2_name_value_counts).reset_index().rename(columns = {'index': 'actor_2_name', 'actor_2_name':'actor_2_name_value_counts'})
dataset = pd.merge(dataset, actor_2_name_value_counts,left_on = 'actor_2_name', right_on = 'actor_2_name', how = 'left')
dataset = dataset.drop(columns = 'actor_2_name')

In [42]:
#把演員3在該資料所有的作品個數新增為一行，並且刪除演員三的名稱
# print(dataset.actor_3_name.unique(), dataset.actor_3_name.nunique())
actor_3_name_value_counts = dataset.actor_3_name.value_counts()
actor_3_name_value_counts = pd.DataFrame(actor_3_name_value_counts).reset_index().rename(columns = {'index' : 'actor_3_name', 'actor_3_name' : 'actor_3_name_value_counts'})
dataset= pd.merge(dataset, actor_3_name_value_counts,left_on = 'actor_3_name', right_on = 'actor_3_name', how = 'left')
dataset = dataset.drop(columns = 'actor_3_name')

In [43]:
#新增一行為主要類別 'main_genre'
print(dataset.genres.unique(), dataset.genres.nunique())
dataset['main_genre'] = dataset.genres.str.split('|').str[0]
print(dataset['main_genre'])

0            Action
1            Action
2            Action
3            Action
4            Action
           ...     
3846          Drama
3847       Thriller
3848         Action
3849         Comedy
3850    Documentary
Name: main_genre, Length: 3851, dtype: object


In [44]:
#將 'main_genre' 轉為數字類別
# print(dataset.main_genre.unique(), dataset.main_genre.nunique())
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['main_genre'] = le.fit_transform(dataset.main_genre)
print(dataset['main_genre'])

['Action' 'Adventure' 'Drama' 'Animation' 'Comedy' 'Mystery' 'Crime'
 'Biography' 'Fantasy' 'Documentary' 'Sci-Fi' 'Horror' 'Romance' 'Family'
 'Western' 'Musical' 'Thriller'] 17
0        0
1        0
2        0
3        0
4        0
        ..
3846     7
3847    15
3848     0
3849     4
3850     6
Name: main_genre, Length: 3851, dtype: int32


In [45]:
#把相同電影類別進行記數，並新增 'genres_value_counts' ，再將原本 'genres' 刪去
genres_value_counts = dataset.genres.value_counts()
print(genres_value_counts)
genres_value_counts  = pd.DataFrame(genres_value_counts).reset_index().rename(columns = {'index' : 'genres', 'genres' : 'genres_value_counts'})
dataset = pd.merge(dataset, genres_value_counts,left_on = 'genres', right_on = 'genres', how = 'left')
dataset = dataset.drop(columns = 'genres')


Comedy|Drama|Romance                        151
Drama                                       149
Comedy|Drama                                146
Comedy                                      146
Comedy|Romance                              135
                                           ... 
Action|Adventure|Fantasy|Horror|Thriller      1
Action|Biography|Drama                        1
Animation|Comedy|Family|Horror|Sci-Fi         1
Action|Adventure|Drama|Fantasy|War            1
Comedy|Crime|Horror                           1
Name: genres, Length: 758, dtype: int64


In [46]:
#把缺少 "title" 的資料直接刪除
print(dataset.movie_title.unique(), dataset.movie_title.nunique())
dataset = dataset.drop(columns = 'movie_title')

['Avatar\xa0' "Pirates of the Caribbean: At World's End\xa0" 'Spectre\xa0'
 ... 'El Mariachi\xa0' 'Newlyweds\xa0' 'My Date with Drew\xa0'] 3749


In [47]:
#新增 'main_plot_keyword' ，並把 'plot_keywords' 刪除
# print(dataset.plot_keywords.unique(), dataset.plot_keywords.nunique())
dataset['main_plot_keyword'] = dataset.plot_keywords.str.split('|').str[0]
dataset = dataset.drop(columns = 'plot_keywords')
print(dataset.main_plot_keyword.unique(), dataset.main_plot_keyword.nunique())

['avatar' 'goddess' 'bomb' ... 'jihad'
 'written and directed by cast member' 'actress name in title'] 1688


In [48]:
#將 main_plot_keyword 記數，並新增一行 'main_plot_keyword_value_counts'，然後將原始行刪除
main_plot_keyword_value_counts = dataset.main_plot_keyword.value_counts()
main_plot_keyword_value_counts = pd.DataFrame(main_plot_keyword_value_counts).reset_index().rename(columns = {'index' : 'main_plot_keyword', 'main_plot_keyword' : 'main_plot_keyword_value_counts'})
dataset = pd.merge(dataset, main_plot_keyword_value_counts, left_on = 'main_plot_keyword', right_on = 'main_plot_keyword', how = 'left')
dataset = dataset.drop(columns = 'main_plot_keyword')


In [49]:
#將 'movie_imdb_link' 資料刪去
print(dataset.movie_imdb_link.unique(), dataset.movie_imdb_link.nunique())
dataset = dataset.drop(columns = 'movie_imdb_link')

['http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1'
 'http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1'
 'http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1' ...
 'http://www.imdb.com/title/tt0104815/?ref_=fn_tt_tt_1'
 'http://www.imdb.com/title/tt1880418/?ref_=fn_tt_tt_1'
 'http://www.imdb.com/title/tt0378407/?ref_=fn_tt_tt_1'] 3750


In [50]:
#將 'language' encode
print(dataset.language.unique(), dataset.language.nunique())
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
dataset['language'] = le1.fit_transform(dataset.language)


['English' 'Mandarin' 'Aboriginal' 'Spanish' 'French' 'Filipino' 'Maya'
 'Kazakh' 'Telugu' 'Cantonese' 'Japanese' 'Aramaic' 'Italian' 'Dutch'
 'Dari' 'German' 'Mongolian' 'Thai' 'Bosnian' 'Korean' 'Hungarian' 'Hindi'
 'Icelandic' 'Danish' 'Portuguese' 'Norwegian' 'Czech' 'Russian' 'None'
 'Zulu' 'Hebrew' 'Dzongkha' 'Arabic' 'Vietnamese' 'Indonesian' 'Romanian'
 'Persian' 'Swedish'] 38


In [51]:
#將 'country' encode
print(dataset.country.unique(), dataset.country.nunique())
from sklearn.preprocessing import LabelEncoder
le2 = LabelEncoder()
dataset['country'] = le2.fit_transform(dataset.country)

['USA' 'UK' 'New Zealand' 'Canada' 'Australia' 'Germany' 'China'
 'New Line' 'France' 'Japan' 'Spain' 'Hong Kong' 'Czech Republic' 'Peru'
 'South Korea' 'India' 'Aruba' 'Denmark' 'Ireland' 'South Africa' 'Italy'
 'Romania' 'Chile' 'Netherlands' 'Hungary' 'Russia' 'Belgium' 'Greece'
 'Taiwan' 'Official site' 'Thailand' 'Iran' 'West Germany' 'Georgia'
 'Mexico' 'Iceland' 'Brazil' 'Finland' 'Norway' 'Argentina' 'Colombia'
 'Poland' 'Israel' 'Indonesia' 'Afghanistan' 'Sweden' 'Philippines'] 47


In [52]:
#將 'content_rating' encode
print(dataset.content_rating.unique(),dataset.content_rating.nunique())
from sklearn.preprocessing import LabelEncoder
le3 = LabelEncoder()
dataset['content_rating'] = le3.fit_transform(dataset.content_rating)


['PG-13' 'PG' 'G' 'R' 'Approved' 'NC-17' 'Not Rated' 'X' 'Unrated' 'M'
 'GP' 'Passed'] 12


In [53]:
dataset.head()

Unnamed: 0,color,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,...,imdb_score,aspect_ratio,movie_facebook_likes,director_name_value_counts,actor_1_name_value_counts,actor_2_name_value_counts,actor_3_name_value_counts,main_genre,genres_value_counts,main_plot_keyword_value_counts
0,1,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,...,7.9,1.78,33000,7,4,3,3,0,12,2
1,1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,...,7.1,2.35,0,7,39,7,4,0,25,1
2,1,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,...,6.8,2.35,85000,8,5,2,1,0,45,7
3,1,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,...,8.5,2.35,164000,8,9,5,2,0,22,2
4,1,462.0,132.0,475.0,530.0,640.0,73058679.0,212204,1873,1.0,...,6.6,2.35,24000,3,2,3,1,0,48,69


In [54]:
print(dataset.dtypes)

color                               int64
num_critic_for_reviews            float64
duration                          float64
director_facebook_likes           float64
actor_3_facebook_likes            float64
actor_1_facebook_likes            float64
gross                             float64
num_voted_users                     int64
cast_total_facebook_likes           int64
facenumber_in_poster              float64
num_user_for_reviews              float64
language                            int32
country                             int32
content_rating                      int32
budget                            float64
title_year                        float64
actor_2_facebook_likes            float64
imdb_score                        float64
aspect_ratio                      float64
movie_facebook_likes                int64
director_name_value_counts          int64
actor_1_name_value_counts           int64
actor_2_name_value_counts           int64
actor_3_name_value_counts         

In [55]:
dataset.profile_report()
