# 此文件对数据集进行预处理

In [31]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import numpy as np
import pandas as pd
import math
from wordcloud import WordCloud
from scipy import stats
from collections import Counter
from math import isnan
%matplotlib inline
pd.set_option('display.float_format', lambda x:'%.2f'%x)

## 训练数据预处理

In [32]:
#载入训练数据
ftrain_data="bop_train.csv"
train_data = pd.read_csv(ftrain_data, index_col=False, low_memory=False)
train_data.shape

(2500, 23)

In [33]:
#删除 imdb_id, original_title, overview, poster_path, status, title 等属性
train_data.drop(['imdb_id'], axis=1, inplace=True)
train_data.drop(['original_title'], axis=1, inplace=True)
train_data.drop(['overview'], axis=1, inplace=True)
train_data.drop(['poster_path'], axis=1, inplace=True)
train_data.drop(['status'], axis=1, inplace=True)
train_data.drop(['title'], axis=1, inplace=True)
train_data.shape

(2500, 17)

In [34]:
#转化为“有/无”（1/0）属性后删除原属性（4 种）
#belongs_to_collection, homepage, original_language, tagline

#构建二值属性
train_data['collection'] = 0
train_data.loc[train_data['belongs_to_collection'].isnull() == False, 'collection'] = 1
train_data['has_homepage'] = 0
train_data.loc[train_data['homepage'].isnull() == False, 'has_homepage'] = 1
train_data['is_en'] = 0
train_data.loc[train_data['original_language'] == 'en', 'is_en'] = 1
train_data['has_tagline'] = 0
train_data.loc[train_data['tagline'].isnull() == False, 'has_tagline'] = 1

#删除原有属性
train_data.drop(['belongs_to_collection'], axis=1, inplace=True)
train_data.drop(['homepage'], axis=1, inplace=True)
train_data.drop(['original_language'], axis=1, inplace=True)
train_data.drop(['tagline'], axis=1, inplace=True)
train_data.shape

(2500, 17)

In [35]:
#转化为计数属性并删除原属性（6 种）
#production_companies, production_countries, spoken_languages, Keywords, cast, crew

#转化为计数属性
def change_to_count_attri(data, attri):
    count=[]
    for i in data[attri]:
        if(not(pd.isnull(i))):
            count.append(len(eval(i)))

        else:
            count.append(0)
    attri1 = "num_" + attri
    data[attri1] = count
    data.drop([attri], axis=1, inplace=True)
    

change_to_count_attri(train_data,'production_companies')
change_to_count_attri(train_data,'production_countries')
change_to_count_attri(train_data,'spoken_languages')
change_to_count_attri(train_data,'Keywords')
change_to_count_attri(train_data,'cast')
change_to_count_attri(train_data,'crew')
train_data.shape

(2500, 17)

In [36]:
#genres 转化为计数属性并删除原属性
#选取Adventure，Animation，Fantasy，Family，Action 和 Science_Fiction作为流行类型

train_data['is_popular_genres'] = 0
for i in range(len(train_data)):
    if train_data['genres'][i] == train_data['genres'][i]:
        for genre in eval(train_data['genres'][i]):
            if genre['name'] == "Adventure":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Animation":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Fantasy":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Family":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Action":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Science_Fiction":
                train_data['is_popular_genres'][i] += 1
            else:
                continue


train_data.drop(['genres'], axis=1, inplace=True)
train_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value 

(2500, 17)

In [37]:
#均值填充 runtime 缺失值和零值
mean = train_data['runtime'].mean()
print(mean)
train_data['runtime'] = train_data['runtime'].fillna(mean)
train_data['runtime'] = train_data['runtime'].replace(0, mean)
train_data.shape

107.94115292233786


(2500, 17)

In [38]:
#release_date 分为年、月、日（星期）三个属性

train_data['release_date'] = pd.to_datetime(train_data['release_date'], format='%m/%d/%y')
train_data["release_year"] = pd.to_datetime(train_data["release_date"]).dt.year.astype(int)
train_data["release_day"] = pd.to_datetime(train_data["release_date"]).dt.dayofweek.astype(int)
train_data["release_month"] = pd.to_datetime(train_data["release_date"]).dt.month.astype(int)
train_data.drop(['release_date'], axis=1, inplace=True)
train_data.shape

(2500, 19)

In [39]:
train_data.head()

Unnamed: 0,id,budget,popularity,runtime,revenue,collection,has_homepage,is_en,has_tagline,num_production_companies,num_production_countries,num_spoken_languages,num_Keywords,num_cast,num_crew,is_popular_genres,release_year,release_day,release_month
0,1,14000000,6.58,93.0,12314651,1,0,1,1,3,1,1,4,24,72,0,2015,4,2
1,2,40000000,8.25,113.0,95149435,1,0,1,1,1,1,1,4,20,9,1,2004,4,8
2,3,3300000,64.3,105.0,13092000,0,1,1,1,3,1,1,12,51,64,0,2014,4,10
3,4,1200000,3.17,122.0,16000000,0,1,0,0,0,1,2,7,7,3,0,2012,4,3
4,5,0,1.15,118.0,3923970,0,0,0,0,0,1,1,0,4,2,1,2009,3,2


In [40]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        2500 non-null   int64  
 1   budget                    2500 non-null   int64  
 2   popularity                2500 non-null   float64
 3   runtime                   2500 non-null   float64
 4   revenue                   2500 non-null   int64  
 5   collection                2500 non-null   int64  
 6   has_homepage              2500 non-null   int64  
 7   is_en                     2500 non-null   int64  
 8   has_tagline               2500 non-null   int64  
 9   num_production_companies  2500 non-null   int64  
 10  num_production_countries  2500 non-null   int64  
 11  num_spoken_languages      2500 non-null   int64  
 12  num_Keywords              2500 non-null   int64  
 13  num_cast                  2500 non-null   int64  
 14  num_crew

In [28]:
#保存处理后的数据集
#train_data.to_csv("bop_train_processed.csv", index=None)

In [30]:
# ftrain_data="bop_train_processed.csv"
# train_data = pd.read_csv(ftrain_data, index_col=False, low_memory=False)
# train_data.head()

In [41]:
#载入训练数据
ftrain_data="bop_test.csv"
train_data = pd.read_csv(ftrain_data, index_col=False, low_memory=False)
train_data.shape

(500, 23)

In [42]:
#删除 imdb_id, original_title, overview, poster_path, status, title 等属性
train_data.drop(['imdb_id'], axis=1, inplace=True)
train_data.drop(['original_title'], axis=1, inplace=True)
train_data.drop(['overview'], axis=1, inplace=True)
train_data.drop(['poster_path'], axis=1, inplace=True)
train_data.drop(['status'], axis=1, inplace=True)
train_data.drop(['title'], axis=1, inplace=True)
train_data.shape

(500, 17)

In [43]:
#转化为“有/无”（1/0）属性后删除原属性（4 种）
#belongs_to_collection, homepage, original_language, tagline

#构建二值属性
train_data['collection'] = 0
train_data.loc[train_data['belongs_to_collection'].isnull() == False, 'collection'] = 1
train_data['has_homepage'] = 0
train_data.loc[train_data['homepage'].isnull() == False, 'has_homepage'] = 1
train_data['is_en'] = 0
train_data.loc[train_data['original_language'] == 'en', 'is_en'] = 1
train_data['has_tagline'] = 0
train_data.loc[train_data['tagline'].isnull() == False, 'has_tagline'] = 1

#删除原有属性
train_data.drop(['belongs_to_collection'], axis=1, inplace=True)
train_data.drop(['homepage'], axis=1, inplace=True)
train_data.drop(['original_language'], axis=1, inplace=True)
train_data.drop(['tagline'], axis=1, inplace=True)
train_data.shape

(500, 17)

In [44]:
#转化为计数属性并删除原属性（6 种）
#production_companies, production_countries, spoken_languages, Keywords, cast, crew

#转化为计数属性
def change_to_count_attri(data, attri):
    count=[]
    for i in data[attri]:
        if(not(pd.isnull(i))):
            count.append(len(eval(i)))

        else:
            count.append(0)
    attri1 = "num_" + attri
    data[attri1] = count
    data.drop([attri], axis=1, inplace=True)
    

change_to_count_attri(train_data,'production_companies')
change_to_count_attri(train_data,'production_countries')
change_to_count_attri(train_data,'spoken_languages')
change_to_count_attri(train_data,'Keywords')
change_to_count_attri(train_data,'cast')
change_to_count_attri(train_data,'crew')
train_data.shape

(500, 17)

In [45]:
#genres 转化为计数属性并删除原属性
#选取Adventure，Animation，Fantasy，Family，Action 和 Science_Fiction作为流行类型

train_data['is_popular_genres'] = 0
for i in range(len(train_data)):
    if train_data['genres'][i] == train_data['genres'][i]:
        for genre in eval(train_data['genres'][i]):
            if genre['name'] == "Adventure":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Animation":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Fantasy":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Family":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Action":
                train_data['is_popular_genres'][i] += 1
            elif genre['name'] == "Science_Fiction":
                train_data['is_popular_genres'][i] += 1
            else:
                continue


train_data.drop(['genres'], axis=1, inplace=True)
train_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_popular_genres'][i] += 1
A value 

(500, 17)

In [46]:
#均值填充 runtime 缺失值和零值
mean = 107.94115292233786
print(mean)
train_data['runtime'] = train_data['runtime'].fillna(mean)
train_data['runtime'] = train_data['runtime'].replace(0, mean)
train_data.shape

107.94115292233786


(500, 17)

In [47]:
#release_date 分为年、月、日（星期）三个属性

train_data['release_date'] = pd.to_datetime(train_data['release_date'], format='%m/%d/%y')
train_data["release_year"] = pd.to_datetime(train_data["release_date"]).dt.year.astype(int)
train_data["release_day"] = pd.to_datetime(train_data["release_date"]).dt.dayofweek.astype(int)
train_data["release_month"] = pd.to_datetime(train_data["release_date"]).dt.month.astype(int)
train_data.drop(['release_date'], axis=1, inplace=True)
train_data.shape

(500, 19)

In [48]:
train_data.head()

Unnamed: 0,id,budget,popularity,runtime,revenue,collection,has_homepage,is_en,has_tagline,num_production_companies,num_production_countries,num_spoken_languages,num_Keywords,num_cast,num_crew,is_popular_genres,release_year,release_day,release_month
0,2501,25000000,5.28,107.0,19829957,0,0,1,1,2,1,1,5,14,10,0,2010,4,10
1,2502,0,3.72,94.0,8887603,0,0,1,1,2,1,1,7,12,2,0,2012,4,12
2,2503,3500000,14.57,91.0,28215918,1,1,1,1,1,1,1,4,18,5,0,1995,2,4
3,2504,0,11.46,95.0,49364621,1,0,1,1,1,1,1,9,20,6,1,1985,3,7
4,2505,0,5.07,98.0,3960327,0,0,1,1,2,0,1,7,12,4,0,1989,4,4


In [49]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        500 non-null    int64  
 1   budget                    500 non-null    int64  
 2   popularity                500 non-null    float64
 3   runtime                   500 non-null    float64
 4   revenue                   500 non-null    int64  
 5   collection                500 non-null    int64  
 6   has_homepage              500 non-null    int64  
 7   is_en                     500 non-null    int64  
 8   has_tagline               500 non-null    int64  
 9   num_production_companies  500 non-null    int64  
 10  num_production_countries  500 non-null    int64  
 11  num_spoken_languages      500 non-null    int64  
 12  num_Keywords              500 non-null    int64  
 13  num_cast                  500 non-null    int64  
 14  num_crew  

In [50]:
#保存处理后的数据集
train_data.to_csv("bop_test_processed.csv", index=None)

In [30]:
# ftrain_data="bop_train_processed.csv"
# train_data = pd.read_csv(ftrain_data, index_col=False, low_memory=False)
# train_data.head()