# 映画のジャンルと評価の相関

In [16]:
import numpy as np
import pandas as pd

# Input CSV locations, given relative to the current working directory.
metadata_URL = "./data/movies_metadata.csv"  # source of each movie's "id" and "genres"
genres_URL = "./data/genres.csv"  # source of the genre "id" / "name" lists
scores_URL = "./data/95%_least_plausible.csv"  # source of "movieId" and "95%_least_plausible"
links_URL = "./data/links.csv"  # movieId / imdbId / tmdbId correspondence table

## 1.データ整形

In [17]:
## Convert a string stored in the data frame into a list of dicts
def make_list_from_str(string):
    """Parse a string that holds a Python literal, e.g. a list of dicts.

    Args:
        string: Cell value to parse, such as
            ``"[{'id': 16, 'name': 'Animation'}]"``.

    Returns:
        The parsed Python object when ``string`` is a ``str``; ``np.nan``
        for any non-string input.

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
    """
    # Imported locally so this cell does not depend on another import cell.
    import ast

    if not isinstance(string, str):
        return np.nan
    # literal_eval accepts only literals (lists, dicts, strings, numbers, ...),
    # so text coming from the CSV cannot run arbitrary code as eval() could.
    return ast.literal_eval(string)

In [3]:
## Build a list holding only the id taken from each genre dict
def extract_ids(genres_list):
    """Return the ``"id"`` value of every dict in ``genres_list``, in order."""
    return [entry["id"] for entry in genres_list]

In [18]:
## Build a list holding only the name taken from each genre dict
def extract_names(genres_list):
    """Return the ``"name"`` value of every dict in ``genres_list``, in order."""
    return [entry["name"] for entry in genres_list]

In [19]:
# Read only the two columns used here. "id" is forced to str because
# get_movieId() later calls .isdigit() on each id; skipping the unused
# columns also avoids type inference on them (presumably the source of the
# parser warning seen in the original run -- TODO confirm).
metadata_df = pd.read_csv(
    metadata_URL, usecols=["id", "genres"], dtype={"id": str}
)[["id", "genres"]]
# Stringified list of genre dicts -> list of dicts -> list of genre names.
metadata_df["genres"] = metadata_df["genres"].map(make_list_from_str)
metadata_df["genres"] = metadata_df["genres"].map(extract_names)
# Number of genres attached to each movie.
metadata_df["genre_count"] = metadata_df["genres"].map(len)
metadata_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,genres,genre_count
0,862,"[Animation, Comedy, Family]",3
1,8844,"[Adventure, Fantasy, Family]",3
2,15602,"[Romance, Comedy]",2
3,31357,"[Comedy, Drama, Romance]",3
4,11862,[Comedy],1


In [26]:
## Build the list of all genre ids and the list of all genre names

genres_df = pd.read_csv(genres_URL)
all_genre_id = genres_df["id"].tolist()
all_genre_name = genres_df["name"].tolist()
print("ids: ", all_genre_id)
print("names: ", all_genre_name)

ids:  [16, 35, 10751, 12, 14, 10749, 18, 28, 80, 53, 27, 36, 878, 9648, 10752, 10769, 10402, 99, 37, 10770]
names:  ['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']


## 2.ダミー変数の作成

#### まず、ジャンルがないデータが存在するので、削除

In [20]:
# Number of movies for each genre_count value (0 means the genre list is empty).
metadata_df.groupby("genre_count")["id"].count()

genre_count
0     2442
1    14559
2    14480
3     9586
4     3380
5      835
6      157
7       24
8        3
Name: id, dtype: int64

In [21]:
# Keep only movies that have at least one genre, then re-check the distribution.
metadata_df = metadata_df[metadata_df["genre_count"] > 0]
metadata_df.groupby("genre_count")["id"].count()

genre_count
1    14559
2    14480
3     9586
4     3380
5      835
6      157
7       24
8        3
Name: id, dtype: int64

#### ダミー変数を作る

In [30]:
# One-hot encode every genre name, then collapse back to one row per movie.
# apply(pd.Series).stack() yields one row per (movie, list position); summing
# within the original row label (index level 0) gives, per movie, how many
# times each genre name occurs.
# groupby(level=0).sum() replaces .sum(level=0), which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
genre_dummies = (
    pd.get_dummies(metadata_df["genres"].apply(pd.Series).stack())
    .groupby(level=0)
    .sum()
)
genre_dummies.head()

Unnamed: 0,Action,Adventure,Animation,Aniplex,BROSTA TV,Carousel Productions,Comedy,Crime,Documentary,Drama,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 変な名前のジャンルがあるので、削除する

In [32]:
# Keep only the columns named in all_genre_name, in that order; any other
# dummy column (e.g. the "Aniplex" column visible in the previous output)
# is dropped.
genre_dummies = genre_dummies[all_genre_name]
genre_dummies.head()

Unnamed: 0,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
# Put each movie's id next to its genre dummy columns (column-wise concat,
# rows are matched on the index).
movie_data = pd.concat([metadata_df[["id"]] , genre_dummies], axis=1)
movie_data

Unnamed: 0,id,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,...,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie
0,862,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8844,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15602,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31357,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11862,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,949,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6,11860,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,45325,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8,9091,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,710,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## 3.予測データ（映画のスコア）とのデータベース結合

#### とりあえず、ともやが作った95%下限値スコアを予測するモデルを作る

#### metadata_dfの各idに対応するmovieIdを、links.csvから取得する
※吉田のデータのidはmovieId, metadataのidはtmdbIdに対応

In [34]:
links_df = pd.read_csv(links_URL)
# Drop rows containing any missing value, so int() below never sees NaN.
links_df = links_df.dropna()
# Convert tmdbId to int so it can be compared against int(tmdbId) in get_movieId.
links_df["tmdbId"] = links_df["tmdbId"].map(int)
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [35]:
def get_movieId(tmdbId_list, links_df):
    """Look up the ``movieId`` for each entry of ``tmdbId_list``.

    Args:
        tmdbId_list: Iterable of ids. An entry is looked up only when its
            string form consists solely of digits; anything else (for
            example a date-like string or NaN) yields ``np.nan``.
        links_df: DataFrame with ``"tmdbId"`` and ``"movieId"`` columns.

    Returns:
        A list with one element per input id: the matching ``movieId``, or
        ``np.nan`` when the id is not numeric or does not match exactly one
        row of ``links_df``.
    """
    # Build the lookup table once instead of filtering links_df for every id.
    # keep=False removes every tmdbId that occurs more than once, which
    # preserves the "exactly one matching row" rule.
    unique_links = links_df.drop_duplicates(subset="tmdbId", keep=False)
    movieid_by_tmdbid = dict(
        zip(unique_links["tmdbId"].tolist(), unique_links["movieId"].tolist())
    )

    movieid_list = []
    for tmdbId in tmdbId_list:
        # str() makes the digit check work for integer ids as well as for
        # string ids, and turns NaN / other non-strings into a miss.
        if str(tmdbId).isdigit():
            movieid_list.append(movieid_by_tmdbid.get(int(tmdbId), np.nan))
        else:
            movieid_list.append(np.nan)
    return movieid_list

In [36]:
# Translate every metadata id (a tmdbId) into its movieId and store it as a
# new column; ids without a usable match become NaN.
tmdbid_list = movie_data["id"].tolist()
movieid_list = get_movieId(tmdbid_list, links_df)
movie_data["movieId"] = movieid_list
movie_data.head()

Unnamed: 0,id,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,...,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,movieId
0,862,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,8844,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.0
2,15602,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.0
3,31357,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,4.0
4,11862,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0


In [39]:
# Number of non-null ids before rows without a movieId are removed.
movie_data["id"].count()

43024

In [40]:
# Remove rows whose movieId is NaN (a NaN never satisfies "> 0").
movie_data = movie_data[movie_data["movieId"] > 0]
movie_data["id"].count()

42962

#### 95%下限値データの読み込み

In [41]:
# Load the score table, keeping only the join key and the score column.
scores_df = pd.read_csv(scores_URL)[["movieId", "95%_least_plausible"]]
scores_df.head()

Unnamed: 0,movieId,95%_least_plausible
0,318.0,4.424716
1,159817.0,4.409009
2,858.0,4.333404
3,50.0,4.294587
4,170705.0,4.261164


#### metadataとの結合

In [42]:
# Join the genre dummies with the scores on movieId (pd.merge defaults to an
# inner join, so movies without a score are dropped), then remove the two id
# columns so only the genre columns and the score remain.
movie_data = pd.merge(movie_data, scores_df, on="movieId").drop(
    ["id", "movieId"], axis=1
)
movie_data.head()

Unnamed: 0,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,...,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,95%_least_plausible
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.882039
1,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.226958
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.161669
3,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2.841556
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.066052


## 4.ジャンルと評価の関係の学習（重回帰）

In [99]:
# Utility for splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# Linear models
from sklearn import linear_model

# Model instance
l_model = linear_model.LinearRegression()
 
# Explanatory variables: every column except the target "95%_least_plausible"
X = movie_data.drop("95%_least_plausible", axis=1)

# Target variable
Y = movie_data["95%_least_plausible"]

# Split into training and test data (half each).
# Fixed-seed variant, kept for reference:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,random_state=0)
# No random_state is passed here, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5)

# Fit the model and print the score (coefficient of determination) on both sets
clf = l_model.fit(X_train,y_train)
print("train:",clf.__class__.__name__ ,clf.score(X_train,y_train))
print("test:",clf.__class__.__name__ , clf.score(X_test,y_test))
 
# Partial regression coefficients, sorted in ascending order
print(pd.DataFrame({"Name":X.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# Intercept
print(clf.intercept_)

train: LinearRegression 0.0817413526117
test: LinearRegression 0.081544473187
    Coefficients             Name
19     -0.169084         TV Movie
15     -0.154618          Foreign
10     -0.120896           Horror
7      -0.014082           Action
2      -0.011531           Family
12      0.053433  Science Fiction
18      0.074654          Western
4       0.086041          Fantasy
13      0.093748          Mystery
5       0.099179          Romance
14      0.099966              War
9       0.102961         Thriller
16      0.108870            Music
1       0.116299           Comedy
8       0.122217            Crime
3       0.131046        Adventure
11      0.176372          History
0       0.190762        Animation
17      0.197610      Documentary
6       0.233099            Drama
2.32031329131


決定係数は低いが、だいたい各ジャンルの重みは一貫している。

- 第1グループ Drama
- 第2グループ Documentary, History, Animation
- 第3グループ Comedy, Crime, Fantasy, Adventure, Music, Thriller, War, Romance, Mystery, Western, Science Fiction 
- 第4グループ Family, Action
- 第5グループ Horror, Foreign, TV Movie

こんな感じ