-**[説明変数をジャンルとその他もろもろで重回帰分析](#説明変数をジャンルとその他もろもろで重回帰分析)**  
<br><br>
-**[評価回数、rating、興行収入、予算などの間の相関係数](#評価回数、rating、興行収入、予算などの間の相関係数)**


In [1]:
import numpy as np
import pandas as pd
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
metadata_URL = "./the-movies-dataset/movies_metadata.csv"
genres_URL = "./the-movies-dataset/genres.csv"
scores_URL = "./ml-latest/95%_least_plausible.csv"
links_URL = "./the-movies-dataset/links.csv"
links_small_URL = "./the-movies-dataset/links_small.csv"
credits_URL = "./the-movies-dataset/credits.csv"
directors_URL = "./the-movies-dataset/directors.csv"
actors_URL = "./the-movies-dataset/actors.csv"
jobs_URL = "./the-movies-dataset/jobs.csv"

In [3]:
# データフレーム中のstringをdict型のリストに変換する
def make_list_from_str(string):
    if isinstance(string, str):
        result_list = eval(string)
    else:
        result_list = np.nan
    return result_list

def extract_ids(genres_list):
    id_list = []
    for genre in genres_list:
        id_list.append(genre["id"])
    return id_list

def extract_names(genres_list):
    id_list = []
    for genre in genres_list:
        id_list.append(genre["name"])
    return id_list

In [10]:
metadata_df = pd.read_csv(metadata_URL)[["id", "genres"]]
metadata_df["genres"] = metadata_df["genres"].map(make_list_from_str)
metadata_df["genres"] = metadata_df["genres"].map(extract_names)
metadata_df["genre_count"] = metadata_df["genres"].map(lambda x: len(x))
metadata_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,genres,genre_count
0,862,"[Animation, Comedy, Family]",3
1,8844,"[Adventure, Fantasy, Family]",3
2,15602,"[Romance, Comedy]",2
3,31357,"[Comedy, Drama, Romance]",3
4,11862,[Comedy],1


In [13]:
##すべてのジャンルのidリスト、nameリストを作成

genres_df = pd.read_csv(genres_URL)
all_genre_id = genres_df["id"].tolist()
all_genre_name = genres_df["name"].tolist()
print("ids: ", all_genre_id)
print("names: ", all_genre_name)

ids:  [16, 35, 10751, 12, 14, 10749, 18, 28, 80, 53, 27, 36, 878, 9648, 10752, 10769, 10402, 99, 37, 10770]
names:  ['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']


In [14]:
metadata_df.groupby("genre_count")["id"].count()

genre_count
0     2442
1    14559
2    14480
3     9586
4     3380
5      835
6      157
7       24
8        3
Name: id, dtype: int64

In [15]:
metadata_df = metadata_df[metadata_df["genre_count"] > 0]
metadata_df.groupby("genre_count")["id"].count()

genre_count
1    14559
2    14480
3     9586
4     3380
5      835
6      157
7       24
8        3
Name: id, dtype: int64

In [16]:
genre_dummies = pd.get_dummies(metadata_df["genres"].apply(pd.Series).stack()).sum(level=0)
genre_dummies.head()

Unnamed: 0,Action,Adventure,Animation,Aniplex,BROSTA TV,Carousel Productions,Comedy,Crime,Documentary,Drama,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
genre_dummies = genre_dummies[all_genre_name]
genre_dummies.head()

Unnamed: 0,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
movie_data = pd.concat([metadata_df[["id"]] , genre_dummies], axis=1)
movie_data

Unnamed: 0,id,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,...,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie
0,862,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8844,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15602,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31357,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11862,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,949,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6,11860,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,45325,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8,9091,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,710,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
links_df = pd.read_csv(links_URL)
links_df = links_df.dropna()
links_df["tmdbId"] = links_df["tmdbId"].map(int)
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [21]:
def get_movieId(tmdbId_list, links_df):
    movieid_list = []
    for tmdbId in tmdbId_list:
        if not tmdbId.isdigit():
            movieid_list.append(np.nan)
        else:
            movieId = links_df[links_df["tmdbId"] == int(tmdbId)]["movieId"].tolist()
            if len(movieId) == 1:
                movieid_list.append(movieId[0])
            else:
                movieid_list.append(np.nan)
    return movieid_list

In [22]:
tmdbid_list = movie_data["id"].tolist()
movieid_list = get_movieId(tmdbid_list, links_df)
movie_data["movieId"] = movieid_list
movie_data.head()

Unnamed: 0,id,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,...,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,movieId
0,862,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,8844,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.0
2,15602,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.0
3,31357,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,4.0
4,11862,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0


In [23]:
# NaNを削除
movie_data = movie_data[movie_data["movieId"] > 0]
movie_data["id"].count()

42962

In [24]:
scores_df = pd.read_csv(scores_URL)[["movieId", "95%_least_plausible"]]
scores_df.head()

Unnamed: 0,movieId,95%_least_plausible
0,318.0,4.424716
1,159817.0,4.409009
2,858.0,4.333404
3,50.0,4.294587
4,170705.0,4.261164


In [25]:
movie_data =pd.merge(movie_data, scores_df, on="movieId")
movie_data.drop(["id","movieId"] , axis=1, inplace=True)
movie_data.head()

Unnamed: 0,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,...,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,95%_least_plausible
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.882039
1,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.226958
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.161669
3,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2.841556
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.066052


## 説明変数をジャンルとその他もろもろで重回帰分析

In [26]:
# データの分割（学習データとテストデータ分ける）
from sklearn.model_selection import train_test_split

# モデル
from sklearn import linear_model

# モデルのインスタンス
l_model = linear_model.LinearRegression()
 
# 説明変数に "price" 以外を利用
X = movie_data.drop("95%_least_plausible", axis=1)

# 目的変数
Y = movie_data["95%_least_plausible"]

# 学習データとテストデータ分ける
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5)

# モデルのあてはめ
clf = l_model.fit(X_train,y_train)
print("train:",clf.__class__.__name__ ,clf.score(X_train,y_train))
print("test:",clf.__class__.__name__ , clf.score(X_test,y_test))
 
# 偏回帰係数
print(pd.DataFrame({"Name":X.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# 切片 
print(clf.intercept_)

train: LinearRegression 0.0827957769437
test: LinearRegression 0.0801041123114
    Coefficients             Name
19     -0.146525         TV Movie
15     -0.145967          Foreign
10     -0.133427           Horror
7      -0.015848           Action
2      -0.013090           Family
18      0.052045          Western
12      0.071038  Science Fiction
9       0.078430         Thriller
5       0.086155          Romance
14      0.091610              War
4       0.101197          Fantasy
13      0.108459          Mystery
16      0.116138            Music
3       0.122141        Adventure
1       0.122150           Comedy
8       0.128492            Crime
0       0.182488        Animation
17      0.198859      Documentary
11      0.209293          History
6       0.234537            Drama
2.32122058936


***

ここまではがいあのを完全にコピー

In [68]:
metadata = pd.read_csv(metadata_URL)[['budget', 'id', 'revenue', 'runtime']]

  interactivity=interactivity, compiler=compiler, result=result)


In [69]:
metadata.budget[19730] = 0
metadata.budget[29503] = 0
metadata.budget[35587] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [70]:
metadata.budget = pd.to_numeric(metadata.budget)

In [71]:
metadata = metadata[metadata['budget']>0]
metadata = metadata[metadata['revenue']>0]
metadata = metadata[metadata['runtime']>0]

In [72]:
tmdbid_list = metadata["id"].tolist()
movieid_list = get_movieId(tmdbid_list, links_df)
metadata["movieId"] = movieid_list
metadata.head()

Unnamed: 0,budget,id,revenue,runtime,movieId
0,30000000,862,373554033.0,81.0,1.0
1,65000000,8844,262797249.0,104.0,2.0
3,16000000,31357,81452156.0,127.0,4.0
5,60000000,949,187436818.0,170.0,6.0
8,35000000,9091,64350171.0,106.0,9.0


In [73]:
metadata =  metadata[metadata["movieId"] > 0]
metadata["id"].count()

5357

In [74]:
metadata =pd.merge(metadata, scores_df, on="movieId")
metadata.drop(["id","movieId"] , axis=1, inplace=True)
metadata.head()

Unnamed: 0,budget,revenue,runtime,95%_least_plausible
0,30000000,373554033.0,81.0,3.882039
1,65000000,262797249.0,104.0,3.226958
2,16000000,81452156.0,127.0,2.841556
3,60000000,187436818.0,170.0,3.83265
4,35000000,64350171.0,106.0,2.983756


In [75]:
# データの分割（学習データとテストデータ分ける）
from sklearn.model_selection import train_test_split

# モデル
from sklearn import linear_model

# モデルのインスタンス
l_model = linear_model.LinearRegression()
 
# 説明変数に "price" 以外を利用
X = metadata.drop("95%_least_plausible", axis=1)

# 目的変数
Y = metadata["95%_least_plausible"]

# 学習データとテストデータ分ける
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5)

# モデルのあてはめ
clf = l_model.fit(X_train,y_train)
print("train:",clf.__class__.__name__ ,clf.score(X_train,y_train))
print("test:",clf.__class__.__name__ , clf.score(X_test,y_test))
 
# 偏回帰係数
print(pd.DataFrame({"Name":X.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# 切片 
print(clf.intercept_)

train: LinearRegression 0.11455269848
test: LinearRegression 0.106363238076
   Coefficients     Name
0 -2.596164e-09   budget
1  8.534381e-10  revenue
2  6.759755e-03  runtime
2.30887040763


In [77]:
metadata_df = pd.read_csv(metadata_URL)[["id", "genres"]]
metadata_df["genres"] = metadata_df["genres"].map(make_list_from_str)
metadata_df["genres"] = metadata_df["genres"].map(extract_names)
metadata_df["genre_count"] = metadata_df["genres"].map(lambda x: len(x))
genres_df = pd.read_csv(genres_URL)
all_genre_id = genres_df["id"].tolist()
all_genre_name = genres_df["name"].tolist()
metadata_df = metadata_df[metadata_df["genre_count"] > 0]
genre_dummies = pd.get_dummies(metadata_df["genres"].apply(pd.Series).stack()).sum(level=0)
genre_dummies = genre_dummies[all_genre_name]
movie_data = pd.concat([metadata_df[["id"]] , genre_dummies], axis=1)
tmdbid_list = movie_data["id"].tolist()
movieid_list = get_movieId(tmdbid_list, links_df)
movie_data["movieId"] = movieid_list
movie_data = movie_data[movie_data["movieId"] > 0]
movie_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,...,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,movieId
0,862,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,8844,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.0
2,15602,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.0
3,31357,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,4.0
4,11862,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0


In [79]:
metadata = pd.read_csv(metadata_URL)[['budget', 'id', 'revenue', 'runtime']]
metadata.budget[19730] = 0
metadata.budget[29503] = 0
metadata.budget[35587] = 0
metadata.budget = pd.to_numeric(metadata.budget)
metadata = metadata[metadata['budget']>0]
metadata = metadata[metadata['revenue']>0]
metadata = metadata[metadata['runtime']>0]
tmdbid_list = metadata["id"].tolist()
movieid_list = get_movieId(tmdbid_list, links_df)
metadata["movieId"] = movieid_list
metadata.head()
metadata =  metadata[metadata["movieId"] > 0]
metadata.head()

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,budget,id,revenue,runtime,movieId
0,30000000,862,373554033.0,81.0,1.0
1,65000000,8844,262797249.0,104.0,2.0
3,16000000,31357,81452156.0,127.0,4.0
5,60000000,949,187436818.0,170.0,6.0
8,35000000,9091,64350171.0,106.0,9.0


In [84]:
merged = pd.merge(movie_data, metadata, on='movieId')
merged = pd.merge(merged, scores_df, on='movieId')
merged.drop(["id_x","id_y", "movieId"] , axis=1, inplace=True)
merged

Unnamed: 0,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,...,War,Foreign,Music,Documentary,Western,TV Movie,budget,revenue,runtime,95%_least_plausible
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,30000000,373554033.0,81.0,3.882039
1,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,65000000,262797249.0,104.0,3.226958
2,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,16000000,81452156.0,127.0,2.841556
3,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,60000000,187436818.0,170.0,3.832650
4,0,0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,35000000,64350171.0,106.0,2.983756
5,0,0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,58000000,352194034.0,130.0,3.423798
6,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,62000000,107879496.0,106.0,3.649377
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,44000000,13681765.0,192.0,3.410690
8,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,98000000,10017322.0,119.0,2.694739
9,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,52000000,116112375.0,178.0,3.785011


### 目的変数はratingスコア、説明変数は予算、興行収入、上映時間をくわえた

In [85]:
# データの分割（学習データとテストデータ分ける）
from sklearn.model_selection import train_test_split

# モデル
from sklearn import linear_model

# モデルのインスタンス
l_model = linear_model.LinearRegression()
 
# 説明変数に "price" 以外を利用
X = merged.drop("95%_least_plausible", axis=1)

# 目的変数
Y = merged["95%_least_plausible"]

# 学習データとテストデータ分ける
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5)

# モデルのあてはめ
clf = l_model.fit(X_train,y_train)
print("train:",clf.__class__.__name__ ,clf.score(X_train,y_train))
print("test:",clf.__class__.__name__ , clf.score(X_test,y_test))
 
# 偏回帰係数
print(pd.DataFrame({"Name":X.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# 切片 
print(clf.intercept_)

train: LinearRegression 0.201998219022
test: LinearRegression 0.214548445573
    Coefficients             Name
15 -7.938138e-01          Foreign
10 -2.127594e-01           Horror
7  -1.651048e-01           Action
2  -1.607588e-01           Family
19 -8.062623e-02         TV Movie
1  -5.442705e-02           Comedy
16 -4.687441e-02            Music
5  -3.141176e-02          Romance
4  -1.622615e-02          Fantasy
9  -6.249431e-03         Thriller
20 -2.098975e-09           budget
21  9.734993e-10          revenue
3   1.284215e-03        Adventure
22  4.774270e-03          runtime
14  3.810700e-02              War
11  4.119863e-02          History
12  8.940953e-02  Science Fiction
13  1.082160e-01          Mystery
8   1.266655e-01            Crime
6   1.575125e-01            Drama
0   1.780475e-01        Animation
18  2.646165e-01          Western
17  3.169014e-01      Documentary
2.50632836292


### 目的変数を興行収入にして、重回帰分析


In [86]:
# データの分割（学習データとテストデータ分ける）
from sklearn.model_selection import train_test_split

# モデル
from sklearn import linear_model

# モデルのインスタンス
l_model = linear_model.LinearRegression()
 
# 説明変数に "price" 以外を利用
X = merged.drop(["95%_least_plausible", "revenue"], axis=1)

# 目的変数
Y = merged["revenue"]

# 学習データとテストデータ分ける
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5)

# モデルのあてはめ
clf = l_model.fit(X_train,y_train)
print("train:",clf.__class__.__name__ ,clf.score(X_train,y_train))
print("test:",clf.__class__.__name__ , clf.score(X_test,y_test))
 
# 偏回帰係数
print(pd.DataFrame({"Name":X.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# 切片 
print(clf.intercept_)

train: LinearRegression 0.547097222639
test: LinearRegression 0.55171525053
    Coefficients             Name
11 -4.186356e+07          History
18 -2.781296e+07          Western
15 -2.006806e+07          Foreign
6  -1.541197e+07            Drama
7  -1.207149e+07           Action
16 -1.116661e+07            Music
14 -1.083537e+07              War
13 -8.647997e+06          Mystery
12 -7.506687e+06  Science Fiction
9  -6.833033e+06         Thriller
8  -4.848498e+06            Crime
1  -4.043272e+06           Comedy
20  2.884988e+00           budget
21  7.713494e+05          runtime
4   7.072818e+06          Fantasy
5   1.039719e+07          Romance
17  1.231812e+07      Documentary
2   1.358423e+07           Family
10  1.405370e+07           Horror
0   2.561281e+07        Animation
3   2.687958e+07        Adventure
19  2.945461e+07         TV Movie
-74490573.7353


さきほどの結果と比較するとおもしろい。TV Movie, Horror, Familyなどはratingには負に効いていたのに、興行収入では正に効いている。逆にWestern, Dramaなどは反対の挙動を示してる。

## 監督、俳優のダミー変数づくり

In [4]:
actors = pd.read_csv(actors_URL)
actors.head()

Unnamed: 0.1,Unnamed: 0,cast_id,character,credit_id,gender,id,name,order,profile_path,movies,appearance
0,0,14,Woody (voice),52fe4284c3a36847f8024f95,2,31,Tom Hanks,0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,"[862, 568, 13, 9800, 858, 32562, 9591, 857, 11...",72
1,1,15,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2,12898,Tim Allen,1,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg,"[862, 11395, 9446, 10371, 863, 926, 2185, 1231...",29
2,2,16,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2,7167,Don Rickles,2,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg,"[862, 524, 18937, 14577, 18784, 863, 11589, 39...",29
3,3,17,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2,12899,Jim Varney,3,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg,"[862, 11041, 32302, 863, 18935, 10865, 26386, ...",20
4,4,18,Rex (voice),52fe4284c3a36847f8024fa5,2,12900,Wallace Shawn,4,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg,"[862, 9603, 1775, 15789, 32636, 23333, 19042, ...",75


In [10]:
len(actors)

206158

In [5]:
actors = actors[actors.appearance>10]

In [12]:
len(actors)

9144

In [16]:
np.array(actors.id)

array([    31,  12898,   7167, ..., 148052, 141695, 586625])

In [53]:
for i in range(len(pre_dummy)):
    if type(pre_dummy[i]) == np.float64:
        pre_dummy[i] = np.nan
    else:
        for j in range(len(pre_dummy[i])):
            if pre_dummy[i][j] in list(actors.id):
                continue
            else:
                pre_dummy[i][j] = np.nan

In [54]:
pre_dummy = pre_dummy.dropna()
cast_dummies = pd.get_dummies(pre_dummy).sum(level=0)

In [59]:
cast_dummies

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,13.0,14.0,18.0,19.0,...,1468755.0,1469587.0,1478372.0,1514445.0,1573982.0,1583011.0,1584544.0,1619660.0,1706855.0,1787560.0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# データの分割（学習データとテストデータ分ける）
from sklearn.model_selection import train_test_split

# モデル
from sklearn import linear_model

# モデルのインスタンス
l_model = linear_model.LinearRegression()
 
# 説明変数に "price" 以外を利用
X = merged.drop(["95%_least_plausible", "revenue"], axis=1)

# 目的変数
Y = merged["revenue"]

# 学習データとテストデータ分ける
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5)

# モデルのあてはめ
clf = l_model.fit(X_train,y_train)
print("train:",clf.__class__.__name__ ,clf.score(X_train,y_train))
print("test:",clf.__class__.__name__ , clf.score(X_test,y_test))
 
# 偏回帰係数
print(pd.DataFrame({"Name":X.columns,
                    "Coefficients":clf.coef_}).sort_values(by='Coefficients') )
 
# 切片 
print(clf.intercept_)

***

In [6]:
def make_list_from_str(string):
    if isinstance(string, str):
        result_list = eval(string)
    else:
        result_list = np.nan
    return result_list

#idだけを取得、リストの作成
def extract_ids(genres_list):
    id_list = []
    for genre in genres_list:
        id_list.append(genre["id"])
    return id_list

#nameだけを取得、リストの作成
def extract_names(genres_list):
    id_list = []
    for genre in genres_list:
        id_list.append(genre["name"])
    return id_list

In [6]:
credits = pd.read_csv(credits_URL)

credits['cast_list'] = credits.cast.map(make_list_from_str)
credits['crew_list'] = credits.crew.map(make_list_from_str)

credits['cast_id'] = credits.cast_list.map(extract_ids)
credits['cast_name'] = credits.cast_list.map(extract_names)

credits['crew_id'] = credits.crew_list.map(extract_ids)
credits['crew_name'] = credits.crew_list.map(extract_names)

In [60]:
credits

Unnamed: 0,cast,crew,id,cast_list,crew_list,cast_id,cast_name,crew_id,crew_name
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,"[{'credit_id': '52fe4284c3a36847f8024f95', 'ge...","[{'credit_id': '52fe4284c3a36847f8024f49', 'pr...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[7879, 12891, 7, 12892, 12893, 12894, 12895, 1...","[John Lasseter, Joss Whedon, Andrew Stanton, J..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,"[{'credit_id': '52fe44bfc3a36847f80a7c73', 'ge...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'pr...","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[511, 876, 1729, 4945, 4951, 4952, 8023, 9967,...","[Larry J. Franco, Jonathan Hensleigh, James Ho..."
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,"[{'credit_id': '52fe466a9251416c75077a8d', 'ge...","[{'credit_id': '52fe466a9251416c75077a89', 'pr...","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[26502, 16837, 16837, 1551320]","[Howard Deutch, Mark Steven Johnson, Mark Stev..."
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,"[{'credit_id': '52fe44779251416c91011aad', 'ge...","[{'credit_id': '52fe44779251416c91011acb', 'pr...","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[Whitney Houston, Angela Bassett, Loretta Devi...","[2178, 5144, 5144, 21968, 70592, 111118, 11111...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez..."
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,"[{'credit_id': '52fe44959251416c75039eb9', 'ge...","[{'credit_id': '52fe44959251416c75039ed7', 'pr...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[37, 5506, 17698, 17698, 26160, 56106, 68755]","[Alan Silvestri, Elliot Davis, Nancy Meyers, N..."
5,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949,"[{'credit_id': '52fe4292c3a36847f80291f5', 'ge...","[{'credit_id': '52fe4292c3a36847f802916d', 'pr...","[1158, 380, 5576, 10127, 3197, 6200, 15851, 15...","[Al Pacino, Robert De Niro, Val Kilmer, Jon Vo...","[638, 638, 1254, 638, 5581, 11099, 15840, 1584...","[Michael Mann, Michael Mann, Art Linson, Micha..."
6,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...",11860,"[{'credit_id': '52fe44959251416c75039d97', 'ge...","[{'credit_id': '52fe44959251416c75039da9', 'pr...","[3, 15887, 17141, 4301, 12957, 8937, 16554, 34...","[Harrison Ford, Julia Ormond, Greg Kinnear, An...","[2226, 70846, 2226, 491, 10640, 2997, 5490, 17...","[Sydney Pollack, Barbara Benedek, Sydney Polla..."
7,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",45325,"[{'credit_id': '52fe46bdc3a36847f810f771', 'ge...","[{'credit_id': '52fe46bdc3a36847f810f797', 'pr...","[53283, 51214, 38581, 8316, 87007, 57448, 102313]","[Jonathan Taylor Thomas, Brad Renfro, Rachael ...","[2075, 7775, 18357, 72225]","[David Loughery, Stephen Sommers, Peter Hewitt..."
8,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...",9091,"[{'credit_id': '52fe44dbc3a36847f80ae0e3', 'ge...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'pr...","[15111, 6280, 8656, 10361, 12928, 79088]","[Jean-Claude Van Damme, Powers Boothe, Dorian ...","[37710, 53300, 56953, 56032, 56954, 53299, 450...","[Peter Hyams, Karen Elise Baldwin, Gene Quinta..."
9,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710,"[{'credit_id': '52fe426ec3a36847f801e10d', 'ge...","[{'credit_id': '52fe426ec3a36847f801e14b', 'pr...","[517, 48, 10695, 10696, 10671, 5309, 3757, 192...","[Pierce Brosnan, Sean Bean, Izabella Scorupco,...","[10702, 9856, 10704, 10705, 10666, 10493, 996,...","[Martin Campbell, Ian Fleming, Jeffrey Caine, ..."


In [62]:
credits[['id', 'cast_id']].head()

Unnamed: 0,id,cast_id
0,862,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11..."
1,8844,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ..."
2,15602,"[6837, 3151, 13567, 16757, 589, 16523, 7166]"
3,31357,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ..."
4,11862,"[67773, 3092, 519, 70696, 59222, 18793, 14592,..."


In [7]:
id_cast_id = credits[['id', 'cast_id']]

In [70]:
list(id_cast_id[id_cast_id.index==0].cast_id)[0]

[31,
 12898,
 7167,
 12899,
 12900,
 7907,
 8873,
 1116442,
 12901,
 12133,
 8655,
 12903,
 37221]

In [78]:
id_cast_id[id_cast_id.id==8844].cast_id

1    [2157, 8537, 205, 145151, 5149, 10739, 58563, ...
Name: cast_id, dtype: object

In [93]:
id_cast_id[id_cast_id.id==i].cast_id

0    [31, 12898, 7167, 12899, 12900, 7907, 8873, na...
Name: cast_id, dtype: object

***

実行中

In [8]:
for i in np.array(id_cast_id.id):
    cast_list = list(id_cast_id[id_cast_id.id==i].cast_id)[0]
    for j in range(len(cast_list)):
        if cast_list[j] in list(actors.id):
            continue
        else:
            cast_list[j] = np.nan
    id_cast_id[id_cast_id.id==i]['cast_id'] = str(cast_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
id_cast_id.cast_id = id_cast_id.cast_id.map(make_list_from_str)

In [15]:
id_cast_id = id_cast_id.dropna()
id_cast_id.head()

Unnamed: 0,id,cast_id


In [11]:
pre_dummy = id_cast_id.cast_id.apply(pd.Series).stack()

In [None]:
pre_dummy = pre_dummy.dropna()
cast_dummies = pd.get_dummies(pre_dummy).sum(level=0)

***

In [19]:
credits[credits.cast_name.isnull()]

Unnamed: 0,cast,crew,id,cast_list,crew_list,cast_id,cast_name,crew_id,crew_name


In [44]:
pre_dummy = credits["cast_id"].apply(pd.Series).stack()

In [19]:
pre_dummy

0      0          31.0
       1       12898.0
       2        7167.0
       3       12899.0
       4       12900.0
       5        7907.0
       6        8873.0
       7     1116442.0
       8       12901.0
       9       12133.0
       10       8655.0
       11      12903.0
       12      37221.0
1      0        2157.0
       1        8537.0
       2         205.0
       3      145151.0
       4        5149.0
       5       10739.0
       6       58563.0
       7        1276.0
       8       46530.0
       9       56523.0
       10      51551.0
       11      56522.0
       12    1000304.0
       13     188949.0
       14    1076551.0
       15    1480246.0
       16      25024.0
               ...    
45472  1      111636.0
       2     1204271.0
       3      278923.0
       4     1042953.0
       5      575774.0
       6     1184497.0
       7      462483.0
       8     1728582.0
       9     1485780.0
       10    1801183.0
45473  0       23764.0
       1        2059.0
       2   

In [41]:
type(pre_dummy)

pandas.core.series.Series

***

↓メモリが足りなくて処理できない

In [None]:
cast_dummies = pd.get_dummies(pre_dummy).sum(level=0)

In [None]:
cast_dummies.head()

***

### director, actorのリストを取得

In [22]:
ls

IMDB-Movie-Data.csv      [34mml-latest[m[m/               regression.ipynb
get_dummies.py           [34mml-latest-small[m[m/         [34mthe-movies-dataset[m[m/
[31mimdb.csv[m[m*                movie_analysis.ipynb     [34mtmdb-5000-movie-dataset[m[m/
[34mml-100k[m[m/                 movie_cast.gml           untitled.txt
[34mml-1m[m[m/                   network.ipynb
[34mml-20m[m[m/                  notebook.ipynb


In [23]:
cd the-movies-dataset/

/Users/yoshidatomoya/Documents/GCI/movie_analysis/the-movies-dataset


In [24]:
ls

actors.csv           jobs.csv             [31mmovies_metadata.csv[m[m*
[31mcredits.csv[m[m*         [31mkeywords.csv[m[m*        [31mratings_small.csv[m[m*
directors.csv        [31mlinks.csv[m[m*
[31mgenres.csv[m[m*          [31mlinks_small.csv[m[m*


In [25]:
actors = pd.read_csv("actors.csv")
directors = pd.read_csv("directors.csv")

In [69]:
actors.head()

Unnamed: 0.1,Unnamed: 0,id,name,appearance
0,0,31,Tom Hanks,72
1,1,12898,Tim Allen,29
2,2,7167,Don Rickles,29
3,3,12899,Jim Varney,20
4,4,12900,Wallace Shawn,75


In [26]:
actors[actors.appearance>=10].sort_values('appearance', ascending=False)

Unnamed: 0.1,Unnamed: 0,id,name,appearance
9530,9530,121323,Bess Flowers,241
8401,8401,113,Christopher Lee,148
11575,11575,4165,John Wayne,125
2415,2415,2231,Samuel L. Jackson,123
15436,15436,3895,Michael Caine,110
5223,5223,16927,Gérard Depardieu,110
11044,11044,8516,John Carradine,109
3110,3110,55636,Donald Sutherland,109
1726,1726,18897,Jackie Chan,108
260,260,15831,Frank Welker,107


10回以上映画に出演している俳優は10450人

In [70]:
directors.head()

Unnamed: 0.1,Unnamed: 0,name,id,appearance
0,0,John Lasseter,7879,10
1,1,Joe Johnston,4945,10
2,2,Howard Deutch,26502,11
3,3,Forest Whitaker,2178,4
4,4,Charles Shyer,56106,7


In [75]:
directors[directors.appearance>=5].sort_values('appearance', ascending=False)

Unnamed: 0.1,Unnamed: 0,name,id,appearance
901,901,John Ford,8500,68
726,726,Michael Curtiz,4109,65
523,523,Werner Herzog,6818,55
722,722,Alfred Hitchcock,2636,53
4262,4262,Georges Méliès,11523,51
567,567,Jean-Luc Godard,3776,50
57,57,Woody Allen,1243,49
412,412,Sidney Lumet,39996,46
914,914,Charlie Chaplin,13848,44
746,746,William A. Wellman,14643,43


5回以上映画を作っている監督は2283人

## 評価回数、rating、興行収入、予算などの間の相関係数

In [8]:
ls

IMDB-Movie-Data.csv      [34mml-latest[m[m/               notebook.ipynb
[31mimdb.csv[m[m*                [34mml-latest-small[m[m/         regression.ipynb
[34mml-100k[m[m/                 movie_analysis.ipynb     [34mthe-movies-dataset[m[m/
[34mml-1m[m[m/                   movie_cast.gml           [34mtmdb-5000-movie-dataset[m[m/
[34mml-20m[m[m/                  network.ipynb            untitled.txt


In [9]:
cd ml-latest

/Users/yoshidatomoya/Documents/GCI/movie_analysis/ml-latest


In [10]:
ls

95%_least_plausible.csv
MovieLens.ipynb
README.txt
genome-scores.csv
genome-tags.csv
links.csv
movies.csv
posterior_distribution_of_352    Forrest Gump (1994)?Name: title, dtype: object.png
posterior_distribution_of_45841    Satan Triumphant (1917)?Name: title, dtype: object.png
posterior_distributions_exapmle.png
ratings.csv
tags.csv


In [11]:
genome_score = pd.read_csv('genome-scores.csv')
links = pd.read_csv('links.csv')
ratings = pd.read_csv('ratings.csv')
genome_tags = pd.read_csv('genome-tags.csv')
movies = pd.read_csv("movies.csv")
tags = pd.read_csv('tags.csv')

In [18]:
rating_mean = ratings.groupby(['movieId'])['rating'].mean()
rating_num = ratings.groupby(['movieId']).userId.count()

In [20]:
cd ..

/Users/yoshidatomoya/Documents/GCI/movie_analysis


In [21]:
metadata = pd.read_csv(metadata_URL)

  interactivity=interactivity, compiler=compiler, result=result)


In [49]:
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [35]:
metadata[['budget', 'id', 'imdb_id', 'revenue', 'vote_average', 'vote_count']]

Unnamed: 0,budget,id,imdb_id,revenue,vote_average,vote_count
0,30000000.0,862,tt0114709,373554033.0,7.7,5415.0
1,65000000.0,8844,tt0113497,262797249.0,6.9,2413.0
2,0.0,15602,tt0113228,0.0,6.5,92.0
3,16000000.0,31357,tt0114885,81452156.0,6.1,34.0
4,0.0,11862,tt0113041,76578911.0,5.7,173.0
5,60000000.0,949,tt0113277,187436818.0,7.7,1886.0
6,58000000.0,11860,tt0114319,0.0,6.2,141.0
7,0.0,45325,tt0112302,0.0,5.4,45.0
8,35000000.0,9091,tt0114576,64350171.0,5.5,174.0
9,58000000.0,710,tt0113189,352194034.0,6.6,1194.0


In [28]:
np.array(rating_mean)

array([ 3.8881575 ,  3.23695318,  3.17555011, ...,  5.        ,
        1.        ,  3.        ])

In [29]:
np.array(rating_num)

array([66008, 26060, 15497, ...,     1,     1,     1])

ratingの平均とratingされた数の相関は

In [31]:
import scipy.stats as sps

sps.pearsonr(np.array(rating_mean), np.array(rating_num))

(0.12124607340899146, 2.5816084465206952e-147)

0.12でほとんどない

In [33]:
metadata.budget = metadata.budget.convert_objects(convert_numeric=True)

  if __name__ == '__main__':


In [39]:
budget_revenue = metadata[['budget', 'revenue']][metadata.budget!=0]
budget_revenue = budget_revenue[budget_revenue.revenue!=0]

In [40]:
budget_revenue

Unnamed: 0,budget,revenue
0,30000000.0,373554033.0
1,65000000.0,262797249.0
3,16000000.0,81452156.0
5,60000000.0,187436818.0
8,35000000.0,64350171.0
9,58000000.0,352194034.0
10,62000000.0,107879496.0
13,44000000.0,13681765.0
14,98000000.0,10017322.0
15,52000000.0,116112375.0


In [41]:
sps.pearsonr(budget_revenue.budget, budget_revenue.revenue)

(nan, 1.0)