In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
converters = {"director": literal_eval, "author": literal_eval, "actor": literal_eval, "genre": literal_eval}
movies = pd.read_csv("./data/movies_actors_df.csv", index_col=0, converters=converters)

In [3]:
# drop irrelevant columns
movies.drop(columns=["url", "image", "datePublished", "duration", "description", "language"], inplace=True, errors="ignore")
movies.head()

Unnamed: 0,id,name,year,genre,ratingCount,bestRating,worstRating,ratingValue,directorMaxEffect,actorMaxEffect,writerMaxEffect,directorSumEffect,actorSumEffect,writerSumEffect
0,10344754,毒战 毒戰,2012,"[剧情, 动作, 犯罪]",210072,10,2,7.5,34235,27592,5441,34235,107664,7410
1,10355621,粉红女郎之爱人快跑,2013,"[喜剧, 爱情]",1826,10,2,3.2,65,3861,65,65,13658,65
2,10355633,绝命藏宝图,2012,"[动作, 悬疑]",78,10,2,3.2,43,43,43,43,69,43
4,10430281,七个隆咚锵咚锵,2012,"[喜剧, 爱情]",506,10,2,4.9,115,161,115,115,340,115
5,10437802,武当少年 武當少年,2010,"[剧情, 儿童]",68,10,2,6.1,2,192,2,2,233,2


In [4]:
# cf 02a summary statistics
genres = {'传记', '儿童', '冒险', '剧情', '动作', '动画', '历史', '古装', '喜剧', '奇幻', '家庭', '恐怖', '悬疑', '惊悚', '战争', '歌舞', '武侠', '灾难',
          '爱情', '犯罪', '科幻', '西部', '运动', '音乐'}

In [5]:
# build genre dummies
for genre in genres:
    dummies = []
    for g in movies.genre:
        dummies.append(1 if genre in g else 0)
    movies.insert(len(movies.columns), f"d{genre}", dummies)

movies.head()

Unnamed: 0,id,name,year,genre,ratingCount,bestRating,worstRating,ratingValue,directorMaxEffect,actorMaxEffect,...,d历史,d西部,d动画,d爱情,d犯罪,d动作,d冒险,d悬疑,d剧情,d恐怖
0,10344754,毒战 毒戰,2012,"[剧情, 动作, 犯罪]",210072,10,2,7.5,34235,27592,...,0,0,0,0,1,1,0,0,1,0
1,10355621,粉红女郎之爱人快跑,2013,"[喜剧, 爱情]",1826,10,2,3.2,65,3861,...,0,0,0,1,0,0,0,0,0,0
2,10355633,绝命藏宝图,2012,"[动作, 悬疑]",78,10,2,3.2,43,43,...,0,0,0,0,0,1,0,1,0,0
4,10430281,七个隆咚锵咚锵,2012,"[喜剧, 爱情]",506,10,2,4.9,115,161,...,0,0,0,1,0,0,0,0,0,0
5,10437802,武当少年 武當少年,2010,"[剧情, 儿童]",68,10,2,6.1,2,192,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# build year dummies
# careful: dummy variable trap
for year in range(2011, 2023):
    dummies = []
    for y in movies.year:
        dummies.append(1 if year == y else 0)
    movies.insert(len(movies.columns), f"y{year}", dummies)

movies.head()

Unnamed: 0,id,name,year,genre,ratingCount,bestRating,worstRating,ratingValue,directorMaxEffect,actorMaxEffect,...,y2013,y2014,y2015,y2016,y2017,y2018,y2019,y2020,y2021,y2022
0,10344754,毒战 毒戰,2012,"[剧情, 动作, 犯罪]",210072,10,2,7.5,34235,27592,...,0,0,0,0,0,0,0,0,0,0
1,10355621,粉红女郎之爱人快跑,2013,"[喜剧, 爱情]",1826,10,2,3.2,65,3861,...,1,0,0,0,0,0,0,0,0,0
2,10355633,绝命藏宝图,2012,"[动作, 悬疑]",78,10,2,3.2,43,43,...,0,0,0,0,0,0,0,0,0,0
4,10430281,七个隆咚锵咚锵,2012,"[喜剧, 爱情]",506,10,2,4.9,115,161,...,0,0,0,0,0,0,0,0,0,0
5,10437802,武当少年 武當少年,2010,"[剧情, 儿童]",68,10,2,6.1,2,192,...,0,0,0,0,0,0,0,0,0,0


In [7]:
import statsmodels.api as sm

In [8]:
y = movies.ratingValue
X = movies.iloc[:,11:]
# add intercept
X.insert(0, "intercept", 1)
X.head()

Unnamed: 0,intercept,directorSumEffect,actorSumEffect,writerSumEffect,d运动,d家庭,d喜剧,d古装,d科幻,d武侠,...,y2013,y2014,y2015,y2016,y2017,y2018,y2019,y2020,y2021,y2022
0,1,34235,107664,7410,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,65,13658,65,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,43,69,43,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,115,340,115,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,2,233,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            ratingValue   R-squared:                       0.291
Model:                            OLS   Adj. R-squared:                  0.281
Method:                 Least Squares   F-statistic:                     31.07
Date:                Mon, 27 Jun 2022   Prob (F-statistic):          6.92e-189
Time:                        18:03:27   Log-Likelihood:                -5001.4
No. Observations:                2996   AIC:                         1.008e+04
Df Residuals:                    2956   BIC:                         1.032e+04
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
intercept             5.4241      0.12

In [None]:
# is it possible to visualise multicollinearity?