In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Read the train / validation / test splits; all three CSVs use latin-1 encoding.
train_data, val_data, test_data = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        '../data/train_final.csv',
        '../data/val_final.csv',
        '../data/test_final.csv',
    )
)

In [3]:
# Standardise the numeric features in place. The centre and spread are computed
# on the training split only and then applied unchanged to all three splits.
for col in ['runtime','num_faces','hue','hue_sd','saturation','saturation_sd ','brightness','brightness_sd',
            'blue','blue_sd','green','green_sd','red','red_sd ']:
    center = train_data[col].mean()
    spread = train_data[col].std(ddof=0)  # population std, what np.std computes

    # A zero spread means the training column is constant: only centre it
    # (dividing by 1 leaves the centred values untouched).
    scale = spread if spread != 0 else 1

    for frame in (train_data, val_data, test_data):
        frame[col] = (frame[col] - center) / scale

In [5]:
# Columns to exclude from the feature matrix: every column whose name is a
# wordcloud stop word, plus the target ('imdb_score') and 'posterID'.
stopword_cols = set(STOPWORDS) & set(train_data.columns)
rem_cols = list(stopword_cols) + ['imdb_score', 'posterID']

In [9]:
# Feature matrices: each split with the `rem_cols` columns dropped.
train_X, val_X, test_X = (
    frame.drop(columns=rem_cols) for frame in (train_data, val_data, test_data)
)
# Targets: the IMDB score column of each split.
train_y, val_y, test_y = (
    frame['imdb_score'] for frame in (train_data, val_data, test_data)
)

In [10]:
# imp_cols = ['runtime', 'documentary', 'drama', 'horror', 'action', 'animation',
#        'hue_sd', 'saturation_sd ', 'hue', 'saturation', 'brightness_sd',
#        'green_sd', 'blue_sd', 'red_sd ', 'blue', 'green', 'red', 'short',
#        'brightness', 'thriller']
# train_X = train_data[imp_cols]
# val_X = val_data[imp_cols]
# test_X = test_data[imp_cols]
# train_y = train_data.imdb_score
# val_y = val_data.imdb_score
# test_y = test_data.imdb_score

In [11]:
# Inspect the training feature matrix (the training split without `rem_cols`).
train_X

Unnamed: 0,runtime,num_faces,brightness,saturation,hue,brightness_sd,saturation_sd,hue_sd,blue,blue_sd,...,subway,cities,excitement,monstrous,May,traps,traffic,heir,fearless,eliminate
0,0.783686,-0.216658,-0.211632,-0.832558,2.121743,1.853820,-0.387582,-0.339980,0.351619,1.771207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.044494,-0.216658,-1.197686,-0.371092,-0.279790,-0.896663,-0.498212,0.927495,-1.012683,-1.225915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.446279,-0.216658,1.803157,-0.938981,-0.561388,-1.358654,-0.300175,0.476504,1.576181,0.064789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.013821,-0.216658,0.653993,-0.312038,-0.672706,0.982971,0.531417,0.350392,0.271128,1.133443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.565941,-0.216658,-0.004048,0.907230,-0.093724,0.017742,0.753552,0.472705,-0.561222,-0.196671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13189,-0.504594,-0.216658,-0.038594,0.960138,-0.934452,-0.733416,-0.928683,-1.157100,-0.862697,-1.386202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13190,0.231566,-0.216658,-0.714362,-0.840712,0.235713,1.616463,0.784342,0.747825,-0.127652,1.566644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13191,0.078199,-0.216658,0.408528,-0.926015,-1.103991,-0.203606,-0.973200,-1.619233,0.503233,-0.174021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13192,-0.228534,-0.216658,-1.272881,-0.823072,0.742440,0.644669,-0.088083,-0.180132,-0.729585,0.670421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# reg = LinearRegression().fit(train_X, train_y)

# print('train r^2: ', reg.score(train_X, train_y))
# print('train mse: ', mean_squared_error(reg.predict(train_X), train_y))

# print('valid r^2: ', reg.score(val_X, val_y))
# print('valid mse: ', mean_squared_error(reg.predict(val_X), val_y))

# print('test r^2: ', reg.score(test_X, test_y))
# print('test mse: ', mean_squared_error(reg.predict(test_X), test_y))

In [None]:
# embs = TSNE(n_components=2).fit_transform(train_X)

In [None]:
# fig = px.scatter_3d(x=embs[:, 0], y=embs[:, 1], z=train_y)
# fig.show()

In [None]:
# dtreg = DecisionTreeRegressor(random_state = 0, max_depth = 8)
# dtreg.fit(train_X, train_y)

# print('Train R^2: ', dtreg.score(train_X, train_y))
# print('Train MSE: ', mean_squared_error(dtreg.predict(train_X), train_y))

# print('Valid R^2: ', dtreg.score(val_X, val_y))
# print('Valid MSE: ', mean_squared_error(dtreg.predict(val_X), val_y))

# print('Test R^2: ', dtreg.score(test_X, test_y))
# print('Test MSE: ', mean_squared_error(dtreg.predict(test_X), test_y))

In [None]:
# Load the movie table that the word-cloud cells below read 'IMDB Score' and
# 'overview' from.
# NOTE(review): this file is read from the parent directory, not from '../data/'
# like the train/val/test splits — confirm the path is intentional.
movies = pd.read_csv('../final_data.csv')

In [None]:
# Keep the 16 rows with the highest 'IMDB Score' (sorted descending).
best_scores = movies.sort_values('IMDB Score', ascending=False).head(16)

In [None]:
# Overview text of the 16 top-rated movies, re-indexed 0..15 so it can be
# addressed by grid position.
# The Series is derived from `movies` rather than from the previous value of
# `best_scores`: rebinding `best_scores` from itself made this cell fail when
# run a second time, because a Series has no 'overview' column.
best_scores = (
    movies.sort_values(by='IMDB Score', ascending=False)
    .iloc[:16]['overview']
    .reset_index(drop=True)
)

In [None]:
# Draw one word cloud per movie overview in `best_scores` on a 4x4 grid.
fig, axes = plt.subplots(4, 4, figsize=(16, 12))

# Hide ticks and frames on every panel up front, so a panel stays blank rather
# than showing empty axes if there are fewer than 16 overviews.
for ax in axes.flat:
    ax.axis("off")

# axes.flat walks the grid row by row, the same order as index 4*i + j.
for ax, text in zip(axes.flat, best_scores):
    wordcloud = WordCloud(max_font_size=60, background_color='white').generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')

# plt.show() instead of fig.show(): Figure.show() warns on the non-GUI backend
# notebooks render with, while pyplot's show displays the figure inline.
plt.show()

In [12]:
# Fit a random forest on the training split and report R^2 and MSE on every split.
rfreg = RandomForestRegressor(max_depth=32, random_state=0, n_estimators=100)
rfreg.fit(train_X, train_y)

for split_name, split_X, split_y in (
    ('Train', train_X, train_y),
    ('Valid', val_X, val_y),
    ('Test', test_X, test_y),
):
    print(f'{split_name} R^2: ', rfreg.score(split_X, split_y))
    # sklearn metrics take (y_true, y_pred). MSE is symmetric, so the printed
    # value is unchanged, but the correct order keeps this safe if the metric
    # is later swapped for an asymmetric one such as r2_score.
    print(f'{split_name} MSE: ', mean_squared_error(split_y, rfreg.predict(split_X)))

Train R^2:  0.8174309780645628
Train MSE:  0.2696671870591365
Valid R^2:  0.3953315654391144
Valid MSE:  0.8622263989483003
Test R^2:  0.44529829535056664
Test MSE:  0.8273452589746876


In [None]:
# Show the most important random-forest features, most important first.
TOP_N = 40  # maximum number of features to list and plot

importances_order = np.argsort(rfreg.feature_importances_)[::-1]
top_idx = importances_order[:TOP_N]
top_names = train_X.columns[top_idx]
top_importances = rfreg.feature_importances_[top_idx]

print(top_names)
print(top_importances)

# Size the x positions from the actual slice: with fewer than TOP_N features a
# fixed range(40) would not match the number of bar heights.
positions = range(len(top_idx))

plt.figure()
plt.title("Feature importances")
plt.bar(positions, top_importances)
plt.xticks(positions, top_names, rotation='vertical')
plt.show()

In [None]:
# `stop_words` is not defined in any cell of this notebook, so `len(stop_words)`
# raises NameError on a fresh kernel. Count wordcloud's STOPWORDS set instead,
# which is the stop-word collection used to build `rem_cols`.
# NOTE(review): confirm this is the collection the count was meant for.
len(STOPWORDS)

In [None]:
# Display the training feature matrix again.
train_X