In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df_movie_reviews = pd.read_csv("movie_reviews_with_info.csv", index_col=0)
df_movie_reviews.loc[0]

Movie                                                           47 RONIN
Review                 Heavy on CGI, light on entertainment, 47 Ronin...
Score                                                               40.0
Average Movie Score                                                 44.5
Film Rating                                                       PG-13 
Description            Based on the original 1941 movie from Japan, a...
budget                                                       175000000.0
box office                                                   150962475.0
all genres                                Action|Adventure|Drama|Fantasy
Action                                                                 1
Adventure                                                              1
Animation                                                              0
Comedy                                                                 0
Crime                                              

In [17]:
len(df_movie_reviews)

18713

# Machine Learning to Predict Individual Critic Scores
Given all the variables available to us, our most optimal setup gives us predictions 17.09 points off on average(RSME) with the following configuration:


*   Linear Regression() >    KNeighborsRegressor(n_neighbors=19)
*   TfidfVectorizer(max_features=2000)
*   Variables: 'Review', 'box office', and Individual Genres(Comedy, Action, etc..)

Sadly, this is based on a 0-100 scoring system, so this prediction algorithm still cannot be said to be particularly accurate. 

We can conclude that we are unable to accurately predict what a critic's score will be for a movie just based on the data we currently have.

In [18]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [19]:
X_train = df_movie_reviews
y_train = X_train["Score"]
genres = df_movie_reviews.columns[9:]

## Linear Regression

In [8]:
#test with linear regression with top 2000 words(obtained from trial and error) in tf-idf
ct = make_column_transformer(
    (TfidfVectorizer(max_features=2000), "Review"),
    remainder="drop"  # all other columns in X will be dropped.
)

linear_model = make_pipeline(
            ct,
            LinearRegression()
            )

scores = cross_val_score(linear_model, 
                         X=X_train,
                         y=y_train,
                         scoring="neg_mean_squared_error",
                         cv=20)

print("[Review]: " + str(np.sqrt(-scores).mean()))

#[Review]: 17.47439931329904 #with top 2000 words

[Review]: 17.47439931329904


In [None]:
for columns_to_standard in [[], ["budget"], ["box office"], ["budget", "box office"], ]:
  for columns_to_encode in [[], ["all genres"]]: 
    for ct_val in [1, 2]: 
      ct = None
      if ct_val == 1:
        ct = make_column_transformer(
            (TfidfVectorizer(max_features=2000), "Review"),
            (OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode),
            (StandardScaler(), columns_to_standard),
            remainder="drop"  # all other columns in X will be dropped.
          )
      if ct_val == 2:
        ct = make_column_transformer(
            (TfidfVectorizer(max_features=2000), "Review"),
            (TfidfVectorizer(max_features=2000), "Description"),
            (OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode),
            (StandardScaler(), columns_to_standard),
            remainder="drop"  # all other columns in X will be dropped.
          )       

      pipeline = make_pipeline(
                  ct,
                  LinearRegression()
                  )

      scores = cross_val_score(pipeline, 
                              X=X_train,
                              y=y_train,
                              scoring="neg_mean_squared_error",
                              cv=20)
      columns = ["Review"] + columns_to_standard + columns_to_encode
      if ct_val == 2:
        columns = ["Review", "Description"] + columns_to_standard + columns_to_encode

      
      print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

17.47439931329904: ['Review']
17.36852875173059: ['Review', 'Description']
17.84504378939193: ['Review', 'all genres']
17.498581555973747: ['Review', 'Description', 'all genres']
17.49448927603161: ['Review', 'budget']
17.39883790680097: ['Review', 'Description', 'budget']
17.876711730281784: ['Review', 'budget', 'all genres']
17.54434062767692: ['Review', 'Description', 'budget', 'all genres']
17.218241406780642: ['Review', 'box office']
17.23318071674128: ['Review', 'Description', 'box office']
17.538651317927084: ['Review', 'box office', 'all genres']
17.316507946388036: ['Review', 'Description', 'box office', 'all genres']
17.196651973540934: ['Review', 'budget', 'box office']
17.17716556945656: ['Review', 'Description', 'budget', 'box office']
17.530666970877995: ['Review', 'budget', 'box office', 'all genres']
17.30259701240983: ['Review', 'Description', 'budget', 'box office', 'all genres']


In [None]:
for columns_to_standard in [list(genres), ["budget"] + list(genres), ["box office"] + list(genres), ["budget", "box office"] + list(genres)]:
  for columns_to_encode in [[], ["all genres"]]: 
    for ct_val in [1, 2]: 
      ct = None
      if ct_val == 1:
        ct = make_column_transformer(
            (TfidfVectorizer(max_features=2000), "Review"),
            (OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode),
            (StandardScaler(), columns_to_standard),
            remainder="drop"  # all other columns in X will be dropped.
          )
      if ct_val == 2:
        ct = make_column_transformer(
            (TfidfVectorizer(max_features=2000), "Review"),
            (TfidfVectorizer(max_features=2000), "Description"),
            (OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode),
            (StandardScaler(), columns_to_standard),
            remainder="drop"  # all other columns in X will be dropped.
          )       

      pipeline = make_pipeline(
                  ct,
                  LinearRegression()
                  )

      scores = cross_val_score(pipeline, 
                              X=X_train,
                              y=y_train,
                              scoring="neg_mean_squared_error",
                              cv=20)
      
      columns = ["Review"] + columns_to_standard + columns_to_encode
      if ct_val == 2:
        columns = ["Review", "Description"] + columns_to_standard + columns_to_encode

      print(str(np.sqrt(-scores).mean()) + ": " + str(columns))




17.409121776318482: ['Review', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
17.37175883544118: ['Review', 'Description', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
17.633396209299356: ['Review', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western', 'all genres']
17.448663362013857: ['Review', 'Description', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western', 'all genres']
17.421848329449517: ['Review', 'budget', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western'

## K-Neighbors Regression

In [21]:
#find best k-value in 6 minutes
ct = make_column_transformer(
    (TfidfVectorizer(), "Review"),
    remainder="drop"  # all other columns in X will be dropped.
)

pipeline = make_pipeline(
            ct,
            KNeighborsRegressor(n_neighbors=19)
            )


#return 19 as best k-value
grid_search = GridSearchCV(pipeline,
                           param_grid={
                               "kneighborsregressor__n_neighbors": range(1, 20)
                           },
                           scoring="neg_mean_squared_error",
                           cv=10)
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('tfidfvectorizer',
                                                  TfidfVectorizer(),
                                                  'Review')])),
                ('kneighborsregressor', KNeighborsRegressor(n_neighbors=19))])

In [None]:
#test with k neighbors with k-value of 19 (from gridSearchCV) with all words(obtained from trial and error) in tf-idf
ct = make_column_transformer(
    (TfidfVectorizer(), "Review"),
    remainder="drop"  # all other columns in X will be dropped.
)

pipeline = make_pipeline(
            ct,
            KNeighborsRegressor(n_neighbors=19)
            )

scores = cross_val_score(pipeline, 
                         X=X_train,
                         y=y_train,
                         scoring="neg_mean_squared_error",
                         cv=20)

print("[Review]: " + str(np.sqrt(-scores).mean()))


[Review]: 17.5071695121731


In [None]:
ct = make_column_transformer(
    (TfidfVectorizer(), "Review"),
    (TfidfVectorizer(), "Description"),
    remainder="drop"  # all other columns in X will be dropped.
  )

pipeline = make_pipeline(
            ct,
            KNeighborsRegressor(n_neighbors=19)
            )

scores = cross_val_score(pipeline, 
                        X=X_train,
                        y=y_train,
                        scoring="neg_mean_squared_error",
                        cv=20)

print("[Review, Description]: " + str(np.sqrt(-scores).mean()))

[Review, Description]: 17.96923610111263


In [None]:

genres = df_movie_reviews.columns[9:]

for columns_to_standard in [[], ["budget"], ["box office"], ["budget", "box office"], ]:
  for columns_to_encode in [[], ["all genres"]]: 
    ct = make_column_transformer(
        (TfidfVectorizer(), "Review"),
        (OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode),
        (StandardScaler(), columns_to_standard),
        remainder="drop"  # all other columns in X will be dropped.
      )

    pipeline = make_pipeline(
                ct,
                KNeighborsRegressor(n_neighbors=19)
                )

    scores = cross_val_score(pipeline, 
                            X=X_train,
                            y=y_train,
                            scoring="neg_mean_squared_error",
                            cv=20)
    columns = ["Review"] + columns_to_standard + columns_to_encode
    
    print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

17.5071695121731: ['Review']
18.38595259927196: ['Review', 'all genres']
18.322563774780157: ['Review', 'budget']
18.985151375126545: ['Review', 'budget', 'all genres']
17.93246114569583: ['Review', 'box office']
19.15445899565548: ['Review', 'box office', 'all genres']
19.367785021023344: ['Review', 'budget', 'box office']
20.237806867511274: ['Review', 'budget', 'box office', 'all genres']


In [None]:
for columns_to_standard in [list(genres), ["budget"] + list(genres), ["box office"] + list(genres), ["budget", "box office"] + list(genres)]:
  for columns_to_encode in [[], ["all genres"]]: 
    ct = make_column_transformer(
        (TfidfVectorizer(), "Review"),
        (OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode),
        (StandardScaler(), columns_to_standard),
        remainder="drop"  # all other columns in X will be dropped.
      )

    pipeline = make_pipeline(
                ct,
                KNeighborsRegressor(n_neighbors=19)
                )

    scores = cross_val_score(pipeline, 
                            X=X_train,
                            y=y_train,
                            scoring="neg_mean_squared_error",
                            cv=20)
    columns = ["Review"] + columns_to_standard + columns_to_encode
    print(str(np.sqrt(-scores).mean()) + ": " + str(columns))


19.392682281073142: ['Review', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
19.387944614801917: ['Review', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western', 'all genres']
19.695468667494854: ['Review', 'budget', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
19.696357197975306: ['Review', 'budget', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western', 'all genres']
20.215740588439726: ['Review', 'box office', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']
20.

## Gradient Boosting Regressor

In [22]:
  ct = make_column_transformer(
      (TfidfVectorizer(), "Review"),
      (OneHotEncoder(handle_unknown = 'ignore'), []),
      (StandardScaler(), []),
      remainder="drop"  # all other columns in X will be dropped.
    )
    
  pipeline = make_pipeline(
              ct,
              GradientBoostingRegressor()
              )


  scores = cross_val_score(pipeline, 
                          X=X_train,
                          y=y_train,
                          scoring="neg_mean_squared_error",
                          cv=20)
  
  columns = ["Review"]
  print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

18.423324955774177: ['Review']


In [23]:
  ct = make_column_transformer(
      (TfidfVectorizer(), "Review"),
      (OneHotEncoder(handle_unknown = 'ignore'), []),
      (StandardScaler(), list(genres)),
      remainder="drop"  # all other columns in X will be dropped.
    )
    
  pipeline = make_pipeline(
              ct,
              GradientBoostingRegressor()
              )


  scores = cross_val_score(pipeline, 
                          X=X_train,
                          y=y_train,
                          scoring="neg_mean_squared_error",
                          cv=20)
  
  columns = ["Review"] + list(genres)
  print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

18.446451407054564: ['Review', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']


In [24]:
  ct = make_column_transformer(
      (TfidfVectorizer(), "Review"),
      (OneHotEncoder(handle_unknown = 'ignore'), []),
      (StandardScaler(), ["box office"] + list(genres)),
      remainder="drop"  # all other columns in X will be dropped.
    )
    
  pipeline = make_pipeline(
              ct,
              GradientBoostingRegressor()
              )


  scores = cross_val_score(pipeline, 
                          X=X_train,
                          y=y_train,
                          scoring="neg_mean_squared_error",
                          cv=20)
  
  columns = ["Review"] + ["box office"] + list(genres)
  print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

17.865112618566243: ['Review', 'box office', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']


## Ensemble Methods - Stacking Regressor

In [31]:
ct = make_column_transformer(
    (TfidfVectorizer(), "Review"),
    remainder="drop"  # all other columns in X will be dropped.
)

linear_model = make_pipeline(
            ct,
            LinearRegression()
            )

knn_model = make_pipeline(
            ct,
            KNeighborsRegressor(n_neighbors=19)
            )

grad_model = make_pipeline(
              ct,
              GradientBoostingRegressor()
              )


stacking_model = StackingRegressor([
    ("linear", linear_model)],
    final_estimator=LinearRegression()
)


scores = cross_val_score(pipeline, 
                        X=X_train,
                        y=y_train,
                        scoring="neg_mean_squared_error",
                        cv=20)

columns = ["Review"]
print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

17.900357047003954: ['Review']


In [27]:
from sklearn.ensemble import StackingRegressor

ct = make_column_transformer(
    (TfidfVectorizer(), "Review"),
    remainder="drop"  # all other columns in X will be dropped.
)

linear_model = make_pipeline(
            ct,
            LinearRegression()
            )

knn_model = make_pipeline(
            ct,
            KNeighborsRegressor(n_neighbors=19)
            )

grad_model = make_pipeline(
              ct,
              GradientBoostingRegressor()
              )


stacking_model = StackingRegressor([
    ("linear", linear_model), 
    ("knn", knn_model)],
    final_estimator=LinearRegression()
)


scores = cross_val_score(pipeline, 
                        X=X_train,
                        y=y_train,
                        scoring="neg_mean_squared_error",
                        cv=20)

columns = ["Review"]
print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

17.88320758373705: ['Review']


In [30]:

ct = make_column_transformer(
    (TfidfVectorizer(), "Review"),
    remainder="drop"  # all other columns in X will be dropped.
)

linear_model = make_pipeline(
            ct,
            LinearRegression()
            )

knn_model = make_pipeline(
            ct,
            KNeighborsRegressor(n_neighbors=19)
            )

grad_model = make_pipeline(
              ct,
              GradientBoostingRegressor()
              )


stacking_model = StackingRegressor([
    ("linear", linear_model), 
    ("knn", knn_model),
    ("grad", grad_model)],
    final_estimator=LinearRegression()
)


scores = cross_val_score(pipeline, 
                        X=X_train,
                        y=y_train,
                        scoring="neg_mean_squared_error",
                        cv=20)

columns = ["Review"]
print(str(np.sqrt(-scores).mean()) + ": " + str(columns))

17.91932730413762: ['Review']
