In [2]:
import json
import glob
import sklearn
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('result.csv',index_col=['imdbID'], header=0)
df = df[np.isfinite(df['Opening Weekend USA'])]
df['BoxOffice'] = df['BoxOffice'].replace( '[\$,)]','', regex=True ).astype(float)
df['Production'] = [d.replace('/',',') for d in df['Production']]
df['Director'] = [d.replace('(co-director)','') for d in df['Director']]
df['Runtime'] = df['Runtime'].replace( 'min','', regex=True ).astype(int)
df['Released'] = [a[3:6] for a in df['Released']]
df['imdbRating'] = [round(r * 2) / 2 for r in df['imdbRating']]
df['Oscar'] = [int(str.find('Oscar') > 0) for str in df['Awards']]
df = shuffle(df)

In [4]:
def one_hot_encoding_column(df, col_name, delimiter=','):
    item_map = {}
    item_count = 0
    movie_size = len(df)
    for i in range(movie_size):
        items = str(df[col_name].iloc[i])
        items_arr = items.split(delimiter)
        for item in items_arr:
            item = item.strip()
            if item not in item_map:
                item_map[item] = item_count
                item_count += 1

    Matrix = [[0 for x in range(item_count)] for y in range(movie_size)] 
    for i in range(movie_size):
        items = str(df[col_name].iloc[i])
        items_arr = items.split(delimiter)
        for item in items_arr:
            item = item.strip()
            idx = item_map[item]
            Matrix[i][idx] = 1
    return item_map, np.asarray(Matrix)

In [5]:
def get_log_label (bins, box_office_value):
    log_val = np.log(box_office_value)
    for i in range(len(bins[1])):
        if bins[1][i] <= log_val and log_val <= bins[1][i+1]:
            return i
    return len(bins[1]) - 1

In [6]:
def setup_prediction_matrix (bins, column_names, year):
    
    labels = [get_log_label(bins, o) for o in df['Opening Weekend USA']]
    encoded_matrix = []
    for column in column_names:
        column_map, column_Matrix = one_hot_encoding_column(df,column)
        encoded_matrix.append(column_Matrix)
        
    t_Matrix = np.concatenate(encoded_matrix, axis=1)     
    x_train = t_Matrix[df['Year'] != year]
    x_test = t_Matrix[df['Year'] == year ]
    y_train = np.array(labels)[df['Year'] != year]
    y_test = np.array(labels)[df['Year'] == year ]
    
    return x_train, x_test, y_train, y_test


def logistic_regression_model (bin_size,column_names, year, verbose=False):
    bins = plt.hist(np.log(df['Opening Weekend USA']), bin_size)
    x_train, x_test, y_train, y_test = setup_prediction_matrix (bins,column_names, year)
    clf_l2_LR = LogisticRegression(multi_class="multinomial", solver = 'lbfgs', \
        class_weight ='balanced', random_state = 2)
    clf_l2_LR.fit(x_train, y_train)
    y_predit = clf_l2_LR.predict(x_test)
    if verbose:
        print ("Logistic regression with:", bin_size, "label classes,", "accuracy:", np.mean(y_predit == y_test))
    plt.close()
    return bins 
    
    
def decision_tree_model (bin_size,column_names, year, verbose=False):
    bins = plt.hist(np.log(df['Opening Weekend USA']), bin_size)
    x_train, x_test, y_train, y_test = setup_prediction_matrix (bins,column_names, year)
    
    DTC = DecisionTreeClassifier()
    DTC.fit(x_train, y_train)
    y_predit = DTC.predict(x_test)
    if verbose:
        print ("Decision Tree with:", bin_size, "label classes,", "accuracy:", np.mean(y_predit == y_test))
    plt.close()
    return bins, DTC

def random_forest_model (bin_size,column_names, year, verbose=False):
    bins = plt.hist(np.log(df['Opening Weekend USA']), bin_size)
    x_train, x_test, y_train, y_test = setup_prediction_matrix (bins,column_names, year)
    
    CLF = RandomForestClassifier()
    CLF.fit(x_train, y_train)
    y_predit = CLF.predict(x_test)
    if verbose:
        print ("SVM with:", bin_size, "label classes,", "accuracy:", np.mean(y_predit == y_test))
    plt.close()
    return bins, CLF

    
def show_bins(bins):
     for i in range(len(bins[0])):
        print ("label", i+1 , ":",  np.exp(bins[1][i]), '-' ,np.exp(bins[1][i+1]), "," , str(bins[0][i]) , "movies")
    
features = ['Actors','Production', 'Director', 'Country', 'Rated', 'Released', 'Genre', 'imdbRating']

#logistic_regression_model(2, 2017)
#logistic_regression_model(4, 2017)
#logistic_regression_model(8, features, 2017)
#decision_tree_model(2, features, 2017)
#decision_tree_model(4, features, 2017)
#decision_tree_model(8, features, 2017)
#svm_model(2, features, 2017)
#bins, CLF = svm_model(4, features, 2017)

#svm_model(8, features, 2017)

In [18]:
def feature_importance_study(column_name, year, df, bin_size, n):
    
    def recover_key(dicty, value):
        for a_key in dicty.keys():
            if (dicty[a_key] == value):
                return a_key

        
    col_map, col_matrix = one_hot_encoding_column(df, column_name)
    bins, CLF = decision_tree_model(n, [column_name], year)

    importances = CLF.feature_importances_
    indices = np.argsort(importances)[::-1]
    print("Feature ranking for:", column_name)
    cc = 0
    for f in range(col_matrix.shape[1]):
        print("%d. %s (%f)" % (f + 1, recover_key(col_map, indices[f]), importances[indices[f]]))  
        if cc == n:
            break
        cc += 1
    print ("")
    



feature_importance_study ("Production", 2017, df, 4, 3)
feature_importance_study ("Genre", 2017, df, 4, 3)
feature_importance_study ("Actors", 2017, df, 8, 3)
feature_importance_study ("Director", 2017, df, 4, 3)
feature_importance_study ("Released", 2017, df, 4, 3)


Feature ranking for: Production
1. Sony Pictures Classics (0.084816)
2. Universal Pictures (0.051247)
3. Warner Bros. Pictures (0.051029)
4. 20th Century Fox (0.049313)

Feature ranking for: Genre
1. Drama (0.506969)
2. Action (0.053715)
3. Biography (0.049320)
4. Romance (0.042706)

Feature ranking for: Actors
1. Kristin Scott Thomas (0.004973)
2. Adam Sandler (0.004208)
3. Johnny Depp (0.004190)
4. Dwayne Johnson (0.004106)

Feature ranking for: Director
1. Tyler Perry (0.005018)
2. Gus Van Sant (0.003602)
3. Shawn Levy (0.002848)
4. Ridley Scott (0.002830)

Feature ranking for: Released
1. Jan (0.364891)
2. Sep (0.203495)
3. May (0.202065)
4. Apr (0.109755)



In [49]:
encoded_matrix = []
for column in features:
    column_map, column_Matrix = one_hot_encoding_column(df,column)
    encoded_matrix.append(column_Matrix)
    
t_Matrix = np.concatenate(encoded_matrix, axis=1)     
x_train = t_Matrix[df['Year'] != 2017]
x_test = t_Matrix[df['Year'] == 2017 ]

y_train = [o for o in df[df['Year']!=2017]['Oscar']]
y_test = [o for o in df[df['Year']==2017]['Oscar']]

clf_l2_LR = LogisticRegression(multi_class="multinomial", solver = 'lbfgs', \
        class_weight ='balanced', random_state = 2)
clf_l2_LR.fit(x_train, y_train)
y_predit = clf_l2_LR.predict(x_test)
prob = clf_l2_LR.predict_proba(x_test)
print ("Accuracy score: ", clf_l2_LR.score(x_test, y_test), "\n")

prediction = {}
for i in range (len(x_test)):
    prediction[df[df['Year']==2017].iloc[i].Title] = (df[df['Year']==2017].iloc[i].Title+ " is " + \
                                                      str(round(prob[i][1] * 100,2)) + "% positive, "\
                                                      + str(round(prob[i][0]*100,2)) + "% negative to go to the Oscar."  )


prediction['Dunkirk']

Accuracy score:  0.8527131782945736 



'Dunkirk is 85.74% positive, 14.26% negative to go to the Oscar.'