In [1]:
import json
import glob
import sklearn
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

In [9]:
# Load the movie table, indexed by IMDb id.
df = pd.read_csv('result.csv', index_col=['imdbID'], header=0)

# Keep only rows with a finite opening-weekend figure; that column is
# log-transformed and binned further down to produce the class labels.
df = df[np.isfinite(df['Opening Weekend USA'])]

# Strip '$', ',' and ')' so the box-office text parses as a float.
# Raw string: '\$' in a plain literal is an invalid escape sequence.
df['BoxOffice'] = df['BoxOffice'].replace(r'[\$,)]', '', regex=True).astype(float)

# Production names separated by '/' are re-delimited with ',' so that they
# split the same way as the other comma-separated multi-value columns.
# Missing values pass through as NaN instead of raising.
df['Production'] = df['Production'].str.replace('/', ',', regex=False)

df['Director'] = df['Director'].str.replace('(co-director)', '', regex=False)
df['Runtime'] = df['Runtime'].replace('min', '', regex=True).astype(int)

# Characters 3..5 of the release string.
# NOTE(review): presumably the month abbreviation of a 'DD Mon YYYY' date - confirm.
df['Released'] = df['Released'].str[3:6]

# Round ratings to the nearest half point.
df['imdbRating'] = [round(r * 2) / 2 for r in df['imdbRating']]

# 1 when the awards text mentions 'Oscar' anywhere (including at position 0,
# which a `find(...) > 0` test would miss), else 0. Non-string values count as 0.
df['Oscar'] = df['Awards'].astype(str).str.contains('Oscar', regex=False).astype(int)

# Seeded so the row order (and therefore the one-hot column order built from
# it) is the same on every Restart & Run All.
df = shuffle(df, random_state=0)

In [10]:

def one_hot_encoding_column(df, col_name, delimiter=','):
    """Multi-hot encode a delimited text column.

    Every cell of ``df[col_name]`` is converted with ``str()``, split on
    ``delimiter`` and each piece is stripped of surrounding whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    col_name : str
        Name of the column to encode.
    delimiter : str, default ','
        Separator between the items inside one cell.

    Returns
    -------
    item_map : dict
        Maps each distinct item to its column index, numbered in order of
        first appearance.
    matrix : numpy.ndarray
        One row per row of ``df``; entry ``[r, item_map[item]]`` is 1 when
        row ``r`` contains ``item`` and 0 otherwise.
    """
    column = df[col_name]

    # Tokenise each row once; the token lists are reused for both the
    # vocabulary pass and the encoding pass.
    tokens_per_row = [
        [piece.strip() for piece in str(column.iloc[row]).split(delimiter)]
        for row in range(len(df))
    ]

    # Vocabulary: the next free index is always the current dict size.
    item_map = {}
    for tokens in tokens_per_row:
        for token in tokens:
            item_map.setdefault(token, len(item_map))

    width = len(item_map)
    encoded_rows = []
    for tokens in tokens_per_row:
        encoded = [0] * width
        for token in tokens:
            encoded[item_map[token]] = 1
        encoded_rows.append(encoded)

    return item_map, np.asarray(encoded_rows)

In [5]:
def get_log_label (bins, box_office_value):
    """Return the 0-based histogram bin index for ``log(box_office_value)``.

    Parameters
    ----------
    bins : sequence
        Histogram result whose element ``[1]`` is the ascending sequence of
        bin edges (``n_bins + 1`` values).
    box_office_value : float
        Value to classify; its natural logarithm is compared to the edges.

    Returns
    -------
    int
        Index of the first bin whose closed interval ``[edge_i, edge_i+1]``
        contains the log value. Values below the first edge map to bin 0;
        anything else that matches no bin (above the last edge, or NaN)
        maps to the last bin, so the result is always a valid label.
    """
    edges = bins[1]
    n_labels = len(edges) - 1
    log_val = np.log(box_office_value)

    # Iterate over bins, not edges: bin i spans edges[i]..edges[i + 1], so the
    # last valid i is n_labels - 1 and edges[i + 1] never runs off the end.
    for i in range(n_labels):
        if edges[i] <= log_val <= edges[i + 1]:
            return i

    # Outside every bin: clamp to the nearest end of the label range.
    if n_labels > 0 and log_val < edges[0]:
        return 0
    return max(n_labels - 1, 0)

In [14]:
def setup_prediction_matrix (bins, year):
    """Build the feature matrix and labels from ``df`` and split them by year.

    Reads the module-level ``df``. Each feature column is encoded with
    ``one_hot_encoding_column`` and the resulting blocks are concatenated
    column-wise. Labels come from ``get_log_label`` applied to
    'Opening Weekend USA'.

    Parameters
    ----------
    bins : sequence
        Histogram result passed through to ``get_log_label``.
    year
        Rows whose 'Year' equals this value form the test split; all other
        rows form the training split.

    Returns
    -------
    x_train, x_test, y_train, y_test : numpy.ndarray
    """
    labels = np.array([get_log_label(bins, o) for o in df['Opening Weekend USA']])

    # Column order fixes the layout of the concatenated matrix.
    # 'Year' is deliberately absent: it is the key the split is made on.
    # NOTE(review): the rating block is assumed to be the one-hot encoding of
    # 'imdbRating' (rounded to half points when the data is loaded); the name
    # previously used for it was never defined in this notebook - confirm.
    feature_columns = (
        'Actors',
        'Production',
        'Genre',
        'Country',
        'Rated',
        'Director',
        'Released',
        'imdbRating',
    )
    feature_blocks = [one_hot_encoding_column(df, col)[1] for col in feature_columns]
    t_Matrix = np.concatenate(feature_blocks, axis=1)

    # Compute the mask once, as a plain boolean array, and reuse it for both
    # the features and the labels.
    is_test = (df['Year'] == year).to_numpy()
    is_train = ~is_test

    x_train = t_Matrix[is_train]
    x_test = t_Matrix[is_test]
    y_train = labels[is_train]
    y_test = labels[is_test]

    return x_train, x_test, y_train, y_test



def logistic_regression_predict (bin_size, year, verbose=False):
    """Train and score a logistic-regression classifier of opening-weekend bins.

    The log of ``df['Opening Weekend USA']`` is histogrammed into ``bin_size``
    bins; those bins define the class labels. The model is fitted on every
    row whose year differs from ``year`` and scored on the rows of ``year``.
    The accuracy is printed.

    Parameters
    ----------
    bin_size : int
        Number of histogram bins, i.e. number of label classes.
    year
        Hold-out year forwarded to ``setup_prediction_matrix``.
    verbose : bool, default False
        Currently unused; kept so existing calls remain valid.

    Returns
    -------
    tuple
        The ``(counts, edges, patches)`` tuple returned by the histogram call.
    """
    # The histogram is needed only for its counts and edges. Draw it on a
    # private figure and close that figure even if something fails, so no
    # stray plot is left open and no pre-existing figure is touched.
    fig, ax = plt.subplots()
    try:
        bins = ax.hist(np.log(df['Opening Weekend USA']), bin_size)
    finally:
        plt.close(fig)

    x_train, x_test, y_train, y_test = setup_prediction_matrix(bins, year)

    # NOTE(review): `multi_class` is believed to be deprecated in recent
    # scikit-learn releases - confirm against the installed version.
    clf_l2_LR = LogisticRegression(
        multi_class="multinomial",
        solver='lbfgs',
        class_weight='balanced',
        random_state=2,
    )
    clf_l2_LR.fit(x_train, y_train)
    y_predicted = clf_l2_LR.predict(x_test)
    print ("Logistic regression with:", bin_size, "label classes,", "accuracy:", np.mean(y_predicted == y_test))

    return bins
    

def show_bins(bins):
    """Print one line per histogram bin: its value range and its count.

    ``bins[0]`` holds the per-bin counts and ``bins[1]`` the bin edges in log
    space; the edges are exponentiated before printing. Labels are printed
    starting from 1.
    """
    counts, edges = bins[0], bins[1]
    for label, count in enumerate(counts, start=1):
        lower = np.exp(edges[label - 1])
        upper = np.exp(edges[label])
        print ("label", label, ":", lower, '-', upper, ",", str(count), "movies")
    
    
# Hold-out year and the label granularities to compare, named once instead of
# being repeated in every call.
HOLDOUT_YEAR = 2017
BIN_SIZES = (2, 4, 8)

# Evaluate each granularity in turn; after the loop `cur_bins` holds the
# histogram of the last (finest) one, which is then listed bin by bin.
for bin_size in BIN_SIZES:
    cur_bins = logistic_regression_predict(bin_size, HOLDOUT_YEAR)
show_bins(cur_bins)

Logistic regression with: 2 label classes, accuracy: 0.8294573643410853
Logistic regression with: 4 label classes, accuracy: 0.7441860465116279
Logistic regression with: 8 label classes, accuracy: 0.6356589147286822
label 1 : 64.99999999999999 - 432.1140146273819 , 1.0 movies
label 2 : 432.1140146273819 - 2872.6541790368224 , 17.0 movies
label 3 : 2872.6541790368224 - 19097.140460611212 , 245.0 movies
label 4 : 19097.140460611212 - 126956.0312667343 , 673.0 movies
label 5 : 126956.0312667343 - 843992.0054127391 , 367.0 movies
label 6 : 843992.0054127391 - 5610781.13495866 , 309.0 movies
label 7 : 5610781.13495866 - 37299956.32957797 , 1128.0 movies
label 8 : 37299956.32957797 - 247966674.99999967 , 285.0 movies
