## List of variables:  
**Main dataset:**
- `data_raw` : data imported locally, needs some touching up
- `data` : data stripped of unneeded information
- `data_ml` : actual data used for machine learning

**For machine learning:**
- `results_<train/test>` : oscar results, in boolean form
- `movinfo_<train/test>` : movie data, as a large dataframe

In [2]:
## Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
sb.set(palette='icefire')

## Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing, metrics
import xgboost

from collections import Counter
import statistics as stat
import hashlib ## for converting strings into unique ints

In [6]:
## import cleaned data from local dir
data_raw = pd.read_csv('./data/Final_Movie_Data_clean.csv').drop('Unnamed: 0',axis=1)
data = data_raw.copy(deep=True)
data_raw.info()
data_raw.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838 entries, 0 to 1837
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tmdb_id            1838 non-null   int64  
 1   imdb_id            1838 non-null   object 
 2   film               1838 non-null   object 
 3   year               1838 non-null   int64  
 4   budget             1838 non-null   float64
 5   revenue            1838 non-null   float64
 6   oscar_cat          1838 non-null   object 
 7   oscar_win          1838 non-null   bool   
 8   oscar_nominations  1838 non-null   int64  
 9   oscar_win_count    1838 non-null   int64  
 10  total_wins         1838 non-null   int64  
 11  total_nominations  1838 non-null   int64  
 12  cast_popularity    1838 non-null   float64
 13  crew_popularity    1838 non-null   float64
 14  tmdb_vote_average  1838 non-null   int64  
 15  tmdb_vote_count    1838 non-null   int64  
 16  imdb_rating        1838 

Unnamed: 0,tmdb_id,year,budget,revenue,oscar_nominations,oscar_win_count,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,tmdb_vote_count,imdb_rating,imdb_votes,rotten_tomatoes,metascore,genre_id_0,genre_id_1
count,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1836.0,1399.0
mean,55636.692601,1990.130577,22679390.0,131239900.0,5.627312,1.574538,34.085963,56.468444,75.40444,49.638538,73.144178,2827.273667,75.741023,224228.3,82.727947,78.704704,515.738017,3060.347391
std,112345.263388,17.487232,28310610.0,196133700.0,3.289987,2.013467,42.673415,69.552938,51.746486,61.079298,5.639255,4233.068392,5.159868,332666.3,18.56721,10.720935,2191.254486,4794.62674
min,11.0,1960.0,6.0,8.0,1.0,0.0,2.0,2.0,2.843,0.6,45.0,2.0,58.0,68.0,10.0,26.0,12.0,12.0
25%,1542.75,1975.0,8000000.0,35892330.0,3.0,0.0,8.0,12.0,36.48,16.35025,69.0,211.0,72.0,19756.0,80.0,74.0,18.0,18.0
50%,11159.0,1990.0,15385530.0,88545090.0,5.0,1.0,17.0,24.0,63.7005,28.29,73.0,930.0,76.0,92527.0,89.0,79.883065,18.0,36.0
75%,42122.0,2006.0,25000000.0,140000000.0,8.0,2.0,42.0,77.0,98.916,58.65,77.0,3322.0,79.0,277242.0,93.0,86.0,35.0,10402.0
max,551332.0,2019.0,237000000.0,2787965000.0,14.0,11.0,297.0,372.0,298.903,714.314,87.0,28781.0,93.0,2367380.0,99.0,100.0,10752.0,10752.0


## NO LONGER NEEDED
## replacing all NAN values in 'metascore' col with the existing average score
#display(data['metascore'].describe())
#display(data.info())
data['metascore'].fillna(data['metascore'].mean(), axis=0, inplace=True)
#display(data['metascore'].describe())
#display(data.info())

In [9]:
## Remove columns not needed in machine learning
## Cols that do with year of film, name etc
data_ml = data.drop([
    'tmdb_id',
    'imdb_id',
    'film',
    'year'
], axis=1)

## remove NaN values
data_ml.dropna(axis=0, subset=['producer_0',], inplace=True)
## remove genre_id_1, producer_1 and screenplay_0, too little values, cannot be substituted for stuff like average
data_ml.drop([
    'genre_id_1',
    'producer_1',
    'screenplay_0'
], axis=1, inplace=True)
data_ml.info()
#data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1773 entries, 1 to 1837
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   budget             1773 non-null   float64
 1   revenue            1773 non-null   float64
 2   oscar_cat          1773 non-null   object 
 3   oscar_win          1773 non-null   bool   
 4   oscar_nominations  1773 non-null   int64  
 5   oscar_win_count    1773 non-null   int64  
 6   total_wins         1773 non-null   int64  
 7   total_nominations  1773 non-null   int64  
 8   cast_popularity    1773 non-null   float64
 9   crew_popularity    1773 non-null   float64
 10  tmdb_vote_average  1773 non-null   int64  
 11  tmdb_vote_count    1773 non-null   int64  
 12  imdb_rating        1773 non-null   int64  
 13  imdb_votes         1773 non-null   int64  
 14  rotten_tomatoes    1773 non-null   float64
 15  metascore          1773 non-null   float64
 16  director           1773 

In [10]:
## hash all strings into ints, using sha256
strhash = lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8

fields = data_ml.select_dtypes('object').columns.tolist()
for field in fields:
    data_ml[field] = data_ml[field].apply(lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8)

In [None]:
data_ml.columns

## Learning using Adaptive Boosting

In [11]:
## ML part (re-run with some headers included/ommited to see which is best)
results = data_ml['oscar_win'] ## response
movinfo = data_ml.drop(['oscar_win'#,'oscar_cat','director','producer_0','cast_3','cast_2','cast_1','cast_0'
                        ], axis=1) ## predictor, vary the columns used
display(movinfo.head(1))
## default split is 75% train, 25% test
movinfo_train, movinfo_test, results_train, results_test = train_test_split(movinfo, results) 

## Adaboost, OK
adaboost = AdaBoostClassifier(n_estimators=100, ## number of weak learners to be trained iteratively, more may be better, see results
                             learning_rate=0.2) ## something to do with how the weak learners are weighted
model = adaboost.fit(movinfo_train, results_train)

## Try out the model on test data
results_pred = model.predict(movinfo_test)
print("Accuracy:",metrics.accuracy_score(results_test, results_pred))

Unnamed: 0,budget,revenue,oscar_cat,oscar_nominations,oscar_win_count,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,...,imdb_votes,rotten_tomatoes,metascore,director,genre_id_0,cast_0,cast_1,cast_2,cast_3,producer_0
1,3000000.0,10400000.0,91107069,5,3,8,13,39.177,15.158,72,...,10534,94.0,76.894942,59792764,18.0,47523203,90572685,4910213,883773,17187743


Accuracy: 0.831081081081081


## Learning using Gradient Boosting

In [12]:
## ML part (re-run with some headers included/ommited to see which is best) trying out gradient boosting
results = data_ml['oscar_win'] ## response
movinfo = data_ml.drop(['oscar_win'#,'oscar_cat','director','producer_0','cast_3','cast_2','cast_1','cast_0'
                        ], axis=1) ## predictor, vary the columns used
display(movinfo.head(1))
## default split is 75% train, 25% test
movinfo_train, movinfo_test, results_train, results_test = train_test_split(movinfo, results) 

## Gradboost, OK
gradbooster = GradientBoostingClassifier(n_estimators=100, ## number of weak learners to be trained iteratively, more may be better, see results
                                         learning_rate=0.2,
                                         subsample=0.6
                                        ) ## something to do with how the weak learners are weighted
model = gradbooster.fit(movinfo_train, results_train)

## Try out the model on test data
results_pred = model.predict(movinfo_test)
print("Accuracy:",metrics.accuracy_score(results_test, results_pred))

Unnamed: 0,budget,revenue,oscar_cat,oscar_nominations,oscar_win_count,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,...,imdb_votes,rotten_tomatoes,metascore,director,genre_id_0,cast_0,cast_1,cast_2,cast_3,producer_0
1,3000000.0,10400000.0,91107069,5,3,8,13,39.177,15.158,72,...,10534,94.0,76.894942,59792764,18.0,47523203,90572685,4910213,883773,17187743


Accuracy: 0.8400900900900901


## Learning using ExtRa Trees (Extremely Random Trees)

In [13]:
## ML part (re-run with some headers included/ommited to see which is best) trying out gradient boosting
results = data_ml['oscar_win'] ## response
movinfo = data_ml.drop(['oscar_win'#,'oscar_cat','director','producer_0','cast_3','cast_2','cast_1','cast_0'
                        ], axis=1) ## predictor, vary the columns used
display(movinfo.head(1))
## default split is 75% train, 25% test
movinfo_train, movinfo_test, results_train, results_test = train_test_split(movinfo, results) 

## ExtRaboost, OK
extratrees = ExtraTreesClassifier(n_estimators=1000, ## number of weak learners to be trained iteratively, more may be better, see results
                                  max_depth=None,
                                  min_samples_split=2,
                                  random_state=0,
                                  warm_start=True
                                ) ## something to do with how the weak learners are weighted
model = extratrees.fit(movinfo_train, results_train)

## Try out the model on test data
results_pred = model.predict(movinfo_test)
print("Accuracy:",metrics.accuracy_score(results_test, results_pred))

Unnamed: 0,budget,revenue,oscar_cat,oscar_nominations,oscar_win_count,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,...,imdb_votes,rotten_tomatoes,metascore,director,genre_id_0,cast_0,cast_1,cast_2,cast_3,producer_0
1,3000000.0,10400000.0,91107069,5,3,8,13,39.177,15.158,72,...,10534,94.0,76.894942,59792764,18.0,47523203,90572685,4910213,883773,17187743


Accuracy: 0.7432432432432432


## Learning using Random Forests

In [14]:
## ML part (re-run with some headers included/ommited to see which is best) trying out gradient boosting
results = data_ml['oscar_win'] ## response
movinfo = data_ml.drop(['oscar_win'#,'oscar_cat','director','producer_0','cast_3','cast_2','cast_1','cast_0'
                        ], axis=1) ## predictor, vary the columns used
display(movinfo.head(1))
## default split is 75% train, 25% test
movinfo_train, movinfo_test, results_train, results_test = train_test_split(movinfo, results) 

## RandForestboost, OK
randforest = RandomForestClassifier(n_estimators=100, ## number of weak learners to be trained iteratively, more may be better, see results
                                    warm_start=True
                                    ) ## something to do with how the weak learners are weighted
model = randforest.fit(movinfo_train, results_train)

## Try out the model on test data
results_pred = model.predict(movinfo_test)
print("Accuracy:",metrics.accuracy_score(results_test, results_pred))

Unnamed: 0,budget,revenue,oscar_cat,oscar_nominations,oscar_win_count,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,...,imdb_votes,rotten_tomatoes,metascore,director,genre_id_0,cast_0,cast_1,cast_2,cast_3,producer_0
1,3000000.0,10400000.0,91107069,5,3,8,13,39.177,15.158,72,...,10534,94.0,76.894942,59792764,18.0,47523203,90572685,4910213,883773,17187743


Accuracy: 0.777027027027027


## Learning using eXtreme Gradient Boosting (XGBoost)

In [15]:
## ML part (re-run with some headers included/ommited to see which is best) trying out gradient boosting
results = data_ml['oscar_win'] ## response
movinfo = data_ml.drop(['oscar_win'#,'oscar_cat','director','producer_0','cast_3','cast_2','cast_1','cast_0'
                        ], axis=1) ## predictor, vary the columns used
display(movinfo.head(1))
## default split is 75% train, 25% test
movinfo_train, movinfo_test, results_train, results_test = train_test_split(movinfo, results) 

num_round=100
xgb = xgboost.XGBClassifier(objective='reg:logistic',
                           max_depth = 100, 
                           alpha = 15, 
                           n_estimators = 10)
model = xgb.fit(movinfo_train, results_train)

## Try out the model on test data
results_pred = xgb.predict(movinfo_test)
## error on this line
print("Accuracy:",metrics.accuracy_score(results_test, results_pred))

Unnamed: 0,budget,revenue,oscar_cat,oscar_nominations,oscar_win_count,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,...,imdb_votes,rotten_tomatoes,metascore,director,genre_id_0,cast_0,cast_1,cast_2,cast_3,producer_0
1,3000000.0,10400000.0,91107069,5,3,8,13,39.177,15.158,72,...,10534,94.0,76.894942,59792764,18.0,47523203,90572685,4910213,883773,17187743




Accuracy: 0.8423423423423423


  "memory consumption")


---
### Test cells
---

In [None]:
## trying out label encoding and count encoding
## count encoding is being a bitch rn
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [None]:
count_encoder = ce.count.CountEncoder(return_df=True)
test_df = data.copy(deep=True)

In [None]:
#test_df.info()
#test_df['starring_0'] = count_encoder.fit_transform(test_df['starring_0'])
#est_df['starring_0']
test_df = data.copy(deep=True)
test_df['starring_0'] = count_encoder.fit_transform(test_df['starring_0'])
test_df['starring_0'].unique()

In [None]:
test_df = data.copy(deep=True)
test_df['starring_0'] = label_encoder.fit_transform(test_df['starring_0'])
test_df['starring_0'].unique()

In [None]:
## trying out encoding objs to a unique int
data_v2 = data.copy(deep=True)
data_v2_n = data_v2.drop(['producer_1','screenplay_0'], axis=1)
data_v2_n.info()

In [None]:
fields = [
    'oscar_cat',
    'director',
    'starring_0',
    'starring_1',
    'starring_2',
    'starring_3',
    'producer_0'
]
for field in fields:
    data_v2_n[field] = label_encoder.fit_transform(data_v2_n[field])
data_v2_n

In [None]:
## partitioning data into train and test
X = data_v2_n.drop(['oscar_win'], axis=1)

y = data_v2_n['oscar_win']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
## boost, OK
booster = AdaBoostClassifier(n_estimators=60, ## number of weak learners to be trained iteratively, more may be better, see results
                             learning_rate=0.8) ## something to do with how the weak learners are weighted

model = booster.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
## Testing out hashing function to map every str to a unique int
import hashlib

def hash(sourcedf,destinationdf,*column):
    columnName = ''
    destinationdf['hash_'+columnName.join(column)] = pd.DataFrame(sourcedf[list(column)].values.sum(axis=1))[0].apply(lambda x: int(hashlib.sha512(x.encode('utf-8')).hexdigest(),16) % 10**8)

In [None]:
%time
## This works
strhash = lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8

In [None]:
## no longer in use
## convert all objects and strings into ints, using LabelEncoder
label_encoder = preprocessing.LabelEncoder()

fields = data_ml.select_dtypes('object').columns.tolist()
for field in fields:
    data_ml[field] = label_encoder.fit_transform(data_ml[field])
    
data_ml

In [None]:
z = data_ml.copy(deep=True)
fields = z.select_dtypes('object').columns.tolist()
z = z.convert_dtypes()
#z.info()
for field in fields:
    z[field] = z[field].apply(lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8)
#z.apply(lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8)
z