## List of variables:  
**Main dataset:**
- `data_raw` : data imported locally, needs some touching up
- `data` : data stripped of unneeded information
- `data_ml` : actual data used for machine learning

**For machine learning:**
- `results_<train/test>` : oscar results, in boolean form
- `movinfo_<train/test>` : movie data, as a large dataframe

In [1]:
## Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
sb.set(palette='icefire')

## Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn import preprocessing ## module to convert str/obj to a useful int

from collections import Counter
import statistics as stat
import hashlib

In [2]:
## import cleaned data from local dir
data_raw = pd.read_csv('./data/Final_Movie_Data_clean.csv')
data = data_raw.copy(deep=True)
data_raw.info()
data_raw.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838 entries, 0 to 1837
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1838 non-null   int64  
 1   tmdb_id            1838 non-null   int64  
 2   imdb_id            1838 non-null   object 
 3   film               1838 non-null   object 
 4   year               1838 non-null   int64  
 5   oscar_cat          1838 non-null   object 
 6   oscar_win          1838 non-null   bool   
 7   oscar_nominations  1838 non-null   int64  
 8   total_wins         1838 non-null   int64  
 9   total_nominations  1838 non-null   int64  
 10  cast_popularity    1838 non-null   float64
 11  crew_popularity    1838 non-null   float64
 12  tmdb_vote_average  1838 non-null   int64  
 13  tmdb_vote_count    1838 non-null   int64  
 14  imdb_rating        1838 non-null   int64  
 15  imdb_votes         1838 non-null   int64  
 16  rotten_tomatoes    1833 

Unnamed: 0.1,Unnamed: 0,tmdb_id,year,oscar_nominations,total_wins,total_nominations,cast_popularity,crew_popularity,tmdb_vote_average,tmdb_vote_count,imdb_rating,imdb_votes,rotten_tomatoes,metascore,genre_id_0,genre_id_1
count,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1833.0,1586.0,1836.0,1399.0
mean,918.5,55636.692601,1990.130577,5.627312,34.085963,56.468444,75.40444,49.638538,73.144178,2827.273667,75.741023,224228.3,82.727769,78.778689,515.738017,3060.347391
std,530.729215,112345.263388,17.487232,3.289987,42.673415,69.552938,51.746486,61.079298,5.639255,4233.068392,5.159868,332666.3,18.59211,11.503266,2191.254486,4794.62674
min,0.0,11.0,1960.0,1.0,2.0,2.0,2.843,0.6,45.0,2.0,58.0,68.0,10.0,26.0,12.0,12.0
25%,459.25,1542.75,1975.0,3.0,8.0,12.0,36.48,16.35025,69.0,211.0,72.0,19756.0,80.0,72.0,18.0,18.0
50%,918.5,11159.0,1990.0,5.0,17.0,24.0,63.7005,28.29,73.0,930.0,76.0,92527.0,89.0,80.0,18.0,36.0
75%,1377.75,42122.0,2006.0,8.0,42.0,77.0,98.916,58.65,77.0,3322.0,79.0,277242.0,93.0,87.0,35.0,10402.0
max,1837.0,551332.0,2019.0,14.0,297.0,372.0,298.903,714.314,87.0,28781.0,93.0,2367380.0,99.0,100.0,10752.0,10752.0


In [3]:
## replacing all NAN values in 'metascore' col with the existing average score
#display(data['metascore'].describe())
#display(data.info())
data['metascore'].fillna(data['metascore'].mean(), axis=0, inplace=True)
#display(data['metascore'].describe())
#display(data.info())

In [4]:
## Remove columns not needed in machine learning
## Cols that do with year of film, name etc
data_ml = data.drop([
    'Unnamed: 0',
    'tmdb_id',
    'imdb_id',
    'film',
    'year'
], axis=1)

## remove NaN values
data_ml.dropna(axis=0, subset=['producer_0','rotten_tomatoes'], inplace=True)
## remove genre_id_1, producer_1 and screenplay_0, too little values, cannot be substituted for stuff like average
data_ml.drop([
    'genre_id_1',
    'producer_1',
    'screenplay_0'
], axis=1, inplace=True)
data_ml.info()
#data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1770 entries, 1 to 1837
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   oscar_cat          1770 non-null   object 
 1   oscar_win          1770 non-null   bool   
 2   oscar_nominations  1770 non-null   int64  
 3   total_wins         1770 non-null   int64  
 4   total_nominations  1770 non-null   int64  
 5   cast_popularity    1770 non-null   float64
 6   crew_popularity    1770 non-null   float64
 7   tmdb_vote_average  1770 non-null   int64  
 8   tmdb_vote_count    1770 non-null   int64  
 9   imdb_rating        1770 non-null   int64  
 10  imdb_votes         1770 non-null   int64  
 11  rotten_tomatoes    1770 non-null   float64
 12  metascore          1770 non-null   float64
 13  director           1770 non-null   object 
 14  genre_id_0         1770 non-null   float64
 15  cast_0             1770 non-null   object 
 16  cast_1             1770 

In [5]:
## hash all strings into ints, using sha256
strhash = lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8

fields = data_ml.select_dtypes('object').columns.tolist()
for field in fields:
    data_ml[field] = data_ml[field].apply(lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8)

In [6]:
## partitioning data into train and test
## use results and movinfo as response and predictor
results = data_ml['oscar_win']
movinfo = data_ml.drop(['oscar_win'], axis=1)
## default split is 75% train, 25% test
movinfo_train, movinfo_test, results_train, results_test = train_test_split(movinfo, results) 
#movinfo_train.info()
#len(results_test)/(len(results_train)+len(results_test)

In [7]:
## boost, OK
booster = AdaBoostClassifier(n_estimators=100, ## number of weak learners to be trained iteratively, more may be better, see results
                             learning_rate=0.01) ## something to do with how the weak learners are weighted

model = booster.fit(movinfo_train, results_train)

In [8]:
results_pred = model.predict(movinfo_test)
print("Accuracy:",metrics.accuracy_score(results_test, results_pred))

Accuracy: 0.801354401805869


---
### Test cells
---

In [None]:
## trying out label encoding and count encoding
## count encoding is being a bitch rn
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [None]:
count_encoder = ce.count.CountEncoder(return_df=True)
test_df = data.copy(deep=True)

In [None]:
#test_df.info()
#test_df['starring_0'] = count_encoder.fit_transform(test_df['starring_0'])
#est_df['starring_0']
test_df = data.copy(deep=True)
test_df['starring_0'] = count_encoder.fit_transform(test_df['starring_0'])
test_df['starring_0'].unique()

In [None]:
test_df = data.copy(deep=True)
test_df['starring_0'] = label_encoder.fit_transform(test_df['starring_0'])
test_df['starring_0'].unique()

In [None]:
## trying out encoding objs to a unique int
data_v2 = data.copy(deep=True)
data_v2_n = data_v2.drop(['producer_1','screenplay_0'], axis=1)
data_v2_n.info()

In [None]:
fields = [
    'oscar_cat',
    'director',
    'starring_0',
    'starring_1',
    'starring_2',
    'starring_3',
    'producer_0'
]
for field in fields:
    data_v2_n[field] = label_encoder.fit_transform(data_v2_n[field])
data_v2_n

In [None]:
## partitioning data into train and test
X = data_v2_n.drop(['oscar_win'], axis=1)

y = data_v2_n['oscar_win']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
## boost, OK
booster = AdaBoostClassifier(n_estimators=60, ## number of weak learners to be trained iteratively, more may be better, see results
                             learning_rate=0.8) ## something to do with how the weak learners are weighted

model = booster.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
## Testing out hashing function to map every str to a unique int
import hashlib

def hash(sourcedf,destinationdf,*column):
    columnName = ''
    destinationdf['hash_'+columnName.join(column)] = pd.DataFrame(sourcedf[list(column)].values.sum(axis=1))[0].apply(lambda x: int(hashlib.sha512(x.encode('utf-8')).hexdigest(),16) % 10**8)

In [None]:
%time
## This works
strhash = lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8

In [None]:
## no longer in use
## convert all objects and strings into ints, using LabelEncoder
label_encoder = preprocessing.LabelEncoder()

fields = data_ml.select_dtypes('object').columns.tolist()
for field in fields:
    data_ml[field] = label_encoder.fit_transform(data_ml[field])
    
data_ml

In [None]:
z = data_ml.copy(deep=True)
fields = z.select_dtypes('object').columns.tolist()
z = z.convert_dtypes()
#z.info()
for field in fields:
    z[field] = z[field].apply(lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8)
#z.apply(lambda x: int(hashlib.sha256(x.encode('utf-8')).hexdigest(),16) % 10**8)
z