In [148]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [240]:
import warnings
warnings.simplefilter("ignore")

In [138]:
training_data = pd.read_csv('OscarsWinners-TrainingData.csv', sep=';', encoding='latin1')
testing_data = pd.read_csv('OscarsWinners-TestingData.csv', sep=';', encoding='latin1')
testing_data = testing_data.fillna(0)
training_data = training_data.fillna(0)

In [36]:
categories_to_analyse = [[], ['Year','GG-Drama', 'GG-DramaW', 'GG-Comedy', 'GG-ComedyW', 'B-Film','B-FilmW',
                                  'CC-Film', 'CC-FilmW', 'CC-Komedia', 'CC-KomediaW', 'PGA','PGAW','O-Film'],
                             ['Year','GG-Dir', 'GG-DirW', 'B-Dir', 'B-DirW','CC-Dir', 'CC-DirW', 'DGA','DGAW','O-Dir'],
                             ['Year','GG-DramaM1', 'GG-DramaM1W', 'GG-ComedyM1', 'GG-ComedyM1W',
                              'B-M1', 'B-M1W','CC-M1', 'CC-M1W', 'SAG-M1','SAG-M1W','O-M1'],
                             ['Year','GG-DramaK1', 'GG-DramaK1W', 'GG-ComedyK1', 'GG-ComedyK1W',
                              'B-K1','B-K1W', 'CC-K1', 'CC-K1W', 'SAG-K1', 'SAG-K1W','O-K1'],
                             ['Year','GG-M2', 'GG-M2W', 'B-M2', 'B-M2W', 'CC-M2', 'CC-M2W', 'SAG-M2','SAG-M2W','O-M2'],
                             ['Year','GG-K2', 'GG-K2W', 'B-K2','B-K2W', 'CC-K2', 'CC-K2W', 'SAG-K2', 'SAG-K2W','O-K2'],
                             ['Year','GG-Screen', 'GG-ScreenW', 'B-ScreenO','B-ScreenOW', 'CC-ScreenO', 'CC-ScreenOW',
                              'WGA-ScreenO','WGA-ScreenOW','O-ScreenO'],
                             ['Year','GG-Screen', 'GG-ScreenW', 'B-ScreenA', 'B-ScreenAW', 'CC-ScreenA', 'CC-ScreenAW',
                              'WGA-ScreenA','WGA-ScreenAW','O-ScreenA'],
                             ['Year','B-Zdj', 'CC-Zdj', 'CC-ZdjW', 'ASC','ASCW','O-Zdj'],
                             ['Year','GG-Music', 'GG-MusicW', 'B-Music','B-MusicW', 'CC-Music', 'CC-MusicW','O-MuzykaW'],
                             ['Year','B-Scen', 'B-ScenW','CC-Scen', 'CC-ScenW', 'ADG', 'ADGW','O-Scen'],
                             ['Year','B-Kost','B-KostW', 'CC-Kost', 'CC-KostW', 'CDG','CDGW','O-Kost'],
                             ['Year','B-ChF', 'B-ChFW', 'CC-ChF', 'CC-ChFW','O-ChF'],
                             ['Year','B-Efekty', 'B-EfektyW', 'CC-Efekty', 'CC-EfektyW','O-Efekty'],
                             ['Year','B-Sound', 'B-SoundW', 'ZS','ZSW','O-Sound'],
                             ['Year','B-Mon','B-MonW', 'CC-Mon', 'CC-MonW', 'ACE','ACEW','O-Mont']
                             ]
category_to_use = [[], ['O-Film'], ['O-Dir'], ['O-M1'], ['O-K1'], ['O-M2'], ['O-K2'], ['O-ScreenO'],
                           ['O-ScreenA'],
                           ['O-Zdj'], ['O-Muzyka'], ['O-Scen'], ['O-Kost'], ['O-ChF'], ['O-Efekty'], ['O-Sound'],
                           ['O-Mont']]
category_to_predict = [[], ['O-FilmW'], ['O-DirW'], ['O-M1W'], ['O-K1W'], ['O-M2W'], ['O-K2W'], ['O-ScreenOW'],
                           ['O-ScreenAW'],
                           ['O-ZdjW'], ['O-MuzykaW'], ['O-ScenW'], ['O-KostW'], ['O-ChFW'], ['O-EfektyW'], ['O-SoundW'],
                           ['O-MontW']]

In [99]:
training_data.loc[training_data['Year']!=year]

Unnamed: 0,Title,Year,GG-Drama,GG-DramaW,GG-Comedy,GG-ComedyW,GG-DramaM1,GG-DramaM1W,GG-ComedyM1,GG-ComedyM1W,...,O-Kost,O-KostW,O-ChF,O-ChFW,O-Efekty,O-EfektyW,O-Mont,O-MontW,O-Sound,O-SoundW
0,1917,2020,1,1,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,2,1
1,13 godzin: Tajna misja w Benghazi,2017,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Absolutnie fantastyczne: Film,2017,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Ad Astra,2020,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,Agent i pó³,2017,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,Zegar czarnoksiê¿nika,2019,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
228,Zimna wojna,2019,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,Zwierzêta nocy,2017,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
232,¯ona,2018,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
input_category =1
year=2016

In [252]:
sum = 0
place=[]
for input_category in range(1,17):
    for year in range(2016,2021):
        model_svc = SVC(probability=True,kernel='precomputed')
        model_logreg.fit(
            training_data.loc[training_data['Year']!=year].loc[training_data[category_to_use[input_category][0]]>0][categories_to_analyse[input_category]],
             training_data.loc[training_data['Year']!=year].loc[training_data[category_to_use[input_category][0]]>0][category_to_predict[input_category]].values.ravel())
        model_logreg_prediction = model_logreg.predict_proba(training_data[training_data['Year']==year][training_data[category_to_use[input_category][0]]>0][categories_to_analyse[input_category]])
        model_logreg_prediction_df = pd.DataFrame()
        model_logreg_prediction_df['Title']=training_data[training_data['Year']==year][training_data[category_to_use[input_category][0]]>0]['Title'].values.ravel()
        model_logreg_prediction_df['Chances (%)']=model_logreg_prediction[:, 1]
        model_logreg_prediction_df = model_logreg_prediction_df.sort_values(by='Chances (%)', ascending=False)
        model_logreg_prediction_df['Score']=list(range(1,model_logreg_prediction_df.shape[0]+1))
        i=0
        while(True):
            if training_data[category_to_predict[input_category]].values[training_data['Title']==model_logreg_prediction_df['Title'].values[i]][0][0]>0:
                sum+=i+1
                place.append(i+1)
                break
            i=i+1
print(sum)
print(np.mean(place))

119
1.4875


In [47]:
X_2016 = training_data[training_data['Year']==2016].drop(['O-FilmW', 'O-DirW', 'O-M1W', 'O-K1W', 'O-M2W', 'O-K2W', 'O-ScreenOW',
                           'O-ScreenAW',
                           'O-ZdjW', 'O-MuzykaW', 'O-ScenW', 'O-KostW', 'O-ChFW', 'O-EfektyW', 'O-SoundW',
                           'O-MontW'],axis=1)
y_2016 = training_data[training_data['Year']==2016][training_data[category_to_use[input_category][0]]>0][category_to_predict[input_category]].values.ravel()

Predictions for this category (logistic regression model):
                                  Title  Chances (%)
2                             Nomadland    40.612589
1               Ma Raine's Black Bottom    23.323032
3               Promising. Young. Woman    13.915839
0                     Pieces of a Woman    13.077467
4  The United States vs. Billie Holiday     9.071073


In [12]:
model_logreg.fit(training_data[training_data['Year']!=2016][training_data[category_to_use[input_category][0]]>0][categories_to_analyse[input_category]],
             training_data[training_data['Year']!=2016][training_data[category_to_use[input_category][0]]>0][category_to_predict[input_category]].values.ravel())

LogisticRegression()

In [137]:
training_data[training_data['Year']==2020][training_data['O-Zdj']==1]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Title,Year,GG-Drama,GG-DramaW,GG-Comedy,GG-ComedyW,GG-DramaM1,GG-DramaM1W,GG-ComedyM1,GG-ComedyM1W,...,O-Kost,O-KostW,O-ChF,O-ChFW,O-Efekty,O-EfektyW,O-Mont,O-MontW,O-Sound,O-SoundW
0,1917,2020,1,1,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,2,1
82,Irlandczyk,2020,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
88,Joker,2020,1,0,0,0,1,1,0,0,...,1,0,1,0,0,0,1,0,2,0
112,Lighthouse,2020,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
156,Pewnego razu w Hollywood,2020,0,0,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,2,0


In [127]:
training_data[category_to_predict[input_category]].values[training_data['Title']==model_logreg_prediction_df['Title'].values[i]][0][0]

0