In [27]:
# GENERAL LIBRARIES IMPORT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [28]:
# DATA IMPORT
# Read the CSV and discard its 'Unnamed: 0' column in a single expression,
# so `netflix` is built without any in-place mutation.
netflix = pd.read_csv('../data/Netflix.csv').drop(columns='Unnamed: 0')
netflix.head()

Unnamed: 0,before_90s,90s,2000s,movie,other,tv movie,video movie,Action,Documentary,Comedy,...,Thriller,Romance,Other_genre,United States,United Kingdom,France,Other_country,English,Other_language,category
0,0.0,1.0,0.0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,0,niche
1,0.0,1.0,0.0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,other
2,0.0,0.0,1.0,1,0,0,0,0,1,1,...,0,0,1,1,0,0,0,1,1,other
3,0.0,1.0,0.0,1,0,0,0,0,0,1,...,0,0,1,1,0,0,0,1,0,other
4,1.0,0.0,0.0,0,0,0,1,0,1,0,...,0,0,1,1,0,0,0,1,0,niche


In [29]:
# SKLEARN IMPORTS
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, confusion_matrix, auc, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [30]:
# SYNTHESIS FUNCTION
def synthetise(model, X_train, X_test, y_train, y_test):
    """Print a classification summary of a fitted model on the train and test sets.

    For each split the function prints accuracy, macro recall, macro
    precision, ROC AUC and the confusion matrix.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``, ``predict_proba`` and ``classes_``.
    X_train, X_test : array-like
        Feature matrices of the train and test splits.
    y_train, y_test : array-like
        True labels of the train and test splits.

    Returns
    -------
    None
        The report is written to standard output.
    """
    model_name = type(model).__name__
    # Insert a space before inner capitals so "VotingClassifier" prints as
    # "VOTING CLASSIFIER".
    print(re.sub(r"(\w)([A-Z])", r"\1 \2", model_name).upper())

    # The columns of predict_proba follow model.classes_. Passing that order
    # explicitly keeps the AUC and the confusion matrix aligned with the model
    # even when a split does not contain every class.
    labels = model.classes_

    for split_name, X_split, y_split in (('TRAIN', X_train, y_train),
                                         ('TEST', X_test, y_test)):
        y_pred = model.predict(X_split)
        y_score = model.predict_proba(X_split)

        if len(labels) == 2:
            # Binary targets: roc_auc_score expects the 1-D score of the
            # positive class, not the two-column probability matrix.
            roc_auc = roc_auc_score(y_split, y_score[:, 1])
        else:
            roc_auc = roc_auc_score(y_split, y_score, average="macro",
                                    multi_class="ovo", labels=labels)

        print('====================')
        print(f'{split_name} dataset')
        print(f'Accuracy score: {accuracy_score(y_split, y_pred):.1%}')
        print(f'Recall score (macro): {recall_score(y_split, y_pred, average="macro"):.1%}')
        print(f'Precision score (macro): {precision_score(y_split, y_pred, average="macro"):.1%}')
        print(f'ROC_AUC score (macro): {roc_auc:.1%}')
        print('Confusion matrix')
        print(confusion_matrix(y_split, y_pred, labels=labels))

In [31]:
# DATA PREPARATION

# Features are every column except the target; the target is 'category'.
X = netflix.drop(columns='category')
y = netflix['category']

# Hold out 20% of the rows for testing; the seed is fixed (42) for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Development aid: the trailing `?` is IPython syntax that displays the
# signature and docstring of VotingClassifier (see the output below).
# NOTE(review): introspection cells like this are usually removed from a finished notebook.
VotingClassifier?

[0;31mInit signature:[0m
[0mVotingClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mestimators[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvoting[0m[0;34m=[0m[0;34m'hard'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweights[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mflatten_transform[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Soft Voting/Majority Rule classifier for unfitted estimators.

Read more in the :ref:`User Guide <voting_classifier>`.

.. versionadded:: 0.17

Parameters
----------
estimators : list of (str, estimator) tuples
    Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
    of those original estimators that

In [33]:
# VOTING CLASSIFIER

## Naive view: default hyperparameters & all features
### Model fitting

# Base estimators with default hyperparameters. The decision tree is seeded
# with the same value as the train/test split (42) so that re-running the
# notebook reproduces the same fitted ensemble and the same scores.
qdc = QuadraticDiscriminantAnalysis()
dtc = DecisionTreeClassifier(random_state=42)
gnb = GaussianNB()
# Soft voting is used so the ensemble exposes predict_proba, which the
# synthesis report relies on.
vc = VotingClassifier(estimators=[('qdc', qdc), ('dtc', dtc), ('gnb', gnb)], voting='soft')
vc.fit(X_train, y_train)



VotingClassifier(estimators=[('qdc', QuadraticDiscriminantAnalysis()),
                             ('dtc', DecisionTreeClassifier()),
                             ('gnb', GaussianNB())],
                 voting='soft')

In [34]:
### Synthesis
# Report train and test metrics for the fitted voting classifier.
synthetise(
    model=vc,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

VOTING CLASSIFIER
TRAIN dataset
Accuracy score: 39.4%
Recall score (macro): 62.0%
Precision score (macro): 55.8%
ROC_AUC score (macro): 89.4%
Confusion matrix
[[ 662   10   23]
 [ 260  761   48]
 [2374  581  723]]
TEST dataset
Accuracy score: 34.5%
Recall score (macro): 57.1%
Precision score (macro): 50.2%
ROC_AUC score (macro): 77.3%
Confusion matrix
[[131   5  19]
 [ 60 156  13]
 [644 151 182]]


In [35]:
### Conclusions
#### Accuracy is below 40% on both train (39.4%) and test (34.5%); macro recall (sensitivity) and macro precision are only moderate (57.1% and 50.2% on test).
#### We reject the model.