In [10]:
# GENERAL LIBRAIRIES IMPORT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [11]:
# DATA IMPORT
netflix = pd.read_csv('./data/Netflix.csv')
netflix.drop('Unnamed: 0', axis = 1, inplace=True)
netflix.head()

Unnamed: 0,before_90s,90s,2000s,movie,other,tv movie,video movie,Action,Documentary,Comedy,...,Thriller,Romance,Other_genre,United States,United Kingdom,France,Other_country,English,Other_language,category
0,0.0,1.0,0.0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,0,niche
1,0.0,1.0,0.0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,other
2,0.0,0.0,1.0,1,0,0,0,0,1,1,...,0,0,1,1,0,0,0,1,1,other
3,0.0,1.0,0.0,1,0,0,0,0,0,1,...,0,0,1,1,0,0,0,1,0,other
4,1.0,0.0,0.0,0,0,0,1,0,1,0,...,0,0,1,1,0,0,0,1,0,niche


In [12]:
# SKLEARN IMPORTS
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [20]:
# SYNTHESIS FUNCTION
def synthetise(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_train_score = model.predict_proba(X_train)
    y_test_score = model.predict_proba(X_test)
    model_name = type(model).__name__
    print(re.sub(r"(\w)([A-Z])", r"\1 \2", model_name).upper())
    print('====================')
    print('TRAIN dataset')
    print(f'Accuracy score: {accuracy_score(y_train, y_train_pred):.1%}')
    print(f'Recall score: {recall_score(y_train, y_train_pred, average="weighted"):.1%}')
    print(f'Precision score: {precision_score(y_train, y_train_pred, average="weighted"):.1%}')
    print(f'ROC_AUC score: {roc_auc_score(y_train, y_train_score, average="weighted", multi_class="ovo"):.1%}')
    print(f'Confusion matrix')
    print(confusion_matrix(y_train, y_train_pred))
    print('====================')
    print('TEST dataset')
    print(f'Accuracy score: {accuracy_score(y_test, y_test_pred):.1%}')
    print(f'Recall score: {recall_score(y_test, y_test_pred, average="weighted"):.1%}')
    print(f'Precision score: {precision_score(y_test, y_test_pred, average="weighted"):.1%}')
    print(f'ROC_AUC score: {roc_auc_score(y_test, y_test_score, average="weighted", multi_class="ovo"):.1%}')
    print(f'Confusion matrix')
    print(confusion_matrix(y_test, y_test_pred))

In [14]:
# DATA PREPARATION

X = netflix.drop('category', axis = 1)
y = netflix.category

# random_state = 42 for reproductibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
QuadraticDiscriminantAnalysis?

[0;31mInit signature:[0m
[0mQuadraticDiscriminantAnalysis[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpriors[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreg_param[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstore_covariance[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Quadratic Discriminant Analysis.

A classifier with a quadratic decision boundary, generated
by fitting class conditional densities to the data
and using Bayes' rule.

The model fits a Gaussian density to each class.

.. versionadded:: 0.17
   *QuadraticDiscriminantAnalysis*

Read more in the :ref:`User Guide <lda_qda>`.

Parameters
----------
priors : ndarray of shape (n_classes,), default=None
    Class priors. By default, the class proportions a

In [16]:
# QUADRATIC DISCRIMINANT ANALYSIS

## Naive view: default hyperparameters & all features
### Model fitting
qdc = QuadraticDiscriminantAnalysis()
qdc.fit(X_train, y_train)



QuadraticDiscriminantAnalysis()

In [21]:
### Synthesis
synthetise(qdc, X_train, X_test, y_train, y_test)

QUADRATIC DISCRIMINANT ANALYSIS
TRAIN dataset
Accuracy score: 35.7%
Recall score: 35.7%
Precision score: 67.5%
ROC_AUC score: 68.3%
Confusion matrix
[[ 608   12   75]
 [ 340  649   80]
 [2390  603  685]]
TEST dataset
Accuracy score: 32.5%
Recall score: 32.5%
Precision score: 68.3%
ROC_AUC score: 66.9%
Confusion matrix
[[124   4  27]
 [ 76 140  13]
 [650 149 178]]


In [18]:
### Conclusions
#### Low accuracy but precision is not that bad.
#### No overfit.
#### Collinearity issues, but too complex to solve.

In [19]:
    fpr, tpr, thresholds = roc_curve(x_test["y_initial"],x_test["y_pred"])
    roc_auc = auc(fpr, tpr)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr)
    display.plot()
    plt.show()