# Modeling
- plug-in: paramétricos (LDA, KNN, Naive-bayes)
        - LDA e KNN (simples e complexo ou simples, dependendo de K)
- risk minimization: decision trees, nn, etc
    - 

    - Falso negativo é pior que falso positivo: causa menos dano mostrar algo que a pessoa não goste tanto do que deixar de mostrar algo que sabemos que a pessoa vai gostar.
    

# Models

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

## Prepare train test split datasets

In [2]:
df = pd.read_csv('../../data/final_features_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,faves_pca0,faves_pca1,unfaves_pca0,unfaves_pca1,accessories,alcohol,animamted,...,Drama.2,Entertainment (Variety Shows),Factual,Learning,Music,News,Religion &amp; Ethics,Sport.1,Weather,Rating_bin
0,0,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,3,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,4,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
df_0 = df.fillna(0)

In [4]:
Y = df_0.pop('Rating_bin')
X = df_0

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

## Baseline Models

### Plug-in Models
#### Simple: LDA

In [6]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)

In [7]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92     10267
           1       0.41      0.13      0.20      1653

    accuracy                           0.85     11920
   macro avg       0.64      0.55      0.56     11920
weighted avg       0.81      0.85      0.82     11920



In [8]:
confusion_matrix(y_test, y_pred)

array([[9951,  316],
       [1432,  221]])

#### Complex: Naive Bayes

In [9]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.92      0.71      0.80     10267
           1       0.25      0.60      0.35      1653

    accuracy                           0.70     11920
   macro avg       0.58      0.66      0.58     11920
weighted avg       0.82      0.70      0.74     11920



array([[7302, 2965],
       [ 662,  991]])

## Risk Minimization
### Simple: Decision Tree

In [10]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90     10267
           1       0.40      0.39      0.39      1653

    accuracy                           0.83     11920
   macro avg       0.65      0.65      0.65     11920
weighted avg       0.83      0.83      0.83     11920



array([[9300,  967],
       [1011,  642]])

#### Complex: Neural Networks

In [11]:
clf = MLPClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.92      0.76      0.83     10267
           1       0.29      0.59      0.39      1653

    accuracy                           0.74     11920
   macro avg       0.60      0.68      0.61     11920
weighted avg       0.83      0.74      0.77     11920



array([[7840, 2427],
       [ 677,  976]])

### Hyperparameters search

In [12]:
def hyperparameter_tunning(clf, parameters):
    clf = clf

    grid_search = GridSearchCV(clf, parameters)
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)

    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    return pd.DataFrame(grid_search.cv_results_)

#### LDA

In [13]:
parameters = {
    'solver': ('svd', 'lsqr', 'eigen'),
    'priors': [None, [0.5, 0.5]]
}


In [14]:
results = hyperparameter_tunning(LinearDiscriminantAnalysis(), parameters)
results

10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/juliatessler/Unicamp/mo810/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/juliatessler/Unicamp/mo810/.venv/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 605, in fit
    self._solve_eigen(
  File "/Users/juliatessler/Unicamp/mo810/.venv/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 445, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "/Users/juliatessler/Unicamp/mo810/.venv/lib/python3.10

              precision    recall  f1-score   support

           0       0.87      0.97      0.92     10267
           1       0.41      0.13      0.20      1653

    accuracy                           0.85     11920
   macro avg       0.64      0.55      0.56     11920
weighted avg       0.81      0.85      0.82     11920

[[9951  316]
 [1432  221]]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_priors,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.239561,0.089176,0.029666,0.003537,,svd,"{'priors': None, 'solver': 'svd'}",0.858471,0.865496,0.86405,0.863017,0.865289,0.863264,0.002559,2
1,1.127772,0.033275,0.015215,0.003153,,lsqr,"{'priors': None, 'solver': 'lsqr'}",0.858471,0.865496,0.86405,0.863223,0.865289,0.863306,0.002556,1
2,1.093837,0.038706,0.0,0.0,,eigen,"{'priors': None, 'solver': 'eigen'}",,,,,,,,5
3,1.951723,0.048593,0.027878,0.002877,"[0.5, 0.5]",svd,"{'priors': [0.5, 0.5], 'solver': 'svd'}",0.738843,0.744628,0.734504,0.745041,0.75,0.742603,0.005376,3
4,1.105224,0.082127,0.021021,0.003851,"[0.5, 0.5]",lsqr,"{'priors': [0.5, 0.5], 'solver': 'lsqr'}",0.648967,0.652479,0.645248,0.655372,0.657645,0.651942,0.004432,4
5,1.344138,0.098441,0.0,0.0,"[0.5, 0.5]",eigen,"{'priors': [0.5, 0.5], 'solver': 'eigen'}",,,,,,,,6


#### Naive Bayes

In [15]:
parameters = {
    'priors': [None, [0.5, 0.5]],
    'var_smoothing': (1e-9, 0.0001, 0.01, 0.1)
}

In [16]:
results = hyperparameter_tunning(GaussianNB(), parameters)
results

              precision    recall  f1-score   support

           0       0.86      1.00      0.93     10267
           1       0.00      0.00      0.00      1653

    accuracy                           0.86     11920
   macro avg       0.43      0.50      0.46     11920
weighted avg       0.74      0.86      0.80     11920

[[10267     0]
 [ 1653     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_priors,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.268699,0.006829,0.061644,0.003116,,0.0,"{'priors': None, 'var_smoothing': 1e-09}",0.661157,0.681405,0.714256,0.692769,0.727479,0.695413,0.023495,4
1,0.283205,0.023178,0.060164,0.004012,,0.0001,"{'priors': None, 'var_smoothing': 0.0001}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
2,0.280305,0.006787,0.059953,0.001846,,0.01,"{'priors': None, 'var_smoothing': 0.01}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
3,0.296254,0.020225,0.062011,0.003036,,0.1,"{'priors': None, 'var_smoothing': 0.1}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
4,0.284696,0.016258,0.062628,0.003909,"[0.5, 0.5]",0.0,"{'priors': [0.5, 0.5], 'var_smoothing': 1e-09}",0.415083,0.39814,0.464256,0.398967,0.490909,0.433471,0.03748,5
5,0.270614,0.001294,0.058855,0.000626,"[0.5, 0.5]",0.0001,"{'priors': [0.5, 0.5], 'var_smoothing': 0.0001}",0.407851,0.418182,0.446074,0.402686,0.450413,0.425041,0.019639,8
6,0.271003,0.001251,0.058751,0.000519,"[0.5, 0.5]",0.01,"{'priors': [0.5, 0.5], 'var_smoothing': 0.01}",0.407231,0.420661,0.45124,0.402686,0.456198,0.427603,0.022183,6
7,0.284911,0.011112,0.06087,0.003291,"[0.5, 0.5]",0.1,"{'priors': [0.5, 0.5], 'var_smoothing': 0.1}",0.406198,0.420455,0.45124,0.39876,0.456198,0.42657,0.02329,7


#### Decision Trees

In [17]:
parameters = {
    'criterion': ('gini', 'entropy', 'log_loss'),
    'splitter': ('best', 'random')
}

In [18]:
results = hyperparameter_tunning(DecisionTreeClassifier(), parameters)
results

              precision    recall  f1-score   support

           0       0.90      0.91      0.90     10267
           1       0.40      0.39      0.39      1653

    accuracy                           0.83     11920
   macro avg       0.65      0.65      0.65     11920
weighted avg       0.83      0.83      0.83     11920

[[9300  967]
 [1011  642]]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.665289,0.062644,0.029737,0.001173,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.841736,0.842975,0.853926,0.842355,0.84876,0.84595,0.004713,1
1,0.579131,0.045733,0.029335,0.000449,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.839463,0.844421,0.852273,0.83719,0.842562,0.843182,0.005183,5
2,0.53703,0.018744,0.028988,0.000775,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.841736,0.842975,0.853926,0.842355,0.84876,0.84595,0.004713,1
3,0.483211,0.034711,0.030493,0.001079,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.845248,0.842769,0.849793,0.83719,0.846901,0.84438,0.004259,4
4,0.536408,0.012133,0.02988,0.000944,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.841736,0.842975,0.853926,0.842355,0.84876,0.84595,0.004713,1
5,0.463374,0.032645,0.038012,0.016197,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.841116,0.843388,0.846694,0.836983,0.845248,0.842686,0.003409,6


#### Neural Network

In [21]:
parameters = {
    'activation': ('identity', 'logistic', 'tanh', 'relu'),
    #'solver': ('lbfgs', 'sgd', 'adam'),
    'alpha': (0.0001, 0.001, 0.1)
}

In [22]:
results = hyperparameter_tunning(MLPClassifier(), parameters)
results

              precision    recall  f1-score   support

           0       0.86      1.00      0.93     10267
           1       0.00      0.00      0.00      1653

    accuracy                           0.86     11920
   macro avg       0.43      0.50      0.46     11920
weighted avg       0.74      0.86      0.80     11920

[[10267     0]
 [ 1653     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,11.094479,2.016994,0.036526,0.001343,identity,0.0001,"{'activation': 'identity', 'alpha': 0.0001}",0.521281,0.816116,0.863017,0.760331,0.869628,0.766074,0.128516,12
1,11.894125,3.783246,0.036052,0.000832,identity,0.001,"{'activation': 'identity', 'alpha': 0.001}",0.868388,0.868388,0.868182,0.866942,0.820661,0.858512,0.018933,7
2,6.981953,0.962796,0.034455,0.000502,identity,0.1,"{'activation': 'identity', 'alpha': 0.1}",0.868388,0.868388,0.868182,0.797727,0.870248,0.854587,0.02844,8
3,8.594739,3.175214,0.046728,0.007395,logistic,0.0001,"{'activation': 'logistic', 'alpha': 0.0001}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
4,9.273893,2.862996,0.038771,0.001369,logistic,0.001,"{'activation': 'logistic', 'alpha': 0.001}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
5,5.96692,3.892068,0.038889,0.001755,logistic,0.1,"{'activation': 'logistic', 'alpha': 0.1}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
6,5.587911,1.389407,0.038379,0.001475,tanh,0.0001,"{'activation': 'tanh', 'alpha': 0.0001}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
7,6.46754,0.836967,0.037696,0.000692,tanh,0.001,"{'activation': 'tanh', 'alpha': 0.001}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
8,5.487893,1.285797,0.041764,0.004483,tanh,0.1,"{'activation': 'tanh', 'alpha': 0.1}",0.868388,0.868388,0.868182,0.868182,0.868182,0.868264,0.000101,1
9,9.421624,1.249159,0.032581,0.000788,relu,0.0001,"{'activation': 'relu', 'alpha': 0.0001}",0.868388,0.868388,0.867975,0.359298,0.870248,0.76686,0.203783,11
