# Eye Color Prediction Parte 3: Criação do Modelo

In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

## Setup Inicial

Nesta parte não usaremos dados em que os dados para a snp rs12913832 estejam faltando.

E aplicação do nosso modelo terá essa restrição. 

In [2]:
df = pd.read_csv('datasets/df_ml_clean.csv')
# Mudança
df = df[df['rs12913832'] != "missing"]

In [3]:
df.head(2)

Unnamed: 0,rs12913832,rs1800407,rs12896399,rs16891982,rs1393350,rs12203592,color_cat
2,GG,CC,GG,missing,GG,CC,Brown
3,GG,CC,TG,GG,GG,CC,BGG


In [4]:
df.shape

(1113, 7)

In [49]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [50]:
X_train.nunique()

rs12913832    3
rs1800407     4
rs12896399    4
rs16891982    4
rs1393350     3
rs12203592    4
dtype: int64

In [7]:
# encoding
ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train)

X_train_encoded = ohe.transform(X_train)
X_test_encoded = ohe.transform(X_test)

### Experimento 01: Regressão Logística

In [8]:
space01 = {"C":np.logspace(-4, 4, 50),
            "fit_intercept":[True,False]
            }

model01 = LogisticRegression(multi_class='multinomial', max_iter=1000 )
cv01 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search01 = GridSearchCV(model01, space01, cv=cv01, n_jobs=-1, verbose=10)
result01 = search01.fit(X_train_encoded, y_train)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits


In [9]:
print(f"Os melhores parâmetros encontrados foram: {result01.best_params_}")
best_model01 = result01.best_estimator_

y_pred = best_model01.predict(X_test_encoded)
y_pred_probs = best_model01.predict_proba(X_test_encoded)
y_pred_train = best_model01.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'C': 2.559547922699533, 'fit_intercept': True}
Roc auc score do modelo: 0.8944342824013797
Acurácia de teste: 79.82%.
Acurácia de treino: 81.57%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.87      0.88      0.87        97
         BGG       0.80      0.93      0.86        86
         Int       0.52      0.33      0.40        40

    accuracy                           0.80       223
   macro avg       0.73      0.71      0.71       223
weighted avg       0.78      0.80      0.78       223



### Experimento 02: Random Forest

In [10]:
n_estimators = [120, 300, 500, 800]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [5, 8, 15, 25, None]
min_samples_split = [1, 2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

space02 = {'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           } 

model02 = RandomForestClassifier(random_state=42)
cv02 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search02 = RandomizedSearchCV(model02, space02, cv=cv02, n_jobs=-1, n_iter=30, verbose=10)
result02 = search02.fit(X_train_encoded, y_train)

Fitting 15 folds for each of 30 candidates, totalling 450 fits


45 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\glets\.conda\envs\ds\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\glets\.conda\envs\ds\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "C:\Users\glets\.conda\envs\ds\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\glets\.conda\envs\ds\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "

In [11]:
print(f"Os melhores parâmetros encontrados foram: {result02.best_params_}")
best_model02 = result02.best_estimator_

y_pred = best_model02.predict(X_test_encoded)
y_pred_probs = best_model02.predict_proba(X_test_encoded)
y_pred_train = best_model02.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 8}
Roc auc score do modelo: 0.8885362431680575
Acurácia de teste: 77.58%.
Acurácia de treino: 84.38%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.86      0.91      0.88        97
         BGG       0.76      0.88      0.82        86
         Int       0.43      0.23      0.30        40

    accuracy                           0.78       223
   macro avg       0.68      0.67      0.67       223
weighted avg       0.75      0.78      0.75       223



### Experimento 03: XGBoosting

In [12]:
y_train_map = y_train.replace({"BGG":0, "Brown":1, "Int":2})
y_test_map = y_test.replace({"BGG":0, "Brown":1, "Int":2})


space03 = {'eta': [0.01, 0.015, 0.025, 0.05, 0.1],
           'gamma': [0.05,0.01,0.3,0.5,0.7,0.9,1.0],
           'max_depth': [3,5,7,9,12,15,17,25],
           'min_child_weight': [1,3,5,7],
           'subsample': [0.6,0.7,0.8,0.9,1.0],
           'colsample_bytree':[0.6,0.7,0.8,0.9,1.0],
           'lambda':[0.01,0.1,1.0],
           'alpha':[0,0.1,0.5,1.0],
           } 

model03 = xgb.XGBClassifier(objective="multi:softproba", num_class=3, random_state=42, verbosity=3)
cv03 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search03 = RandomizedSearchCV(model03, space03, cv=cv03, n_jobs=-1, n_iter=500, verbose=10)
result03 = search03.fit(X_train_encoded, y_train_map)

Fitting 15 folds for each of 500 candidates, totalling 7500 fits
[15:05:48] DEBUG: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/gbm/gbtree.cc:155: Using tree method: 2
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 8 extra nodes, 6 pruned nodes, max_depth=3
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 10 extra nodes, 4 pruned nodes, max_depth=4
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 14 pruned nodes, max_depth=4
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 8 extra nodes, 6 pruned nodes, max_depth=3
[15:05:48] INFO: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_prune.cc:101: tree pruning end, 12

In [13]:
print(f"Os melhores parâmetros encontrados foram: {result03.best_params_} \n")
best_model03 = result03.best_estimator_

y_pred = best_model03.predict(X_test_encoded)
y_pred_probs = best_model03.predict_proba(X_test_encoded)
y_pred_train = best_model03.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test_map, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test_map, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train_map):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test_map, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'subsample': 0.7, 'min_child_weight': 7, 'max_depth': 7, 'lambda': 1.0, 'gamma': 1.0, 'eta': 0.05, 'colsample_bytree': 1.0, 'alpha': 0} 

Roc auc score do modelo: 0.8948228691843373
Acurácia de teste: 78.03%.
Acurácia de treino: 81.69%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.87      0.89      0.88        97
         BGG       0.76      0.92      0.83        86
         Int       0.45      0.23      0.30        40

    accuracy                           0.78       223
   macro avg       0.69      0.68      0.67       223
weighted avg       0.75      0.78      0.76       223



## Salvando o Modelo

In [14]:
best_model = best_model01

In [18]:
pipe = Pipeline([
    ('ohe', ohe), 
    ('model', best_model)
    ])

In [19]:
filename = 'app_streamlit/best_model.sav'
pickle.dump(pipe, open(filename, 'wb'))

In [21]:
pipe.classes_

array(['BGG', 'Brown', 'Int'], dtype=object)