# Eye Color Prediction Parte 3: Criação do Modelo

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

## Setup Inicial

In [2]:
df = pd.read_csv('datasets/df_ml_clean.csv')

In [3]:
df.head(2)

Unnamed: 0,rs12913832,rs1800407,rs12896399,rs16891982,rs1393350,rs12203592,color_cat
0,missing,CC,missing,missing,missing,missing,Brown
1,missing,TC,missing,missing,missing,missing,BGG


In [4]:
df.shape

(1261, 7)

In [5]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [6]:
X_train.nunique()

rs12913832    4
rs1800407     4
rs12896399    4
rs16891982    4
rs1393350     4
rs12203592    4
dtype: int64

In [7]:
# encoding
ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train)

X_train_encoded = ohe.transform(X_train)
X_test_encoded = ohe.transform(X_test)

### Experimento 01: Regressão Logística

In [8]:
space01 = {"C":np.logspace(-4, 4, 50),
            "fit_intercept":[True,False]
            }

model01 = LogisticRegression(multi_class='multinomial', max_iter=1000 )
cv01 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search01 = GridSearchCV(model01, space01, cv=cv01, n_jobs=-1, verbose=10)
result01 = search01.fit(X_train_encoded, y_train)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits


In [9]:
print(f"Os melhores parâmetros encontrados foram: {result01.best_params_}")
best_model01 = result01.best_estimator_

y_pred = best_model01.predict(X_test_encoded)
y_pred_probs = best_model01.predict_proba(X_test_encoded)
y_pred_train = best_model01.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'C': 159.98587196060572, 'fit_intercept': True}
Roc auc score do modelo: 0.9056870803769871
Acurácia de teste: 76.68%.
Acurácia de treino: 76.79%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.85      0.85      0.85       128
         BGG       0.68      0.93      0.78        83
         Int       0.73      0.19      0.30        42

    accuracy                           0.77       253
   macro avg       0.75      0.66      0.65       253
weighted avg       0.77      0.77      0.74       253



### Experimento 02: Random Forest

In [10]:
n_estimators = [120, 300, 500, 800, 1200]
max_features = ['auto', 'sqrt','log2']
max_depth = [5, 8, 15, 25, 30, None]
min_samples_split = [1, 2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

space02 = {'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           } 

model02 = RandomForestClassifier(random_state=42)
cv02 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search02 = RandomizedSearchCV(model02, space02, cv=cv02, n_jobs=-1, n_iter=50, verbose=10)
result02 = search02.fit(X_train_encoded, y_train)

Fitting 15 folds for each of 50 candidates, totalling 750 fits


150 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Miniconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\ProgramData\Miniconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\ProgramData\Miniconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\ProgramData

In [11]:
print(f"Os melhores parâmetros encontrados foram: {result02.best_params_}")
best_model02 = result02.best_estimator_

y_pred = best_model02.predict(X_test_encoded)
y_pred_probs = best_model02.predict_proba(X_test_encoded)
y_pred_train = best_model02.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'n_estimators': 800, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 15}
Roc auc score do modelo: 0.8908652919909511
Acurácia de teste: 75.10%.
Acurácia de treino: 78.27%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.83      0.84      0.84       128
         BGG       0.66      0.93      0.77        83
         Int       0.71      0.12      0.20        42

    accuracy                           0.75       253
   macro avg       0.74      0.63      0.61       253
weighted avg       0.76      0.75      0.71       253



### Experimento 03: XGBoosting

In [12]:
y_train_map = y_train.replace({"BGG":0, "Brown":1, "Int":2})
y_test_map = y_test.replace({"BGG":0, "Brown":1, "Int":2})


space03 = {'eta': [0.01, 0.015, 0.025, 0.05, 0.1],
           'gamma': [0.05,0.01,0.3,0.5,0.7,0.9,1.0],
           'max_depth': [3,5,7,9,12,15,17,25],
           'min_child_weight': [1,3,5,7],
           'subsample': [0.6,0.7,0.8,0.9,1.0],
           'colsample_bytree':[0.6,0.7,0.8,0.9,1.0],
           'lambda':[0.01,0.1,1.0],
           'alpha':[0,0.1,0.5,1.0],
           } 

model03 = xgb.XGBClassifier(objective="multi:softproba", num_class=3, random_state=42, verbosity=1)
cv03 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
search03 = RandomizedSearchCV(model03, space03, cv=cv03, n_jobs=-1, n_iter=350, verbose=10)
result03 = search03.fit(X_train_encoded, y_train_map)

Fitting 15 folds for each of 350 candidates, totalling 5250 fits


In [13]:
print(f"Os melhores parâmetros encontrados foram: {result03.best_params_} \n")
best_model03 = result03.best_estimator_

y_pred = best_model03.predict(X_test_encoded)
y_pred_probs = best_model03.predict_proba(X_test_encoded)
y_pred_train = best_model03.predict(X_train_encoded)

print(f"Roc auc score do modelo: {roc_auc_score(y_test_map, y_pred_probs, average='weighted', multi_class='ovr') }")
print(f"Acurácia de teste: {accuracy_score(y_test_map, y_pred):.2%}.")
print(f"Acurácia de treino: { accuracy_score(y_pred_train, y_train_map):.2%}")
print(" \n Classification Report: \n ")
print(classification_report(y_test_map, y_pred, target_names=y.unique()))

Os melhores parâmetros encontrados foram: {'subsample': 0.6, 'min_child_weight': 1, 'max_depth': 3, 'lambda': 0.01, 'gamma': 0.05, 'eta': 0.05, 'colsample_bytree': 1.0, 'alpha': 0.1} 

Roc auc score do modelo: 0.9053484707072422
Acurácia de teste: 76.28%.
Acurácia de treino: 77.08%
 
 Classification Report: 
 
              precision    recall  f1-score   support

       Brown       0.85      0.84      0.84       128
         BGG       0.68      0.93      0.79        83
         Int       0.64      0.21      0.32        42

    accuracy                           0.76       253
   macro avg       0.72      0.66      0.65       253
weighted avg       0.76      0.76      0.74       253

