# Predictive models

## Classification

In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import balanced_accuracy_score


In [46]:
scorer = make_scorer(balanced_accuracy_score) # Teste outras

# Semente aleatória para reproducibilidade dos experimentos (reproducão dos experimentos)
seed = 20

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
gscv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

algorithms = {
    'GB': GridSearchCV(
    Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest()),
        ('undersampling', RandomUnderSampler(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=seed))]),
    param_grid={
        'selector__k': [5, 10, 15, 20, 25],
        'gb__n_estimators': [100, 150, 200, 250, 300],
        'gb__max_depth':[2, 4,5,6,10],
        'gb__learning_rate':[0.05,0.1],
    },
    scoring=scorer,
    cv=gscv),
    'RF': GridSearchCV(
    Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest()),
        ('undersampling', RandomUnderSampler(random_state=42)),
        ('rf', RandomForestClassifier(random_state=seed))]),
    param_grid={
        'selector__k': [5, 10, 15, 20, 25],
        'rf__n_estimators': [100, 150, 200, 250, 300],
        'rf__max_depth':[2,4,5,6,10],
    },
    scoring=scorer,
    cv=gscv),
    'svmlinear': GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('selector', SelectKBest()),
            ('undersampling', RandomUnderSampler(random_state=42)),
            ('svm', SVC(kernel='linear', random_state=seed))]),
        param_grid={
            'selector__k': [5, 10, 15, 20, 25],
            'svm__C': [1.0, 10.0,100.0],
        },
        scoring=scorer,
        cv=gscv),
    'svmrbf': GridSearchCV(
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('selector', SelectKBest()),
            ('undersampling', RandomUnderSampler(random_state=42)),
            ('svm', SVC(kernel='rbf', random_state=seed))]),
        param_grid={
            'selector__k': [5, 10, 15, 20, 25],
            'svm__C': [1.0, 10.0,100.0],
            'svm__gamma': ['scale', 'auto'],
        },
        scoring=scorer,
        cv=gscv),
}

In [47]:
# load dataset

import pandas as pd

data = pd.read_csv("DatasetConstruction/dataset.csv")

data = data.drop(columns="Unnamed: 0")

data

Unnamed: 0,ID_PACIENTE,ALT (TGP),Basófilos,Bilirrubina Direta,Bilirrubina Indireta,CHCM,CK,Calcio Ionizavel,Creatinina,DHL,...,RDW,Sódio,TP_INR,TTPA - Paciente_Normal,Uréia,VCM,Volume plaquetário médio,Ano nascimento,SEXO,INFEC
0,004688799FD293C3ABE0A07209FD8B75,16.0,30.0,0.20,0.35,32.1,43.0,1.21,1.95,489.0,...,14.6,143.0,0.83,0.91,86.0,94.9,10.0,1953.0,1,0.0
1,0047AF5116BC8AC8EFE6BBB98DA14DFA,24.0,20.0,0.17,0.25,32.2,,,1.49,,...,14.8,142.0,1.20,0.98,37.0,81.9,10.8,,1,0.0
2,009F0D6B3BA6C0E2D406585697D679EB,25.0,20.0,0.19,0.24,34.9,153.0,1.24,1.22,716.0,...,13.1,138.0,0.98,1.11,39.0,89.5,10.3,1965.0,1,0.0
3,00DCB2411CFD2F3C9FB4CD35C3AACCA1,22.0,20.0,0.08,0.12,34.5,93.0,1.18,1.78,721.0,...,13.2,133.0,0.98,1.13,155.0,103.6,9.9,1933.0,1,0.0
4,010AF0C3418C765CDA28B5957210A819,162.0,20.0,0.12,0.15,33.7,51.0,1.18,0.82,654.0,...,14.2,137.0,1.00,1.13,23.0,88.2,10.1,1955.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228,FEC09496E67CD8A4DE225245AF35799C,,10.0,,,34.6,,1.15,0.54,,...,12.5,137.0,1.00,0.90,59.0,90.3,11.3,1962.0,1,1.0
1229,FECC5CE1CFE3BCE881F29C2333527135,10.0,30.0,,,35.3,,,0.86,,...,12.9,138.0,,,25.0,92.5,10.4,1980.0,0,0.0
1230,FF19A1D8C1EB3A7A73541F3443B4FA00,15.0,20.0,0.17,0.18,32.9,138.0,1.22,0.75,440.0,...,14.8,139.0,1.30,1.05,33.0,95.7,10.8,1943.0,0,0.0
1231,FF4B2EED093AE641B9328FDB293C4116,35.0,10.0,0.16,0.25,34.2,,1.28,1.15,274.0,...,12.8,137.0,1.00,1.06,28.0,83.7,10.6,1969.0,1,0.0


In [48]:
# import X

X = data.drop(columns=["ID_PACIENTE", "INFEC"])

X

Unnamed: 0,ALT (TGP),Basófilos,Bilirrubina Direta,Bilirrubina Indireta,CHCM,CK,Calcio Ionizavel,Creatinina,DHL,"Dimeros D, quant",...,Proteína C-Reativa,RDW,Sódio,TP_INR,TTPA - Paciente_Normal,Uréia,VCM,Volume plaquetário médio,Ano nascimento,SEXO
0,16.0,30.0,0.20,0.35,32.1,43.0,1.21,1.95,489.0,748.0,...,5.13,14.6,143.0,0.83,0.91,86.0,94.9,10.0,1953.0,1
1,24.0,20.0,0.17,0.25,32.2,,,1.49,,,...,0.96,14.8,142.0,1.20,0.98,37.0,81.9,10.8,,1
2,25.0,20.0,0.19,0.24,34.9,153.0,1.24,1.22,716.0,271.0,...,11.04,13.1,138.0,0.98,1.11,39.0,89.5,10.3,1965.0,1
3,22.0,20.0,0.08,0.12,34.5,93.0,1.18,1.78,721.0,372.0,...,0.03,13.2,133.0,0.98,1.13,155.0,103.6,9.9,1933.0,1
4,162.0,20.0,0.12,0.15,33.7,51.0,1.18,0.82,654.0,537.0,...,6.59,14.2,137.0,1.00,1.13,23.0,88.2,10.1,1955.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228,,10.0,,,34.6,,1.15,0.54,,,...,0.17,12.5,137.0,1.00,0.90,59.0,90.3,11.3,1962.0,1
1229,10.0,30.0,,,35.3,,,0.86,,460.0,...,1.41,12.9,138.0,,,25.0,92.5,10.4,1980.0,0
1230,15.0,20.0,0.17,0.18,32.9,138.0,1.22,0.75,440.0,1352.0,...,0.25,14.8,139.0,1.30,1.05,33.0,95.7,10.8,1943.0,0
1231,35.0,10.0,0.16,0.25,34.2,,1.28,1.15,274.0,361.0,...,0.20,12.8,137.0,1.00,1.06,28.0,83.7,10.6,1969.0,1


In [49]:
X = X.to_numpy()

X

array([[1.600e+01, 3.000e+01, 2.000e-01, ..., 1.000e+01, 1.953e+03,
        1.000e+00],
       [2.400e+01, 2.000e+01, 1.700e-01, ..., 1.080e+01,       nan,
        1.000e+00],
       [2.500e+01, 2.000e+01, 1.900e-01, ..., 1.030e+01, 1.965e+03,
        1.000e+00],
       ...,
       [1.500e+01, 2.000e+01, 1.700e-01, ..., 1.080e+01, 1.943e+03,
        0.000e+00],
       [3.500e+01, 1.000e+01, 1.600e-01, ..., 1.060e+01, 1.969e+03,
        1.000e+00],
       [5.200e+01, 1.000e+01, 1.500e-01, ..., 1.210e+01, 2.004e+03,
        0.000e+00]])

In [50]:
# import y

y = data["INFEC"]

y = y.to_numpy()

y

array([0., 0., 0., ..., 0., 0., 0.])

In [51]:
from sklearn.model_selection import cross_val_score

result = {}
for alg, clf in algorithms.items():
  result[alg] = cross_val_score(clf, X, y, cv=cv)

result = pd.DataFrame.from_dict(result)

In [52]:
print(result)

         GB        RF  svmlinear    svmrbf
0  0.667400  0.611875   0.616548  0.627268
1  0.567070  0.668774   0.666025  0.542331
2  0.639912  0.623969   0.667400  0.559098
3  0.680199  0.708236   0.698890  0.769276
4  0.661507  0.577103   0.634930  0.620911
5  0.594895  0.584351   0.579634  0.599612
6  0.703108  0.743063   0.703108  0.660655
7  0.526637  0.639567   0.504162  0.587680
8  0.620699  0.644284   0.619589  0.665372
9  0.629023  0.557159   0.552442  0.673696


In [53]:
result.apply(lambda x: "{:.2f} ± {:.2f}".format(x.mean(), x.std()))

GB           0.63 ± 0.05
RF           0.64 ± 0.06
svmlinear    0.62 ± 0.06
svmrbf       0.63 ± 0.07
dtype: object

## Deploy de um modelo

In [54]:
classifier = algorithms['RF']
classifier.fit(X, y) # Uso a base toda
print(classifier.best_estimator_)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('selector', SelectKBest(k=20)),
                ('undersampling', RandomUnderSampler(random_state=42)),
                ('rf',
                 RandomForestClassifier(max_depth=5, n_estimators=200,
                                        random_state=20))])
