# Teste de classificadores comuns

In [1]:
import numpy as np
import pandas as pd

In [1]:
# Declare the list of upstream tasks whose products this notebook uses as inputs.
# NOTE(review): a later cell indexes `upstream` by task name and then by product
# key, which a plain list does not support -- presumably this cell is replaced
# with a mapping by the pipeline runner at execution time; confirm.
upstream = ['preparar_classificacao']
# No output product is declared here.
product = None

In [None]:
# Read the feature table and the label table from the products of the
# 'preparar_classificacao' upstream task (features first, then labels).
df_X, df_y = (
    pd.read_parquet(upstream['preparar_classificacao'][product_key])
    for product_key in ('data_X', 'data_y')
)
# Keep only the first column of the label table as the target.
df_y = df_y.iloc[:, 0]

## Teste com modelos de classificação

In [None]:
from numpy import mean
from numpy import std
from sklearn.metrics import make_scorer
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression, RidgeClassifier, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# Feature table handed to every cross-validation run (an alias of df_X, not a copy).
df_X_input = df_X

# Shuffled 5-fold splitter with a fixed seed, shared by all the scoring helpers below.
kfold_data = KFold(
    n_splits=5,
    shuffle=True,
    random_state=31,
)

def cutoff_youdens_j(fpr, tpr, thresholds):
    """Return the threshold that maximizes Youden's J statistic (TPR - FPR).

    Args:
        fpr: False-positive rates, one per candidate threshold.
        tpr: True-positive rates, aligned with `fpr`.
        thresholds: Candidate thresholds, aligned with `fpr` and `tpr`.
            Arrays or plain sequences are accepted.

    Returns:
        The element of `thresholds` with the highest J. When several
        thresholds share the highest J, the largest of them is returned.

    Raises:
        ValueError: If no thresholds are supplied.
    """
    # Coerce so that plain lists/tuples work as well as NumPy arrays.
    j_scores = np.asarray(tpr, dtype=float) - np.asarray(fpr, dtype=float)
    candidates = np.asarray(thresholds)
    if candidates.size == 0:
        raise ValueError("cutoff_youdens_j needs at least one threshold")
    # Lexicographic max over (J, threshold) pairs: highest J first, then the
    # highest threshold among ties. A single pass replaces a full sort.
    _, best_threshold = max(zip(j_scores, candidates))
    return best_threshold
    
def scorer(y_test, y_pred, *, cutoff_margin=0.05):
    """Return the F1 score of `y_pred` binarized at a Youden's-J-based cutoff.

    The ROC curve of the continuous scores is computed against `y_test`, the
    threshold maximizing TPR - FPR is obtained from `cutoff_youdens_j`, and a
    sample is labeled positive (1.0) when its score is at least that threshold
    plus `cutoff_margin`; otherwise it is labeled 0.0.

    NOTE(review): the cutoff is chosen on the same labels the F1 score is then
    computed on, so the value is optimistic relative to a cutoff picked on
    separate data.

    Args:
        y_test: True labels. Assumes a binary 0/1 encoding, since predictions
            are emitted as 0.0/1.0 -- TODO confirm against the label column.
        y_pred: Continuous scores for each sample (one value per sample).
        cutoff_margin: Offset added to the Youden cutoff before binarizing.
            Defaults to 0.05.

    Returns:
        The value of ``sklearn.metrics.f1_score`` for the binarized predictions.
    """
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    cutoff = cutoff_youdens_j(fpr, tpr, thresholds) + cutoff_margin
    # Vectorized thresholding; floats keep the 1.0 / 0.0 label encoding.
    y_label = (np.asarray(y_pred) >= cutoff).astype(float)
    return metrics.f1_score(y_test, y_label)

def calculate_score_threshold(model, model_name):
    """Cross-validate `model` with the threshold-tuned F1 scorer and print a summary.

    The model is evaluated on `df_X_input` / `df_y` using the shared
    `kfold_data` splitter. Scores are produced by `scorer`, which receives the
    model's continuous output: `decision_function` is preferred and
    `predict_proba` is the fallback, in that order of preference.

    Args:
        model: Unfitted scikit-learn estimator exposing `decision_function`
            or `predict_proba`.
        model_name: Label printed in front of the summary.

    Returns:
        The mean of the per-fold scores, matching what `calculate_score`
        returns.
    """
    threshold_f1 = make_scorer(
        scorer,
        response_method=["decision_function", "predict_proba"],
    )
    scores = cross_val_score(model, df_X_input, df_y, cv=kfold_data, scoring=threshold_f1)
    mean_score = mean(scores)
    print(model_name + '\nf1: %.3f ,\nStandard Deviations :%.3f\n\n' % (mean_score, std(scores)))
    return mean_score
    
def calculate_score(model, model_name):
    """Cross-validate `model` with the built-in 'f1' scorer and print a summary.

    The model is evaluated on `df_X_input` / `df_y` using the shared
    `kfold_data` splitter.

    Args:
        model: Unfitted scikit-learn estimator.
        model_name: Label printed in front of the summary.

    Returns:
        The mean of the per-fold F1 scores.
    """
    fold_scores = cross_val_score(
        model,
        df_X_input,
        df_y,
        scoring='f1',
        cv=kfold_data,
    )
    average = mean(fold_scores)
    spread = std(fold_scores)
    report = '\nf1: %.3f ,\nStandard Deviations :%.3f\n\n' % (average, spread)
    print(model_name + report)
    return average

# Evaluate every candidate in a fixed order. Ridge, logistic regression and SVC
# go through the threshold-tuned scorer; the decision tree and the random
# forest use the plain 'f1' scorer.
# NOTE(review): StandardScaler and make_pipeline are imported but unused; if the
# features are not already standardized upstream, the linear models and the SVC
# may benefit from a scaling pipeline -- confirm.
for evaluate, estimator, label in (
    (calculate_score_threshold, RidgeClassifier(), "linear ridge"),
    (calculate_score_threshold, LogisticRegression(), "logistic"),
    (calculate_score, DecisionTreeClassifier(random_state=13, max_depth=10), 'decision_tree'),
    (calculate_score, RandomForestClassifier(random_state=15, n_estimators=50), 'random forest'),
    (calculate_score_threshold, SVC(), "SVC"),
):
    evaluate(estimator, label)