In [None]:
!pip install adversarial-robustness-toolbox 

# Preparación

## Libraries. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import time

import plotly.graph_objects as go
from plotly.subplots import make_subplots

Datasets used.

In [None]:
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.datasets import load_breast_cancer
from art.utils import load_cifar10
from art.utils import load_mnist
from art.utils import load_nursery
from art.utils import load_nursery

Scikit-learn's utils used.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import OrdinalEncoder

#Estadisticas
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

Adversarial Robustness Toolbox

In [None]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox
from art.attacks.inference.membership_inference import LabelOnlyDecisionBoundary

from art.attacks.inference.attribute_inference import AttributeInferenceBaseline
from art.attacks.inference.attribute_inference import AttributeInferenceBlackBox
from art.attacks.inference.attribute_inference import AttributeInferenceMembership

Models.

In [None]:
#General
from art.estimators.classification.scikitlearn import ScikitlearnClassifier

#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier

#ExtraTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from  art.estimators.classification.scikitlearn import ScikitlearnExtraTreeClassifier

#Adaboost
from sklearn.ensemble import AdaBoostClassifier
from art.estimators.classification.scikitlearn import ScikitlearnAdaBoostClassifier

#BaggingClassifier
from sklearn.ensemble import BaggingClassifier
from art.estimators.classification.scikitlearn import ScikitlearnBaggingClassifier

#ExtraTreesClassifier Ensemble
from sklearn.ensemble import ExtraTreesClassifier
from art.estimators.classification.scikitlearn import ScikitlearnExtraTreesClassifier

#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from art.estimators.classification.scikitlearn import ScikitlearnGradientBoostingClassifier

#RandomForest
from sklearn.ensemble import RandomForestClassifier
from art.estimators.classification.scikitlearn import ScikitlearnRandomForestClassifier

#Logistic Regression
from sklearn.linear_model import LogisticRegression
from art.estimators.classification.scikitlearn import ScikitlearnLogisticRegression

#SVC Classifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from art.estimators.classification.scikitlearn import ScikitlearnSVC

# Gaussian
from art.estimators.classification.scikitlearn import ScikitlearnGaussianNB

# Metodología

## Load Datasets

### Dataset Nursery

In [None]:
# Nursery dataset (UCI): eight categorical attributes followed by the class label.
train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data"

features = [
    "parents", "has_nurs", "form", "children",
    "housing", "finance", "social", "health",
    "label",
]

# Column names are supplied explicitly via `names`. The regex separator swallows
# whitespace around commas, which is why the python parsing engine is requested.
nursery_df = pd.read_csv(
    train_url,
    names=features,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)

In [None]:
# Remove, in place, every row labelled "recommend"; the label encoder defined in
# this notebook only lists the other four classes.
nursery_df.drop(
    index=nursery_df.index[nursery_df["label"] == "recommend"],
    inplace=True,
)

In [None]:
# Explicit category order for each nursery attribute; OrdinalEncoder maps each
# value to its position in the corresponding list.
categories_parents = ['usual', 'pretentious', 'great_pret']
categories_has_nurs = ['proper', 'less_proper', 'improper', 'critical', 'very_crit']
categories_form = ['complete', 'completed', 'incomplete', 'foster']
categories_children = ['1', '2', '3', 'more']
categories_housing = ['convenient', 'less_conv', 'critical']
categories_finance = ['convenient', 'inconv']
categories_social = ['nonprob', 'slightly_prob', 'problematic']
categories_health = ['recommended', 'priority', 'not_recom']
categories_label = ['not_recom', 'very_recom', 'priority', 'spec_prior']

# One category list per feature column, in the same order the columns are encoded.
encoderX = OrdinalEncoder(
    categories=[
        categories_parents,
        categories_has_nurs,
        categories_form,
        categories_children,
        categories_housing,
        categories_finance,
        categories_social,
        categories_health,
    ]
)

# Separate encoder for the single label column.
encoderY = OrdinalEncoder(categories=[categories_label])

In [None]:
# Encode the eight attribute columns and the label column as ordinal codes.
X = encoderX.fit_transform(
    nursery_df[[
        'parents', 'has_nurs', 'form', 'children',
        'housing', 'finance', 'social', 'health',
    ]]
)
y = encoderY.fit_transform(nursery_df[['label']])

In [None]:
# Stratified 80/20 split with a fixed seed.
x_train_nursery, x_test_nursery, y_train_nursery, y_test_nursery = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Cast the encoder output to integer codes; labels are additionally flattened to 1-D.
x_train_nursery, x_test_nursery = (
    parte.astype(int) for parte in (x_train_nursery, x_test_nursery)
)
y_train_nursery = y_train_nursery.astype(int).ravel()
y_test_nursery = y_test_nursery.astype(int).ravel()

In [None]:
# Full nursery set (train rows followed by test rows) used by the K-fold experiments.
X_nursery = np.concatenate([x_train_nursery, x_test_nursery], axis=0)
Y_nursery = np.concatenate([y_train_nursery, y_test_nursery], axis=0)

### Dataset Mnist

In [None]:
(x_train_mnist, y_train_mnist), (x_test_mnist, y_test_mnist), min_mnist, max_mnist = load_mnist()

# Flatten every sample into a single feature vector and keep at most the first
# 10000 samples of each split.
x_train_mnist = x_train_mnist.reshape(len(x_train_mnist), -1)[:10000]
x_test_mnist = x_test_mnist.reshape(len(x_test_mnist), -1)[:10000]

# Labels arrive as per-class score rows; argmax turns them into class indices.
y_train_mnist = y_train_mnist.argmax(axis=1)[:10000]
y_test_mnist = y_test_mnist.argmax(axis=1)[:10000]

# Full MNIST subset (train rows followed by test rows) used by the K-fold experiments.
X_mnist = np.concatenate([x_train_mnist, x_test_mnist], axis=0)
Y_mnist = np.concatenate([y_train_mnist, y_test_mnist], axis=0)

### Dataset Breast Cancer

In [None]:
# Load features and labels as arrays, then make a stratified split with a fixed seed
# (test_size is left at the train_test_split default).
X_BreastCancer, Y_BreastCancer = load_breast_cancer(return_X_y=True)
x_train_BreastCancer, x_test_BreastCancer, y_train_BreastCancer, y_test_BreastCancer = train_test_split(
    X_BreastCancer,
    Y_BreastCancer,
    random_state=0,
    stratify=Y_BreastCancer,
)

In [None]:
# Rebuild the full set as train rows followed by test rows (this rebinds the
# X_BreastCancer / Y_BreastCancer names loaded above).
X_BreastCancer = np.concatenate([x_train_BreastCancer, x_test_BreastCancer], axis=0)
Y_BreastCancer = np.concatenate([y_train_BreastCancer, y_test_BreastCancer], axis=0)

### Dataset Adult

In [None]:
# Column names for the Adult files, which are read without a header row.
features = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
    "Martial Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Target",
]

adult_train = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=features,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)
# The first line of adult.test is skipped.
adult_test = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    names=features,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
    skiprows=1,
)

# Remember where the training rows end so the original split can be restored
# after the two files are preprocessed together.
train_len = len(adult_train)

adultdf = pd.concat([adult_train, adult_test])

In [None]:
# Binary target: 0 for "<=50K", 1 for ">50K". Some rows carry a trailing "."
# on the label, so it is stripped before mapping.
# An explicit map + integer cast is used instead of chained `replace` calls,
# which depend on pandas silently downcasting an object column to integers.
labels = adultdf['Target'].str.rstrip('.').map({'<=50K': 0, '>50K': 1})
if labels.isna().any():
  raise ValueError("Unexpected or missing values found in the 'Target' column")
labels = labels.astype(int)

In [None]:
# Remove the label and the textual "Education" column ("Education-Num" is kept).
# Rebinding with errors="ignore" leaves adultdf in the same state as the former
# in-place `del` statements while letting this cell be re-run.
adultdf = adultdf.drop(columns=["Education", "Target"], errors="ignore")

# One-hot encode the remaining categorical columns.
binary_data = pd.get_dummies(adultdf)
# Keep every encoded column. "Target" is removed before encoding, so there are no
# label columns at the end of the frame; the former `columns[:-2]` slice was
# discarding the last two one-hot feature columns.
feature_cols = binary_data
scaler = preprocessing.StandardScaler()
data = pd.DataFrame(scaler.fit_transform(feature_cols), columns=feature_cols.columns)

# Restore the original train/test partition by position.
x_train_adult = data[:train_len].to_numpy()
y_train_adult = labels[:train_len].to_numpy()
x_test_adult = data[train_len:].to_numpy()
y_test_adult = labels[train_len:].to_numpy()

In [None]:
# Full Adult set (train rows followed by test rows) used by the K-fold experiments.
X_adult = np.concatenate([x_train_adult, x_test_adult], axis=0)
Y_adult = np.concatenate([y_train_adult, y_test_adult], axis=0)

### Car Evaluation

In [None]:
# Car Evaluation dataset (UCI): six categorical attributes followed by the class label.
train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"

features = [
    "buying", "maint", "doors", "persons", "lug_boot", "safety",
    "label",
]

# Column names are supplied explicitly via `names`; the regex separator needs the
# python parsing engine.
car_df = pd.read_csv(
    train_url,
    names=features,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)

In [None]:
# Explicit category order for each car attribute; OrdinalEncoder maps each value
# to its position in the corresponding list.
categories_buying = ['low', 'med', 'high', 'vhigh']
categories_maint = ['low', 'med', 'high', 'vhigh']
categories_doors = ['2', '3', '4', '5more']
categories_persons = ['2', '4', 'more']
categories_lugboot = ['small', 'med', 'big']
categories_safety = ['low', 'med', 'high']
# NOTE: rebinds `categories_label`, `encoderX` and `encoderY`, which the nursery
# section also assigns; cells must be run top to bottom.
categories_label = ['unacc', 'acc', 'good', 'vgood']

encoderX = OrdinalEncoder(
    categories=[
        categories_buying,
        categories_maint,
        categories_doors,
        categories_persons,
        categories_lugboot,
        categories_safety,
    ]
)

encoderY = OrdinalEncoder(categories=[categories_label])

In [None]:
# Encode the six attribute columns and the label column as ordinal codes.
X = encoderX.fit_transform(
    car_df[[
        'buying', 'maint', 'doors',
        'persons', 'lug_boot', 'safety',
    ]]
)
y = encoderY.fit_transform(car_df[['label']])

In [None]:
# NOTE(review): "prueba" (Spanish for "test") repeats the exact fit_transform call
# that produced X in the previous cell and is not referenced anywhere else in the
# visible notebook; it looks like leftover scratch work and is a candidate for removal.
prueba = encoderX.fit_transform(car_df[['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']])

In [None]:
# Stratified 80/20 split with a fixed seed.
x_train_car, x_test_car, y_train_car, y_test_car = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Cast the encoder output to integer codes; labels are additionally flattened to 1-D.
x_train_car, x_test_car = (
    parte.astype(int) for parte in (x_train_car, x_test_car)
)
y_train_car = y_train_car.astype(int).ravel()
y_test_car = y_test_car.astype(int).ravel()

In [None]:
# Full car set (train rows followed by test rows) used by the K-fold experiments.
X_car = np.concatenate([x_train_car, x_test_car], axis=0)
Y_car = np.concatenate([y_train_car, y_test_car], axis=0)

### Titanic

In [None]:
# Titanic CSVs are read from paths relative to the notebook's working directory.
titanic_train = pd.read_csv('datasets/titanic/train.csv')
titanic_test = pd.read_csv('datasets/titanic/test.csv')
# NOTE(review): the test labels are taken from gender_submission.csv, which is
# presumably Kaggle's sample submission rather than ground truth — confirm that
# these labels are appropriate for evaluation.
titanic_test_label = pd.read_csv('datasets/titanic/gender_submission.csv')

# Attach the labels to the test rows, then stack train and test into one frame.
titanic_test = pd.merge(titanic_test, titanic_test_label, on="PassengerId")
titanic = pd.concat([titanic_train, titanic_test])

In [None]:
# Drop the columns that are not used as features.
titanic = titanic.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

# Impute missing Age/Fare with the column mean, then discard any row that still
# has a missing value in another column.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].mean())
titanic = titanic.dropna()

# Encode Sex as 0 (male) / 1 (female). An explicit map yields a numeric column
# directly instead of relying on `replace` downcasting an object column, so the
# later one-hot encoding step keeps Sex as a single numeric feature.
titanic = titanic.assign(Sex=titanic['Sex'].map({'male': 0, 'female': 1}))

In [None]:
# Split off the target: pop returns the 'Survived' column and removes it from the frame.
titanic_label = titanic.pop('Survived')

In [None]:
# One-hot encode the remaining non-numeric columns.
titanic = pd.get_dummies(titanic) 
# NOTE(review): this scaler is instantiated but never applied in this cell, so
# data_titanic holds the unscaled values (the Adult section does scale its
# features) — confirm whether standardisation was intended here.
scaler = preprocessing.StandardScaler()
data_titanic = pd.DataFrame( titanic, columns=titanic.columns)

In [None]:
# Stratified 80/20 split with a fixed seed.
x_train_titanic, x_test_titanic, y_train_titanic, y_test_titanic = train_test_split(
    data_titanic,
    titanic_label,
    test_size=0.2,
    random_state=0,
    stratify=titanic_label,
)

In [None]:
# Convert to NumPy. The feature frame mixes numeric columns with one-hot columns,
# which may be boolean depending on the pandas version; requesting float
# explicitly guarantees a numeric matrix instead of an object array.
# np.asarray also accepts arrays, so re-running this cell is harmless.
x_train_titanic = np.asarray(x_train_titanic, dtype=float)
x_test_titanic = np.asarray(x_test_titanic, dtype=float)
y_train_titanic = np.asarray(y_train_titanic)
y_test_titanic = np.asarray(y_test_titanic)

In [None]:
# Full Titanic set (train rows followed by test rows) used by the K-fold experiments.
X_titanic = np.concatenate([x_train_titanic, x_test_titanic], axis=0)
Y_titanic = np.concatenate([y_train_titanic, y_test_titanic], axis=0)

## Auxiliary functions


Wraps a scikit-learn estimator in the matching ART (Adversarial Robustness Toolbox) classifier wrapper.

In [None]:
def objART(sklearnObject):
  """Wrap a scikit-learn estimator in the ART wrapper registered for its exact type.

  The lookup uses ``type(sklearnObject)``, so subclasses of the listed estimators
  are not matched and fall back to the generic ``ScikitlearnClassifier``.

  Args:
    sklearnObject: the scikit-learn estimator to wrap.

  Returns:
    The ART classifier object built around ``sklearnObject``.
  """
  envoltorios = {
    DecisionTreeClassifier: ScikitlearnDecisionTreeClassifier,
    ExtraTreeClassifier: ScikitlearnExtraTreeClassifier,
    AdaBoostClassifier: ScikitlearnAdaBoostClassifier,
    BaggingClassifier: ScikitlearnBaggingClassifier,
    ExtraTreesClassifier: ScikitlearnExtraTreesClassifier,
    GradientBoostingClassifier: ScikitlearnGradientBoostingClassifier,
    RandomForestClassifier: ScikitlearnRandomForestClassifier,
    LogisticRegression: ScikitlearnLogisticRegression,
    # Both SVM flavours share the same ART wrapper.
    SVC: ScikitlearnSVC,
    LinearSVC: ScikitlearnSVC,
  }
  envoltorio = envoltorios.get(type(sklearnObject), ScikitlearnClassifier)
  return envoltorio(sklearnObject)

## Calcular estadísticas

In [None]:
def calcularDatasetAtacante(train, test, ratio=0.8):
  """Return how many train and test samples make up a ``ratio`` share of each set.

  Args:
    train: sized collection of training samples.
    test: sized collection of test samples.
    ratio: fraction of each collection to count (truncated towards zero).

  Returns:
    Tuple ``(train_size, test_size)`` of integers.
  """
  return tuple(int(len(parte) * ratio) for parte in (train, test))

In [None]:
def obtenerEstadisticas(inferred_train_bb, inferred_test_bb):
  """Compute the overall accuracy of a membership-inference attack.

  Args:
    inferred_train_bb: attack predictions for samples that were training members
      (ground truth 1).
    inferred_test_bb: attack predictions for samples that were not training
      members (ground truth 0).

  Returns:
    Accuracy of the predictions over both groups combined.
  """
  # Ground truth: every "train" sample is a member (1), every "test" sample is not (0).
  real = np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))
  inferred = np.concatenate((inferred_train_bb, inferred_test_bb))

  # Only the combined accuracy is reported; the per-group accuracies that used to
  # be computed here were never returned or used.
  return accuracy_score(real, inferred)
 

In [None]:
def entrenarModeloAtacante(modeloAtaque, x_train, y_train, x_test, y_test, ratio=0.5):
  """Fit a membership-inference attack and return its accuracy on held-out samples.

  The attack is fitted on the first ``ratio`` share of the train (member) and
  test (non-member) sets and evaluated on an equal number of members and
  non-members taken from the end of each set.

  Args:
    modeloAtaque: attack object exposing ``fit(x, y, test_x, test_y)`` and ``infer(x, y)``.
    x_train, y_train: samples the target model was trained on (members).
    x_test, y_test: samples the target model did not see (non-members).
    ratio: share of each set used to fit the attack.

  Returns:
    Attack accuracy as computed by ``obtenerEstadisticas``.

  Raises:
    ValueError: if no samples remain for evaluation once the attack's
      training portion is set aside.
  """
  attack_train_size, attack_test_size = calcularDatasetAtacante(x_train, x_test, ratio=ratio)

  # Balanced evaluation set taken from the tail of each split, capped so it never
  # overlaps the head portions used to fit the attack. With the default ratio this
  # equals attack_test_size; the cap only matters for ratio > 0.5.
  eval_size = min(attack_test_size,
                  len(x_train) - attack_train_size,
                  len(x_test) - attack_test_size)
  if eval_size <= 0:
    # A plain x[-0:] slice would silently select the whole array, including the
    # samples the attack is fitted on.
    raise ValueError("Not enough samples left to evaluate the attack; "
                     "use more data or a smaller ratio")

  modeloAtaque.fit(x_train[:attack_train_size], y_train[:attack_train_size],
                   x_test[:attack_test_size], y_test[:attack_test_size])

  train_start = len(x_train) - eval_size
  test_start = len(x_test) - eval_size
  inferred_train_bb = modeloAtaque.infer(x_train[train_start:], y_train[train_start:])
  inferred_test_bb = modeloAtaque.infer(x_test[test_start:], y_test[test_start:])

  return obtenerEstadisticas(inferred_train_bb, inferred_test_bb)


# Experiments

## Membership Inference: Shadow Training

In [None]:
def cargarModelos():
  """Return a fresh list with one unfitted instance of each target model type.

  Most estimators use their default settings; LogisticRegression and LinearSVC
  are given ``max_iter=2000``.
  """
  return [
    ExtraTreesClassifier(),
    RandomForestClassifier(),
    BaggingClassifier(),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(max_iter=2000),
    SVC(),
    LinearSVC(max_iter=2000),
  ]


In [None]:
def experimento1(modelos, X, Y, Dataset, tipoModeloAtacante="rf", repeticiones=1):
  """Measure target-model accuracy and membership-inference accuracy per model.

  Every estimator in ``modelos`` is fitted on each fold of a 5-fold split of
  ``(X, Y)``, wrapped with ``objART`` and attacked with a
  ``MembershipInferenceBlackBox`` attack that is trained and scored by
  ``entrenarModeloAtacante``.

  Args:
    modelos: iterable of scikit-learn estimators (they are fitted in place).
    X, Y: feature and label arrays, indexable by arrays of row indices.
    Dataset: name stored in the "Dataset" column.
    tipoModeloAtacante: ``attack_model_type`` handed to the attack.
    repeticiones: how many times the whole 5-fold loop is repeated.

  Returns:
    DataFrame with columns Dataset, Modelo, Hiperparametro (left empty here),
    Metric and Score; three rows per model ("Train Acc", "Test Acc", "MI")
    holding averages over all folds and repetitions.
  """
  columnas = ["Dataset", "Modelo", 'Hiperparametro', 'Metric', 'Score']
  kf = KFold(n_splits=5)

  resultados = []
  for modelo in modelos:
    tmp_score_train = []
    tmp_score_test = []
    tmp_accuracy = []
    for _ in range(repeticiones):
      for train_index, test_index in kf.split(X):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        modelo.fit(x_train, y_train)
        modeloAtaque = MembershipInferenceBlackBox(objART(modelo), attack_model_type=tipoModeloAtacante)
        acc = entrenarModeloAtacante(modeloAtaque, x_train, y_train, x_test, y_test)

        tmp_score_train.append(modelo.score(x_train, y_train))
        tmp_score_test.append(modelo.score(x_test, y_test))
        tmp_accuracy.append(acc)

    resultados.append(pd.DataFrame({
      'Dataset': [Dataset] * 3,
      'Modelo': [type(modelo).__name__] * 3,
      'Metric': ["Train Acc", "Test Acc", "MI"],
      'Score': [np.average(tmp_score_train), np.average(tmp_score_test), np.average(tmp_accuracy)],
    }))

  if not resultados:
    return pd.DataFrame(columns=columnas)
  # Concatenate once at the end (rather than growing a frame inside the loop) and
  # restore the full column layout, including the empty 'Hiperparametro' column.
  return pd.concat(resultados).reindex(columns=columnas)

In [None]:
def experimentoCompararModelos(modelos):
  """Run ``experimento1`` with the same models on every dataset and stack the results.

  Args:
    modelos: estimators forwarded unchanged to each ``experimento1`` call.

  Returns:
    A single DataFrame with the per-dataset results concatenated in the order
    Car, Nursery, Titanic, Breast Cancer, Adult, MNIST.
  """
  conjuntos = (
    ("Car", X_car, Y_car),
    ("Nursery", X_nursery, Y_nursery),
    ("Titanic", X_titanic, Y_titanic),
    ("Breast Cancer", X_BreastCancer, Y_BreastCancer),
    ("Adult", X_adult, Y_adult),
    ("MNIST", X_mnist, Y_mnist),
  )
  return pd.concat([experimento1(modelos, X, Y, nombre) for nombre, X, Y in conjuntos])


In [None]:
def visualiceResults(df):
  """Draw a 2x3 grid of diverging bar charts, one panel per dataset.

  Args:
    df: long-format results with 'Dataset', 'Modelo', 'Metric' and 'Score' columns.

  Returns:
    The plotly figure. In each panel the MI score is drawn towards the left
    (negated) and the train/test accuracies towards the right.
  """
  tabla = pd.pivot_table(df, index=['Dataset', "Modelo"], columns='Metric',
                         values='Score', aggfunc='first')
  tabla = tabla.sort_values(by='MI', ascending=True)
  # Negate MI so its bars extend to the left of zero.
  tabla['MI'] = tabla['MI'] * -1

  datasets = ("Adult", "Breast Cancer", "Car", "MNIST", "Nursery", "Titanic")
  Diverging = make_subplots(rows=2, cols=3, shared_yaxes='all', shared_xaxes='all',
                            vertical_spacing=0.1, subplot_titles=datasets)

  # Tick labels show magnitudes on both sides of zero.
  Diverging.update_xaxes(tickvals=[-1, -0.5, 0, 0.5, 1], ticktext=[1, 0.5, 0, 0.5, 1])

  barras = (
    ('MI', 'rgb(188,128,189)'),
    ('Train Acc', 'rgba(251,128,114,0.9)'),
    ('Test Acc', 'rgb(128,177,211)'),
  )
  for posicion, nombre in enumerate(datasets):
    fila, columna = divmod(posicion, 3)
    panel = tabla.loc[nombre]
    # Legend entries are emitted once, from the first panel only.
    leyend = nombre == 'Adult'
    for metrica, color in barras:
      Diverging.add_trace(
        go.Bar(x=panel[metrica], y=panel.index, orientation='h', name=metrica,
               marker_color=color, showlegend=leyend),
        row=fila + 1, col=columna + 1)

  Diverging.update_layout(barmode='overlay', height=700, width=1300,
                          legend=dict(font=dict(size=15)),
                          bargap=0.3, legend_orientation='h', legend_x=0.36, legend_y=1.13)
  return Diverging

---

In [None]:
# Build the list of target models and run the membership-inference comparison
# on every dataset.
modelos = cargarModelos()
df_results = experimentoCompararModelos(modelos)

In [None]:
# The returned figure is the cell's last expression, so it is rendered as output.
visualiceResults(df_results)

## Hyperparameter Selection


In [None]:
def experimentoHiperparametrosEjecucion(modelos, hiperparametros, X, Y, Dataset, tipoModeloAtacante="rf", repeticiones=1):
  """Measure accuracy and membership-inference accuracy per labelled model variant.

  Each estimator in ``modelos`` is paired by position with a label in
  ``hiperparametros``. It is fitted on each fold of a 5-fold split of ``(X, Y)``,
  wrapped with ``objART`` and attacked with a ``MembershipInferenceBlackBox``
  attack that is trained and scored by ``entrenarModeloAtacante``.

  Args:
    modelos: scikit-learn estimators (they are fitted in place).
    hiperparametros: one descriptive label per estimator, in the same order.
    X, Y: feature and label arrays, indexable by arrays of row indices.
    Dataset: name stored in the "Dataset" column.
    tipoModeloAtacante: ``attack_model_type`` handed to the attack.
    repeticiones: how many times the whole 5-fold loop is repeated.

  Returns:
    DataFrame with columns Dataset, Modelo, Hiperparametro, Metric and Score;
    three rows per variant ("Train Acc", "Test Acc", "MI") holding averages
    over all folds and repetitions.

  Raises:
    ValueError: if ``modelos`` and ``hiperparametros`` differ in length.
  """
  modelos = list(modelos)
  hiperparametros = list(hiperparametros)
  if len(modelos) != len(hiperparametros):
    # zip() would silently drop the unmatched tail and mislabel nothing visibly.
    raise ValueError("modelos and hiperparametros must have the same length "
                     f"({len(modelos)} != {len(hiperparametros)})")

  columnas = ["Dataset", "Modelo", 'Hiperparametro', 'Metric', 'Score']
  kf = KFold(n_splits=5)

  resultados = []
  for modelo, hiperparametro in zip(modelos, hiperparametros):
    tmp_score_train = []
    tmp_score_test = []
    tmp_accuracy = []
    for _ in range(repeticiones):
      for train_index, test_index in kf.split(X):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        modelo.fit(x_train, y_train)
        modeloAtaque = MembershipInferenceBlackBox(objART(modelo), attack_model_type=tipoModeloAtacante)
        acc = entrenarModeloAtacante(modeloAtaque, x_train, y_train, x_test, y_test)

        tmp_score_train.append(modelo.score(x_train, y_train))
        tmp_score_test.append(modelo.score(x_test, y_test))
        tmp_accuracy.append(acc)

    resultados.append(pd.DataFrame({
      'Dataset': [Dataset] * 3,
      'Modelo': [type(modelo).__name__] * 3,
      'Hiperparametro': [hiperparametro] * 3,
      'Metric': ["Train Acc", "Test Acc", "MI"],
      'Score': [np.average(tmp_score_train), np.average(tmp_score_test), np.average(tmp_accuracy)],
    }))

  if not resultados:
    return pd.DataFrame(columns=columnas)
  # Concatenate once at the end rather than growing a frame inside the loop.
  return pd.concat(resultados).reindex(columns=columnas)

In [None]:
def experimentoHiperparametros(modelos, hiperparametros):
  """Run the hyperparameter experiment on the Adult dataset.

  Args:
    modelos: estimators to evaluate.
    hiperparametros: one label per estimator, in the same order.

  Returns:
    The DataFrame produced by ``experimentoHiperparametrosEjecucion`` for
    ``X_adult`` / ``Y_adult`` with the dataset name "Adult".
  """
  return experimentoHiperparametrosEjecucion(modelos, hiperparametros, X_adult, Y_adult, "Adult")

### Decision Tree

In [None]:
# Decision-tree variants; labels and models are paired by position.
hiperparametros = [
    "default",
    "Gini criterion",
    "Best splitter",
    "Low max_features",
    "Low max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "Pruning",
]

# NOTE(review): criterion='gini' and splitter='best' are presumably the
# scikit-learn defaults, which would make those two variants identical to
# "default" — confirm whether other values were intended.
modelos = [
    DecisionTreeClassifier(),
    DecisionTreeClassifier(criterion='gini'),
    DecisionTreeClassifier(splitter='best'),
    DecisionTreeClassifier(max_features=1),
    DecisionTreeClassifier(max_depth=10),
    DecisionTreeClassifier(min_samples_split=25),
    DecisionTreeClassifier(min_samples_leaf=25),
    DecisionTreeClassifier(ccp_alpha=2.9346628038314405e-05),
]

decisiontreedf = experimentoHiperparametros(modelos, hiperparametros)

### Bagging

In [None]:
# Bagging variants; labels and models are paired by position.
hiperparametros = [
    "default",
    "High n_estimators",
    "No bootstrap",
    "bootstrap_features",
    "Low max_features",
]

modelos = [
    BaggingClassifier(),
    BaggingClassifier(n_estimators=300),
    BaggingClassifier(bootstrap=False),
    BaggingClassifier(bootstrap_features=True),
    BaggingClassifier(max_features=2),
]

baggingdf = experimentoHiperparametros(modelos, hiperparametros)

### Random Forest

In [None]:
# Random-forest variants; labels and models are paired by position.
hiperparametros = [
    "default",
    "High n_estimators",
    "Entropy criterion",
    "Low max_features",
    "No bootstrap",
    "Low max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "Pruning",
]

modelos = [
    RandomForestClassifier(),
    RandomForestClassifier(n_estimators=300),
    RandomForestClassifier(criterion='entropy'),
    RandomForestClassifier(max_features=1),
    RandomForestClassifier(bootstrap=False),
    RandomForestClassifier(max_depth=10),
    RandomForestClassifier(min_samples_split=25),
    RandomForestClassifier(min_samples_leaf=25),
    RandomForestClassifier(ccp_alpha=2.9346628038314405e-05),
]

randomforestdf = experimentoHiperparametros(modelos, hiperparametros)

### ExtraTreesClassifier

In [None]:
# Extra-trees ensemble variants; labels and models are paired by position.
hiperparametros = [
    "default",
    "High n_estimators",
    "Entropy criterion",
    "bootstrap",
    "Low max_features",
    "Low max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "Pruning",
]

modelos = [
    ExtraTreesClassifier(),
    ExtraTreesClassifier(n_estimators=300),
    ExtraTreesClassifier(criterion='entropy'),
    ExtraTreesClassifier(bootstrap=True),
    ExtraTreesClassifier(max_features=1),
    ExtraTreesClassifier(max_depth=10),
    ExtraTreesClassifier(min_samples_split=25),
    ExtraTreesClassifier(min_samples_leaf=25),
    ExtraTreesClassifier(ccp_alpha=2.9346628038314405e-05),
]

extratreesdf = experimentoHiperparametros(modelos, hiperparametros)

### AdaBoost

In [None]:
# AdaBoost variants; labels and models are paired by position.
# The last label now names the algorithm that is actually configured below
# (algorithm='SAMME'); it previously read "SAMME.R algorithm".
hiperparametros = ["default", "High n_estimators", "Low learning_rate", "SAMME algorithm"]

modelos = list()
modelos.append(AdaBoostClassifier())
modelos.append(AdaBoostClassifier(n_estimators=300))
modelos.append(AdaBoostClassifier(learning_rate=0.5))
modelos.append(AdaBoostClassifier(algorithm='SAMME'))


adaboostdf = experimentoHiperparametros(modelos, hiperparametros)

### Gradient Boosting

In [None]:
# Gradient-boosting variants; labels and models are paired by position.
# NOTE(review): "Low learning_rate" is paired with learning_rate=0.5, which is
# presumably above the scikit-learn default for this estimator — confirm the
# label or the value.
hiperparametros = [
    "default",
    "High n_estimators",
    "Low learning_rate",
    "Low max_features",
    "Low max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "Pruning",
]

modelos = [
    GradientBoostingClassifier(),
    GradientBoostingClassifier(n_estimators=300),
    GradientBoostingClassifier(learning_rate=0.5),
    GradientBoostingClassifier(max_features=1),
    GradientBoostingClassifier(max_depth=10),
    GradientBoostingClassifier(min_samples_split=25),
    GradientBoostingClassifier(min_samples_leaf=25),
    GradientBoostingClassifier(ccp_alpha=2.9346628038314405e-05),
]

gradientboostingdf = experimentoHiperparametros(modelos, hiperparametros)

### Logistic Regression

In [None]:
# Logistic-regression variants (penalty/solver combinations plus a larger C);
# labels and models are paired by position.
hiperparametros = [
    "default",
    "l2 with newton-cg",
    "l2 with lbfgs",
    "l1 with liblinear",
    "l2 with liblinear",
    "l2 with sag",
    "elasticnet with saga",
    "l1 with saga",
    "l2 with saga",
    "C",
]

modelos = [
    LogisticRegression(),
    LogisticRegression(penalty="l2", solver="newton-cg"),
    LogisticRegression(penalty="l2", solver="lbfgs"),
    LogisticRegression(penalty="l1", solver="liblinear"),
    LogisticRegression(penalty="l2", solver="liblinear"),
    LogisticRegression(penalty="l2", solver="sag"),
    LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.5),
    LogisticRegression(penalty="l1", solver="saga"),
    LogisticRegression(penalty="l2", solver="saga"),
    LogisticRegression(C=1e2),
]

LogisticRegressiondf = experimentoHiperparametros(modelos, hiperparametros)

### SVC

In [None]:
# SVC variants; labels and models are paired by position.
hiperparametros = [
    "default",
    "poly kernel",
    "auto gamma",
    "C",
]

modelos = [
    SVC(),
    SVC(kernel="poly"),
    SVC(gamma="auto"),
    SVC(C=1e-5),
]

SVCdf = experimentoHiperparametros(modelos, hiperparametros)

### Linear SVC

In [None]:
# LinearSVC variants; labels and models are paired by position.
hiperparametros = [
    "default",
    "l1 penalty",
    "hinge loss",
    "C",
]

modelos = [
    LinearSVC(),
    LinearSVC(penalty="l1", dual=False),
    LinearSVC(loss="hinge"),
    LinearSVC(C=1e-5),
]

LinearSVCdf = experimentoHiperparametros(modelos, hiperparametros)
# Persist this result table to a CSV file in the working directory.
LinearSVCdf.to_csv('LinearSVCdf.csv')

## Results

In [None]:
# Stack every per-model hyperparameter table into one frame with a fresh index.
todosdf = pd.concat(
    [
        adaboostdf,
        decisiontreedf,
        extratreesdf,
        randomforestdf,
        gradientboostingdf,
        baggingdf,
        LogisticRegressiondf,
        SVCdf,
        LinearSVCdf,
    ],
    ignore_index=True,
)

In [None]:
def visualiceResultsHyper(df):
  """Draw a 3x3 grid of diverging bar charts, one panel per model family.

  Args:
    df: long-format results with 'Modelo', 'Hiperparametro', 'Metric' and
      'Score' columns. The argument itself is plotted; it is no longer
      replaced by the global ``todosdf``.

  Returns:
    The plotly figure. In each panel the MI score is drawn towards the left
    (negated) and the train/test accuracies towards the right, one bar group
    per hyperparameter label.
  """
  tabla = pd.pivot_table(
    df,
    index=["Modelo", 'Hiperparametro'],
    columns='Metric',
    values='Score',
    aggfunc='first'
  )

  # Order rows by MI, breaking ties by train accuracy. A single multi-key sort is
  # used because chaining two sort_values calls does not guarantee that the first
  # ordering survives the second.
  tabla = tabla.sort_values(by=['MI', 'Train Acc'], ascending=True)

  # Negate MI so its bars extend to the left of zero.
  tabla['MI'] = tabla['MI'] * -1

  # Panel position (row, col) for every model family, in title order.
  n = {"DecisionTreeClassifier": [1,1], "BaggingClassifier": [1,2], "RandomForestClassifier": [1,3],
       "ExtraTreesClassifier": [2,1], "AdaBoostClassifier": [2,2], "GradientBoostingClassifier": [2,3],
       "LogisticRegression": [3,1], "SVC": [3,2], "LinearSVC": [3,3]}

  Diverging = make_subplots(rows=3, cols=3,
                            shared_yaxes=False, shared_xaxes='all',
                            vertical_spacing=0.04,
                            horizontal_spacing=0.1,
                            subplot_titles=tuple(n))

  # Tick labels show magnitudes on both sides of zero.
  Diverging.update_xaxes(tickvals=[-1, -0.5, 0, 0.5, 1], ticktext=[1, 0.5, 0, 0.5, 1])

  barras = (
    ('MI', 'rgb(188,128,189)'),
    ('Train Acc', 'rgb(251,128,114)'),
    ('Test Acc', 'rgb(128,177,211)'),
  )
  for k, (fila, columna) in n.items():
    panel = tabla.loc[k]
    # Legend entries are emitted once, from the first panel only.
    leyend = k == 'DecisionTreeClassifier'
    for metrica, color in barras:
      Diverging.add_trace(
        go.Bar(x=panel[metrica], y=panel.index, orientation='h', name=metrica,
               marker_color=color, showlegend=leyend),
        row=fila, col=columna)

  Diverging.update_layout(barmode='overlay', xaxis_range=[-1.1,1.1],
              height=1100, width=1600,
              legend = dict(font = dict(size = 15)),
              bargap=0.3, legend_orientation='h', legend_x=0.39, legend_y=1.07
              )

  return Diverging

# The returned figure is the cell's last expression, so it is rendered as output.
visualiceResultsHyper(todosdf)