In [13]:
# Imports
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
df = pd.read_csv('automobile_data.csv')

#movendo a variável target para a extrema direita do dataset
column_names = ['symboling', 'normalized-losses', 'make', 'fuel-type',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price', 'aspiration']

df = df.reindex(columns=column_names)

#separando variáveis preditoras da target
X = df.iloc[:,:-1]
y = df['aspiration'].values

#loop que percorre lista de colunas categóricas que terão valores por categoria convertidas em número 
X = X.astype(str)
novo_valor_numerico = 0
i = 0
for x in X.columns:
    for z in X[x].drop_duplicates():        
        X[x].replace({z: str(i)}, inplace=True)
        i+=1
    i = 0
X = X.astype(int)

# Divisão em dados de treino e de teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Bagging Classifier

 é usado para construção de múltiplos modelos (normalmente do mesmo tipo) a partir de diferentes subsets no dataset de treino.

In [5]:
# Construção do Classificador
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples = 0.5, max_features = 0.5)

In [6]:
# Score do modelo
scores = cross_val_score(bagging, X, y)

In [7]:
# Média do score
print(scores.mean())

0.8195121951219513


# Extra Tree Classifier

Treina decisions tree randomizadas

In [10]:
# Cria o classificador com uma árvore de decisão
clf = DecisionTreeClassifier(max_depth = None, min_samples_split = 2, random_state = 0)
scores = cross_val_score(clf, X, y)
mean = scores.mean()
print(scores)
print(mean)

[0.80487805 0.85365854 0.73170732 0.87804878 0.85365854]
0.8243902439024391


In [11]:
# Cria o classificador com Random Forest
clf = RandomForestClassifier(n_estimators = 10, max_depth = None, min_samples_split = 2, random_state = 0)
scores = cross_val_score(clf, X, y)
mean = scores.mean()
print(scores)
print(mean)

[0.87804878 0.85365854 0.82926829 0.85365854 0.80487805]
0.8439024390243903


In [12]:
# Cria o classificador com Extra Tree
clf = ExtraTreesClassifier(n_estimators = 10, max_depth = None, min_samples_split = 2, random_state = 0)
scores = cross_val_score(clf, X, y)
mean = scores.mean()
print(scores)
print(mean)

[0.87804878 0.90243902 0.95121951 0.82926829 0.80487805]
0.873170731707317


Extra tree atingiu melhor performance comparado a Decision Tree e Random Forest

# AdaBoost Classifier

Cria um classificador inicial e vai fazendo ajustes de pesos em cópia adicionais do classificador

In [14]:
# Construindo o estimador base
estim_base = DecisionTreeClassifier(max_depth = 1, min_samples_leaf = 1)

In [15]:
# Construindo a primeira versão do modelo Adaboost
ada_clf_v1 = AdaBoostClassifier(base_estimator = estim_base, 
                                learning_rate = 0.1, 
                                n_estimators = 400, 
                                algorithm = "SAMME")

In [16]:
# Treinamento do modelo
ada_clf_v1.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                            

In [17]:
# Score
scores = cross_val_score(ada_clf_v1, X_test, y_test)
print(scores)
means = scores.mean()
print(means)

[0.77777778 0.625      0.875      0.625      0.875     ]
0.7555555555555555


In [18]:
# Construindo a segunda versão do modelo Adaboost
ada_clf_v2 = AdaBoostClassifier(base_estimator = estim_base, 
                                learning_rate = 1.0, 
                                n_estimators = 400, 
                                algorithm = "SAMME")

In [19]:
# Treinamento do modelo
ada_clf_v2.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                            

In [20]:
# Score
scores = cross_val_score(ada_clf_v2, X_test, y_test)
print(scores)
means = scores.mean()
print(means)

[0.77777778 0.625      0.875      0.625      1.        ]
0.7805555555555556
