**Importando as bibliotecas básicas**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools
py.init_notebook_mode(connected=True)
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
#from catboost import CatBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from mlxtend.plotting import plot_decision_regions

**Como o Plotly está sendo utilizado no Google Colab, precisaremos definir a função abaixo e chamá-la sempre que quisermos exibir um gráfico**

In [None]:
def configure_plotly_browser_state():
  """Emit the RequireJS setup that the notebook's Plotly figures need.

  Displays an HTML snippet in the current cell's output that loads
  RequireJS and registers the `base` and `plotly` module paths. It has to
  be called in every cell that renders a figure, before the plotting call.

  Returns:
    None. The only effect is the HTML written to the cell output.
  """
  # Import both names explicitly: `display` is only an implicit builtin
  # inside IPython kernels, and `IPython.display` is the public location
  # of `HTML` (the original reached it through `IPython.core.display`).
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''))

**Lendo o conjunto de dados**

In [None]:
# Read the file without a header row. With the default header handling the
# first record is consumed as column names and lost: the recorded run shows
# 1727 rows and 1209 'unacc' cars, one short of the 1210 used by the pie
# chart further down. Column names are assigned in the next cell.
cars = pd.read_csv('.../car_evaluation.csv', header=None)
cars.shape

(1727, 7)

**Atribuindo os títulos das colunas**

In [None]:
# Name the seven columns; the last one ('Evaluation') is later used as the target.
cars.columns = ['Buying', 'Maint', 'Doors','Persons','LugBoot','Safety','Evaluation']

**Visão geral dos dados**

In [None]:
# Show 10 randomly chosen rows (no random_state is set, so they differ between runs).
cars.sample(10)

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Evaluation
169,vhigh,high,4,2,big,high,unacc
615,high,high,4,more,med,med,acc
650,high,med,2,2,med,low,unacc
714,high,med,4,4,med,med,acc
1537,low,med,2,more,big,high,vgood
1569,low,med,4,2,med,med,unacc
247,vhigh,med,3,2,med,high,unacc
725,high,med,4,more,big,low,unacc
80,vhigh,vhigh,5more,2,small,low,unacc
1227,med,low,3,4,med,med,acc


**Visão geral dos dados aceitáveis**

In [None]:
# Keep only the cars whose evaluation is 'acc'. A boolean mask replaces the
# row-by-row Python loop and, unlike rebuilding the frame from raw value
# arrays, preserves the column names (the old output showed columns 0..6).
# reset_index gives the same fresh 0..n-1 index the rebuilt frame had.
df = cars[cars['Evaluation'] == 'acc'].reset_index(drop=True)
df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6
128,high,med,4,4,big,high,acc
267,med,med,3,more,med,med,acc
255,med,med,2,4,med,high,acc
145,high,low,2,4,med,high,acc
298,low,vhigh,2,4,big,high,acc
306,low,vhigh,3,more,small,high,acc
309,low,vhigh,3,more,big,med,acc
74,high,high,2,4,big,med,acc
106,high,high,5more,more,big,med,acc
76,high,high,2,more,med,high,acc


**Verificando se há algum valor ausente no dataset**

In [None]:
# Count the missing values in each column.
cars.isnull().sum()

Buying        0
Maint         0
Doors         0
Persons       0
LugBoot       0
Safety        0
Evaluation    0
dtype: int64

**Como foi visto, não há valores ausentes. Agora, faremos uma análise descritiva do dataset.**

In [None]:
# Per-column summary; at this point the columns hold strings, so the
# summary reports count / unique / top / freq.
cars.describe()

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Evaluation
count,1727,1727,1727,1727,1727,1727,1727
unique,4,4,4,3,3,3,4
top,high,high,4,4,big,high,unacc
freq,432,432,432,576,576,576,1209


**Como pode ser visto, nossos dados são categóricos**

In [None]:
# List the column labels.
cars.columns

Index(['Buying', 'Maint', 'Doors', 'Persons', 'LugBoot', 'Safety',
       'Evaluation'],
      dtype='object')

**Agora, vamos listar o número de carros em cada classe de avaliação**

In [None]:
# Number of cars per evaluation class, ordered by class label.
cars['Evaluation'].value_counts().sort_index()

acc       384
good       69
unacc    1209
vgood      65
Name: Evaluation, dtype: int64

**Gráfico "Distribuição de carros avaliados"**

In [None]:
configure_plotly_browser_state()
# Evaluation codes in display order, mapped to their legend labels.
evaluation_labels = {
    "unacc": "Unacceptable",
    "acc": "Acceptable",
    "good": "Good",
    "vgood": "Very Good",
}
# Count the classes from the data instead of hard-coding them: the literal
# [1210, 384, 69, 65] did not match the loaded frame (1209 'unacc' rows).
# This cell has to run while 'Evaluation' still holds the string codes,
# i.e. before the ordinal-encoding cell.
evaluation_counts = (
    cars['Evaluation']
    .value_counts()
    .reindex(list(evaluation_labels), fill_value=0)
)
fig = {
  "data": [
    {
      "values": evaluation_counts.tolist(),
      "labels": list(evaluation_labels.values()),
      "domain": {"column": 0},
      "name": "Car Evaluation",
      "hoverinfo":"label+percent+name",
      "hole": .6,  # a hole turns the pie into a donut
      "type": "pie"
    }],
  "layout": {
        "grid": {"rows": 1, "columns": 1},
        "annotations": [
            {
                "font": {
                    "size": 36
                },
                "showarrow": False,
                "text": "",
                "x": 0.5,
                "y": 0.5
            }
        ]
    }
}
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tDistribuição de carros avaliados")
py.iplot(fig, filename='cars_donut')



				Distribuição de carros avaliados


**Substituindo dados não numéricos de colunas numéricas**

In [None]:
# Turn the open-ended categories into plain digits so both columns can be
# converted to numbers later. The result is assigned back to the frame:
# replace(..., inplace=True) on an attribute-accessed column mutates an
# intermediate Series and no longer updates `cars` under pandas
# Copy-on-Write.
cars['Doors'] = cars['Doors'].replace('5more', '5')
cars['Persons'] = cars['Persons'].replace('more', '5')

**Preparando dados para gráficos de correlação das features com a classe**

In [None]:
# Every column except the last one ('Evaluation') is an input feature.
features = cars.iloc[:, :-1]
# One value_counts() Series per feature, in column order. The counts are
# sorted by category label because the plotting cells pair `a[k].index`
# positionally with the rows of the matching crosstab, which are ordered by
# label; the default frequency ordering put labels on the wrong bars.
a = [features[column].value_counts().sort_index() for column in features]

In [None]:
# Contingency table of each feature (rows) against the evaluation class
# (columns); the tables feed the stacked bar charts below.
evaluation = cars['Evaluation']
buy, mc, drs, prsn, lb, sfty = (
    pd.crosstab(cars[column], evaluation)
    for column in ('Buying', 'Maint', 'Doors', 'Persons', 'LugBoot', 'Safety')
)
buy

Evaluation,acc,good,unacc,vgood
Buying,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,108,0,324,0
low,89,46,258,39
med,115,23,268,26
vhigh,72,0,359,0


**Gráfico "Preço de venda vs avaliação"**

In [None]:
configure_plotly_browser_state()
# (evaluation code, legend label) pairs in stacking order.
evaluation_traces = [
    ('unacc', 'Unacceptable'),
    ('acc', 'Acceptable'),
    ('good', 'Good'),
    ('vgood', 'Very Good'),
]
# x comes from the crosstab's own index so each bar is labelled with the
# category its counts belong to; Plotly pairs x and y by position.
data = [
    go.Bar(x=buy.index, y=buy[code], name=legend)
    for code, legend in evaluation_traces
]

layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tPreço de venda vs avaliação")
py.iplot(fig, filename='distri')



				Preço de venda vs avaliação


**Gráfico "Custo de manutenção vs avaliação"**

In [None]:
configure_plotly_browser_state()
# (evaluation code, legend label) pairs in stacking order.
evaluation_traces = [
    ('unacc', 'Unacceptable'),
    ('acc', 'Acceptable'),
    ('good', 'Good'),
    ('vgood', 'Very Good'),
]
# x comes from the crosstab's own index so each bar is labelled with the
# category its counts belong to; Plotly pairs x and y by position.
data = [
    go.Bar(x=mc.index, y=mc[code], name=legend)
    for code, legend in evaluation_traces
]

layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tCusto de manutenção vs avaliação")
py.iplot(fig, filename='cars_donut')



				Custo de manutenção vs avaliação


**Gráfico "Portas vs Avaliação"**

In [None]:
configure_plotly_browser_state()
# (evaluation code, legend label) pairs in stacking order.
evaluation_traces = [
    ('unacc', 'Unacceptable'),
    ('acc', 'Acceptable'),
    ('good', 'Good'),
    ('vgood', 'Very Good'),
]
# x comes from the crosstab's own index so each bar is labelled with the
# category its counts belong to; Plotly pairs x and y by position.
data = [
    go.Bar(x=drs.index, y=drs[code], name=legend)
    for code, legend in evaluation_traces
]

layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tPortas vs Avaliação")
py.iplot(fig, filename='cars_donut')



				Portas vs Avaliação


**Gráfico "Número de passageiros vs avaliação"**

In [None]:
configure_plotly_browser_state()
# (evaluation code, legend label) pairs in stacking order.
evaluation_traces = [
    ('unacc', 'Unacceptable'),
    ('acc', 'Acceptable'),
    ('good', 'Good'),
    ('vgood', 'Very Good'),
]
# x comes from the crosstab's own index so each bar is labelled with the
# category its counts belong to; Plotly pairs x and y by position.
data = [
    go.Bar(x=prsn.index, y=prsn[code], name=legend)
    for code, legend in evaluation_traces
]

layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tNúmero de passageiros vs avaliação")
py.iplot(fig, filename='cars_donut')



				Número de passageiros vs avaliação


**Gráfico "Porta-malas vs avaliação"**

In [None]:
configure_plotly_browser_state()
# (evaluation code, legend label) pairs in stacking order.
evaluation_traces = [
    ('unacc', 'Unacceptable'),
    ('acc', 'Acceptable'),
    ('good', 'Good'),
    ('vgood', 'Very Good'),
]
# x comes from the crosstab's own index so each bar is labelled with the
# category its counts belong to; Plotly pairs x and y by position.
data = [
    go.Bar(x=lb.index, y=lb[code], name=legend)
    for code, legend in evaluation_traces
]

layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tPorta-malas vs avaliação")
py.iplot(fig, filename='cars_donut')



				Porta-malas vs avaliação


**Gráfico "Segurança vs Avaliação"**

In [None]:
configure_plotly_browser_state()
# (evaluation code, legend label) pairs in stacking order.
evaluation_traces = [
    ('unacc', 'Unacceptable'),
    ('acc', 'Acceptable'),
    ('good', 'Good'),
    ('vgood', 'Very Good'),
]
# x comes from the crosstab's own index so each bar is labelled with the
# category its counts belong to; Plotly pairs x and y by position.
data = [
    go.Bar(x=sfty.index, y=sfty[code], name=legend)
    for code, legend in evaluation_traces
]

layout = go.Layout(barmode='stack')

fig = go.Figure(data=data, layout=layout)
# The title is printed rather than set in the layout.
print("\n\n\t\t\t\tSegurança vs Avaliação")
py.iplot(fig, filename='cars_donut')



				Segurança vs Avaliação


### **Preparando dados utilizando a função Replace**

**Nomeando as colunas**

In [None]:
# Re-apply the column names (the same list assigned right after loading)
# and preview the first rows.
cars.columns = ['Buying', 'Maint', 'Doors','Persons','LugBoot','Safety','Evaluation']
cars.head()

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Evaluation
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


**Codificando os dados considerando a ordem**

In [None]:
# Integer code for every category, preserving the natural order of each scale.
ordinal_codes = {
    'Buying': {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3},
    'Maint': {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3},
    'LugBoot': {'small': 0, 'med': 1, 'big': 2},
    'Safety': {'low': 0, 'med': 1, 'high': 2},
    'Evaluation': {'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3},
}
for column, codes in ordinal_codes.items():
    # The encoded column is assigned back instead of using
    # replace(..., inplace=True) on a column selection, which does not
    # update `cars` under pandas Copy-on-Write. Values missing from the
    # mapping pass through unchanged, so re-running the cell on already
    # encoded data is a no-op, while an unexpected category makes the
    # int64 cast fail loudly instead of being carried along silently.
    cars[column] = (
        cars[column]
        .map(lambda value, codes=codes: codes.get(value, value))
        .astype('int64')
    )

# Doors and Persons already hold digit strings at this point.
cars['Doors'] = pd.to_numeric(cars['Doors'])
cars['Persons'] = pd.to_numeric(cars['Persons'])
cars

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Evaluation
0,3,3,2,2,0,1,0
1,3,3,2,2,0,2,0
2,3,3,2,2,1,0,0
3,3,3,2,2,1,1,0
4,3,3,2,2,1,2,0
...,...,...,...,...,...,...,...
1722,0,0,5,5,1,1,2
1723,0,0,5,5,1,2,3
1724,0,0,5,5,2,0,0
1725,0,0,5,5,2,1,2


**Dividindo em x e y**

In [None]:
# Features are every column but the last; the target is the last column.
x, y = cars.iloc[:, :-1], cars.iloc[:, -1]

**Dividindo em treino e teste**

In [None]:
# Hold out 25% of the rows for testing; the split is stratified on y and
# seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

### **Aplicação dos modelos de classificação**

In [None]:
# Candidate classifiers, in the same order as `clf_labels`.
clfs = [
        KNeighborsClassifier(n_neighbors = 5),
        LogisticRegression(random_state = 42),
        SVC(kernel = 'rbf', random_state = 42),
        GaussianNB(),
        DecisionTreeClassifier(criterion = 'entropy', random_state = 42),
        RandomForestClassifier(n_estimators = 25, criterion = 'entropy', random_state = 42),
        GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42),
        XGBClassifier(random_state=42),
        lgb.LGBMClassifier(random_state=42),
        Perceptron(tol=1e-3, random_state=42),
        MLPClassifier(random_state=42, max_iter=300)
      ]
# (transformer, tag) pairs; a None transformer means "fit on the raw features".
scaler = [(None, 'NaoEscalado'), (StandardScaler(), 'Padronizado'), (MinMaxScaler(), 'Normalizado')]
clf_labels = ['KNN', 'Logistic Regression', 'SVC rbf', 'Naive Bayes',  'Decision Tree',  'Random Forest',  'Gradient Boosting',  'XGB',  'LGBM',  'Perceptron',  'MLP']
# One (scaling tag, macro F1) pair per fitted pipeline, classifier-major.
# A plain list keeps the scores as floats; stacking tag and score in one
# NumPy array silently converted every score to a string. The comparison
# cell turns this list into a DataFrame with columns 0 (tag) and 1 (score).
f1Score = []
for clf, label in zip(clfs, clf_labels):
  for transformer, tag in scaler:
    if transformer is None:
      steps = [(label, clf)]
    else:
      steps = [('transform', transformer), (label, clf)]
    pipe = Pipeline(steps)
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    f1Score.append((tag, f1_score(y_test, y_pred, average='macro')))
    print("\n\n====================================={} - {}=============================\n\n".format(label, tag))
    print("Training Accuracy: ",pipe.score(x_train, y_train))
    print("Testing Accuracy: ", pipe.score(x_test, y_test))
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(classification_report(y_test,y_pred))






Training Accuracy:  0.9799227799227799
Testing Accuracy:  0.9490740740740741
[[298   5   0   0]
 [  9  87   0   0]
 [  0   5  12   0]
 [  0   3   0  13]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       303
           1       0.87      0.91      0.89        96
           2       1.00      0.71      0.83        17
           3       1.00      0.81      0.90        16

    accuracy                           0.95       432
   macro avg       0.96      0.85      0.90       432
weighted avg       0.95      0.95      0.95       432





Training Accuracy:  0.9907335907335907
Testing Accuracy:  0.9629629629629629
[[295   8   0   0]
 [  1  95   0   0]
 [  0   4  11   2]
 [  0   1   0  15]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       303
           1       0.88      0.99      0.93        96
           2       1.00      0.65      0.79        17
           3       0.88      0.9

#### **Comparando Algoritmos com F1 score Macro**

In [None]:
# Column 0 holds the scaling tag, column 1 the macro F1 score.
f1Score = pd.DataFrame(f1Score)
# The scores arrive as strings when they were collected in a mixed
# str/float NumPy array; cast them so the bars get numeric heights.
f1Score[1] = f1Score[1].astype(float)

configure_plotly_browser_state()

# One bar group per scaling variant: (tag stored in column 0, legend name).
# Rows for a given tag are in classifier order, matching `clf_labels`.
scaling_variants = [
    ('NaoEscalado', 'Original'),
    ('Padronizado', 'Padronizado'),
    ('Normalizado', 'Normalizado'),
]
data = [
    go.Bar(
        x=clf_labels,
        y=f1Score.loc[f1Score[0] == tag, 1].to_numpy(),
        name=legend,
    )
    for tag, legend in scaling_variants
]
py.iplot(data)

# **Grid Search**

In [None]:
# Single-step pipeline whose 'classifier' step is replaced by each estimator
# listed in `search_space` during the grid search.
pipe = Pipeline([('classifier', LogisticRegression())])

# One grid per estimator family. Keys use the '<step>__<parameter>' form.
search_space = [
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [1, 2, 3, 4, 5],
        'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    },
    {
        # NOTE(review): with the default solver only some of these penalties
        # can be fitted, and the accepted spelling of "no penalty" depends on
        # the installed scikit-learn version; the other candidates fail and
        # score NaN. Left unchanged - confirm the intended solver.
        'classifier': [LogisticRegression(random_state = 42)],
        'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'classifier__C': np.logspace(0, 4, 5),
    },
    {
        'classifier': [SVC(random_state = 42)],
        'classifier__C': np.logspace(0, 4, 10),
        'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
        'classifier__gamma': ['scale', 'auto'],
        'classifier__max_iter': [10, 100, 1000],
    },
    {
        # NOTE(review): these values are far above the library default for
        # var_smoothing; the exponent range may have been meant to be negative.
        'classifier': [GaussianNB()],
        'classifier__var_smoothing': np.logspace(0, 4, 5),
    },
    {
        'classifier': [DecisionTreeClassifier(random_state = 42)],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__splitter': ['best', 'random'],
    },
    {
        'classifier': [RandomForestClassifier(random_state = 42)],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [10, 20, 30, 40, 50],
    },
    {
        # max_depth=-1 was removed: scikit-learn requires a depth of at least
        # 1 here, so every candidate using it failed to fit.
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
        'classifier__learning_rate': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
        'classifier__max_depth': [1, 2, 3],
    },
    {
        # max_depth=-1 was removed: XGBoost rejects negative depths.
        'classifier': [XGBClassifier(random_state=42)],
        'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
        'classifier__learning_rate': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
        'classifier__max_depth': [1, 2, 3],
    },
    {
        # For LightGBM, max_depth=-1 is the valid "no limit" setting.
        'classifier': [lgb.LGBMClassifier(random_state=42)],
        'classifier__n_estimators': [10, 20, 30, 40, 50, 100],
        'classifier__learning_rate': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
        'classifier__max_depth': [-1, 1, 2, 3],
    },
    {
        # "No penalty" is spelled None (the Perceptron default), not 'none'.
        'classifier': [Perceptron(random_state=42)],
        'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
        'classifier__max_iter': [10, 100, 1000],
        'classifier__tol': [1e-1, 1e-2, 1e-3],
    },
    {
        'classifier': [MLPClassifier(random_state=42)],
        'classifier__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'classifier__max_iter': [10, 100, 1000],
        'classifier__tol': [1e-1, 1e-2, 1e-3],
    },
]
                


In [None]:
# 5-fold grid search over every candidate in `search_space`, ranked by macro F1.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    scoring="f1_macro",
    cv=5,
    verbose=0,
)
best_model = clf.fit(x_train, y_train)
print('Melhor F1-SCORE: ', best_model.best_score_)
# Pull the winning estimator out of the pipeline's single 'classifier' step.
best = best_model.best_estimator_.named_steps['classifier']
best

Melhor F1-SCORE:  0.9863221960448346


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=1.0, max_depth=3,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=40, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Fit the selected classifier on the training split.
# NOTE(review): the grid search above does not disable refitting, so `best`
# has presumably already been fitted on this same data - confirm whether
# this call is needed.
best.fit(x_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=1.0, max_depth=3,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=40, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Macro-averaged F1 of the selected classifier on the held-out test split.
best_return = best.predict(x_test)
f1_score(y_test, best_return, average='macro', zero_division=0)

0.9982792378619423