In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier #classificador ruim

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [2]:
# LOAD DATA
# Iris bunch shipped with scikit-learn (the `datasets.load_*` family offers other examples).
dSet = datasets.load_iris()

# Feature matrix, class labels and the human-readable class names.
data, target, tgNames = dSet.data, dSet.target, dSet.target_names

# Show which fields the bunch exposes, followed by its full description.
print(dSet.keys())
print(dSet.DESCR)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes

In [3]:
# VIEW DATA
# Print a heading and the array shape for the features and for the labels,
# each followed by an empty line.
for heading, array in (('DATA', data), ('TARGET', target)):
    print(heading)
    print(array.shape)
    print()

DATA
(150, 4)

TARGET
(150,)



In [4]:
# SPLIT - TEST & TRAIN
# NOTE(review): test_size=0.7 holds out 70% of the samples for testing, so the
# model is trained on the remaining 30% only — confirm this ratio is intentional.
XTrain, XTest, yTrain, yTest = train_test_split(
    data,
    target,
    test_size=0.7,
    random_state=42,  # fixed seed keeps the split reproducible
)

In [6]:
# CLASSIFIERS
# Estimators whose training involves randomness (decision tree, MLP) receive a
# fixed seed so that "Restart & Run All" reproduces the same results; the seed
# matches the one used for the train/test split.
cls_decTree = DecisionTreeClassifier(random_state=42)
cls_naive = GaussianNB()
cls_logreg = LogisticRegression(C=1, solver='lbfgs')
cls_knn = KNeighborsClassifier(n_neighbors=5)
# A single hidden layer with 2 units, written in the documented tuple form.
cls_mlp = MLPClassifier(hidden_layer_sizes=(2,), random_state=42)
cls_dm = DummyClassifier()  # deliberately weak baseline classifier

In [7]:
# PREPROCESSING - exactly one candidate is selected for the rest of the notebook.
# Every candidate except the dummy baseline standardizes the features first.
candidate_models = {
    'decTree': make_pipeline(StandardScaler(), cls_decTree),
    'naive': make_pipeline(StandardScaler(), cls_naive),
    'logreg': make_pipeline(StandardScaler(), cls_logreg),
    'knn': make_pipeline(StandardScaler(), cls_knn),
    'mlp': make_pipeline(StandardScaler(), cls_mlp),
    'dm': cls_dm,  # baseline is used as-is, without scaling
}
pipe_cls = candidate_models['logreg']  # the configuration marked as working

# CLASSIFICATION
pipe_cls.fit(XTrain, yTrain)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1))])

In [8]:
# PREDICT
# Predicted class labels for the held-out samples.
predict = pipe_cls.predict(XTest)

# Heading, the predictions, then one empty line.
print('PREDICT', predict, '', sep='\n')

PREDICT
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 1 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1 2 0 1 2 0 2 2
 1 1 2 1 0 1 2 0 0 1 2 0 2 0 0 2 1 2 2 1 2 1 0 0 1 2 0 0 0 1 2]



In [9]:
# RELEVANCE
# Per-class precision / recall / F1 on the test split, labelled with the class
# names; zero_division=0 reports 0 for undefined ratios.
report_text = classification_report(
    yTest,
    predict,
    target_names=tgNames,
    zero_division=0,
)
print('Report')
print(report_text)

Report
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        40
  versicolor       0.91      0.91      0.91        33
   virginica       0.91      0.91      0.91        32

    accuracy                           0.94       105
   macro avg       0.94      0.94      0.94       105
weighted avg       0.94      0.94      0.94       105



In [10]:
# Confusion matrix computed from the true test labels (first argument) and the
# predictions (second argument).
conf_mat = confusion_matrix(yTest, predict)
print('Confusion Matrix', conf_mat, '', sep='\n')

Confusion Matrix
[[40  0  0]
 [ 0 30  3]
 [ 0  3 29]]



In [11]:
# CROSS-VALIDATION (4 folds over the full dataset)
# The metric names are given as a list: scikit-learn documents `scoring` as a
# str, callable, list, tuple or dict, and a list (unlike a set) has a defined
# order. Results are exposed under the keys 'test_<metric name>'.
scores = ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro']
cv = cross_validate(pipe_cls, data, target, cv=4, scoring=scores)

In [12]:
# Per-fold scores for every metric requested from cross_validate.
metric_rows = (
    ('Accuracy: ', 'test_accuracy'),
    ('Precision: ', 'test_precision_micro'),
    ('Recall: ', 'test_recall_micro'),
    ('F1-score: ', 'test_f1_micro'),
)
for label, result_key in metric_rows:
    print(label, cv[result_key])

Accuracy:  [0.97368421 0.97368421 0.89189189 1.        ]
Precision:  [0.97368421 0.97368421 0.89189189 1.        ]
Recall:  [0.97368421 0.97368421 0.89189189 1.        ]
F1-score:  [0.97368421 0.97368421 0.89189189 1.        ]


In [13]:
# AFTER TRAINING
# List the parameter names exposed by the bare classifier and by the pipeline;
# these are the names a parameter grid can refer to.
for estimator in (cls_logreg, pipe_cls):
    print(estimator.get_params().keys())

# Textual summary of the pipeline itself.
print(pipe_cls)

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])
dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'logisticregression', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'logisticregression__C', 'logisticregression__class_weight', 'logisticregression__dual', 'logisticregression__fit_intercept', 'logisticregression__intercept_scaling', 'logisticregression__l1_ratio', 'logisticregression__max_iter', 'logisticregression__multi_class', 'logisticregression__n_jobs', 'logisticregression__penalty', 'logisticregression__random_state', 'logisticregression__solver', 'logisticregression__tol', 'logisticregression__verbose', 'logisticregression__warm_start'])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1))])


In [14]:
# Hyperparameter grid for the logistic-regression step of the pipeline.
# Other parameters can be explored as well; pick the best setting based on the
# analysis of the output.
params = {
    'logisticregression__C': [0.5, 1, 0.2],
    'logisticregression__fit_intercept': [True, False],
    'logisticregression__solver': ['newton-cg', 'saga', 'sag', 'lbfgs'],
}

# Exhaustive search over the grid, scored by accuracy with 3-fold cross-validation.
gs = GridSearchCV(
    estimator=pipe_cls,
    param_grid=params,
    cv=3,
    scoring='accuracy',
)
gs.fit(data, target)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(C=1))]),
             param_grid={'logisticregression__C': [0.5, 1, 0.2],
                         'logisticregression__fit_intercept': [True, False],
                         'logisticregression__solver': ['newton-cg', 'saga',
                                                        'sag', 'lbfgs']},
             scoring='accuracy')

In [16]:
# Best parameter combination found by the grid search, then its score.
print(gs.best_params_, gs.best_score_, sep='\n')

{'logisticregression__C': 1, 'logisticregression__fit_intercept': True, 'logisticregression__solver': 'newton-cg'}
0.96
