## Evaluating Methodology 1 (Traditional Machine Learning Methods)

### Packages

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import numpy as np
import pandas as pd

### Functions

In [14]:
def evaluateModels(X_train, y_train, models, n_splits):
    print(f"{n_splits}-Fold Cross validation")
    results = []
    names = []
    for name, model in models:
        kfold = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)
        cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        print(f"{name}: Mean Accuracy={cv_results.mean():.5f}, Standard Deviation={cv_results.std():.5f}")

### Get dataset

In [7]:
df_diff = pd.read_csv('../../assets/df_diff.csv', index_col=0)
df_diff

# example of read_csv on google drive
# df_diff = pd.read_csv('/content/drive/MyDrive/Empresas/ZRP/Desafio Técnico/zrp_case-main/assets/df_diff.csv')

Unnamed: 0,diff_read0,diff_read1,diff_read2,diff_read3,diff_read4,diff_read5,diff_read6,diff_read7,diff_read8,diff_read9,mean,start_timestamp,end_timestamp,diff_timestemp,inference
0,0.22,-0.86,0.43,0.04,-0.86,-0.19,-0.33,0.05,0.23,-0.18,-0.145,1665656955,1665656967,12,1
1,0.21,0.15,0.08,0.30,-0.15,0.11,0.09,-0.10,0.03,-0.19,0.053,1665656968,1665656980,12,1
2,0.32,0.15,0.08,-0.02,0.03,0.11,-0.27,0.06,-0.04,0.57,0.099,1665656982,1665656994,12,1
3,0.05,0.13,0.34,0.53,0.55,0.51,0.43,0.33,0.28,0.37,0.352,1665656914,1665656926,12,1
4,0.16,0.42,0.46,0.48,0.44,0.41,-0.17,-0.29,0.34,0.34,0.259,1665656928,1665656940,12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3510,-1.35,-1.34,-1.34,-1.35,-1.35,-1.34,-1.35,-1.37,-1.35,-1.36,-1.350,1674811711,1674811723,12,0
3511,-1.29,-1.52,-1.45,-1.44,-1.46,-1.45,-1.45,-1.20,0.47,-0.92,-1.171,1674811725,1674811737,12,1
3512,-1.23,-1.30,-1.29,-1.29,-1.27,-1.29,-1.28,-1.28,-1.28,-0.27,-1.178,1674811654,1674811667,13,1
3513,-1.09,-1.41,-1.43,-1.42,-1.41,-1.43,-1.42,-1.42,-1.42,-1.42,-1.387,1674811669,1674811681,12,1


### Preparing dataset (split, scaling and numpy array conversion)

In [15]:
# Separete features and labels
df_m1 = df_diff.copy()
m1_array = df_m1.values
X = m1_array[:, :-1]
y = m1_array[:, -1]

In [16]:
# normalize features
ss = MinMaxScaler()
X = ss.fit_transform(X)

In [17]:
# split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

### Tuning Models before compare

#### Logistic Regression

In [18]:
parameters = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-3, 3, 7),
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
}

logreg = LogisticRegression(max_iter=10000, multi_class='ovr')
clf = model_selection.GridSearchCV(logreg,  # model
                                   param_grid=parameters,  # hyperparameters
                                   scoring='accuracy',  # metric for scoring
                                   cv=10)

clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

Tuned Hyperparameters : {'C': 0.001, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy : 0.565789606521794


140 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Guilherme\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Guilherme\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Guilherme\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports on

#### SVM

In [19]:
tuning models
parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
svm = SVC()
clf = model_selection.GridSearchCV(svm,
                                   param_grid=parameters,
                                   scoring='accuracy',
                                   cv=10)

clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

#### KNN

In [20]:
# gridsearch for KNN
parameters = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
clf = model_selection.GridSearchCV(knn,
                                   param_grid=parameters,
                                   scoring='accuracy',
                                   cv=10)

clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

Tuned Hyperparameters : {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
Accuracy : 0.724758335226269


#### Naive Bayes

In [21]:
# gridsearch for Naive Bayes
parameters = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
    'fit_prior': [True, False]
}

nb = MultinomialNB()
clf = model_selection.GridSearchCV(nb,
                                   param_grid=parameters,
                                   scoring='accuracy',
                                   cv=10)
clf.fit(X_train, y_train)
print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :", clf.best_score_)

Tuned Hyperparameters : {'alpha': 0.1, 'fit_prior': True}
Accuracy : 0.565789606521794


### Compare Models using 10-fold cross validation

In [22]:
models = []

models.append(('LR', LogisticRegression(max_iter=10000, multi_class='ovr', C=0.001, penalty='l2', solver='newton-cg')))
models.append(('SVM', SVC(C=1, kernel='linear')))
models.append(('KNN', KNeighborsClassifier(metric='euclidean', n_neighbors=7, weights='distance')))
models.append(('NB', MultinomialNB(alpha=0.1, fit_prior=True)))

evaluateModels(X_train, y_train, models, 10)

### Predictions with best model

In [23]:
best_model = KNeighborsClassifier(metric='euclidean', n_neighbors=7, weights='distance')
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.63      0.78      0.70       305
         1.0       0.79      0.65      0.72       398

    accuracy                           0.71       703
   macro avg       0.71      0.72      0.71       703
weighted avg       0.72      0.71      0.71       703

