# PNP

In [93]:
import numpy as np
import pandas as pd

## Passo 1: Leitura dos dados
Leitura dos bancos de dados em DataFrame Pandas. 

In [94]:
caminho_dados_leo = '../Datasets/Leonardo/dados_pre_processados_leonardo.csv'

In [95]:
df = pd.read_csv(caminho_dados_leo)

In [96]:
df.head()

Unnamed: 0,SensorID,Date,Value,Voltage,X,Y,Z,Comportamento,Season_fall,Season_spring,Season_summer,magn,sma,ent,erg,pit,rol,icl
0,175810,2019-01-21 08:15:03,1.115,3.04,-0.171,1.115,0.242,Pastejando,0,0,1,1.153703,1.528,7.474426,1.771641,-12.108253,-8.71913,-81.476304
1,175810,2019-01-21 08:16:04,0.583,3.02,-0.702,0.071,0.583,Pastejando,0,0,1,0.915278,1.356,-0.089163,0.701798,-39.565868,-84.224766,-39.916535
2,175810,2019-01-21 08:16:05,0.683,3.01,-0.594,-0.026,0.683,Pastejando,0,0,1,0.905539,1.303,0.138071,0.672402,-48.959606,-92.506296,-49.0073
3,175810,2019-01-21 08:16:06,0.631,3.01,-0.706,0.059,0.631,Pastejando,0,0,1,0.948724,1.396,-0.031235,0.81014,-41.690272,-85.222926,-41.913249
4,175810,2019-01-21 08:16:07,0.604,3.01,-0.805,0.081,0.604,Pastejando,0,0,1,1.009654,1.49,-0.197988,1.03918,-36.742843,-84.254174,-37.126721


In [97]:
# Remove as linhas com o sensor 434138
df = df[df.SensorID != 434138]

## Passo 2: Treinamento e Validação

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
# Para a busca do melhores hiperparêmetros
from sklearn.model_selection import RandomizedSearchCV
from skopt import gp_minimize

### Seleção de colunas aplicáveis

In [99]:
df.columns

Index(['SensorID', 'Date', 'Value', 'Voltage', 'X', 'Y', 'Z', 'Comportamento',
       'Season_fall', 'Season_spring', 'Season_summer', 'magn', 'sma', 'ent',
       'erg', 'pit', 'rol', 'icl'],
      dtype='object')

In [100]:
# Lista com os atributos aplicáveis
features_aplicaveis = ['X', 'Y' ,'Z', 'Comportamento', 'Season_fall', 'Season_spring', 'Season_summer', 'magn', 'sma', 'ent', 'pit', 'rol', 'icl']

# Seleciona os abributos correspondentes da lista features_aplicaveis
df = df[features_aplicaveis]

In [101]:
df = df.rename({'X': 'a', 'Y': 'b', 'Z': 'c'}, axis=1)  

In [102]:
df = df.rename({'a': 'Y', 'b': 'Z', 'c': 'X'}, axis=1)  

#### Função para imprimir a quantidade de dados por conjunto

In [103]:
def imprimir_quantidade_dados(X_train, X_test):
    print(10*"-"," Quantidade de dados ",  10*"-")
    
    print('Treino: ', X_train.shape[0])
    print('Teste: ', X_test.shape[0])
    
    print(43*"-")

#### Divisão entre variáveis preditoras(X) e variáveis de resposta(y)

In [104]:
y = df.Comportamento
y_pnp = y
y_pnp = y_pnp.replace(to_replace=['Ocio', 'Ruminando'], value='Não Pastejando')
y = y_pnp

X = df.drop('Comportamento', axis=1)

# Conjunto de dados PRO: Pastejando x Ruminando x Ócio
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0, stratify=y)

##### Quantidade de dados por conjunto

In [105]:
imprimir_quantidade_dados(X_train, X_test)

----------  Quantidade de dados  ----------
Treino:  94509
Teste:  23628
-------------------------------------------


In [91]:
# Para reescalar os dados
from sklearn.preprocessing import StandardScaler

X_train_season = X_train[['Season_fall', 'Season_spring', 'Season_summer']]
X_test_season =  X_test[['Season_fall', 'Season_spring', 'Season_summer']]

X_train.drop(['Season_fall', 'Season_spring', 'Season_summer'], axis=1, inplace=True)
X_test.drop(['Season_fall', 'Season_spring', 'Season_summer'], axis=1, inplace=True)

columns = X_train.columns
index_train = X_train.index
index_test = X_test.index


sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

X_train = pd.DataFrame(X_train, columns=columns, index=index_train)
X_test = pd.DataFrame(X_test, columns=columns, index=index_test)

X_train = pd.concat([X_train, X_train_season], axis=1)
X_test = pd.concat([X_test, X_test_season], axis=1)

In [92]:
del X_test_season
del X_train_season

### Função para imprimir e salvar matriz de confusão

In [69]:
# Função para imprimir e salvar matriz de confusão
def imprimeMatrizDeConfusão(y_test, preds, model, salvarFigura = False):
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y_test, preds, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot(values_format= '.2%', xticks_rotation=60, colorbar=False)
    disp.ax_.set_title("Matriz de confusão normalizada sobre as condições verdadeiras")
    if(salvarFigura):
        plt.savefig(input('Nome da figura a ser salva:' ) + '.png')
    plt.show()

In [70]:
from sklearn.metrics import classification_report

def classification_report_extendido(y_true: np.array, y_pred: np.array, classes: set = None):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    
    if classes is None: # Determina classes pelos valores
        classes = set(np.concatenate((np.unique(y_true), np.unique(y_pred))))
    for cls in classes:
        y_true_cls = (y_true == cls).astype(int)
        y_pred_cls = (y_pred == cls).astype(int)

        fp = sum(y_pred_cls[y_true_cls != 1])
        tn = sum(y_pred_cls[y_true_cls == 0] == False)
        fn = sum(y_pred_cls[y_true_cls == 1] == False)

        
        specificity_val = tn / (tn + fp)
        report[cls]['specificity'] = specificity_val
        npv_val = tn / (tn + fn)
        report[cls]['npv'] = npv_val
        
    return report

### Treinamento e validação variando o tamanho do conjunto de dados

In [106]:
fracoes = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] # Frações testadas

df_amostras = {}
for fracao in fracoes:
    df_amostras[fracao] = df.sample(frac=fracao, random_state=0)

In [107]:
def amostrarEvalidar(df_amostras, model):
    print("Nome do modelo: ", type(model).__name__)
    for fracao in df_amostras:
        y = df_amostras[fracao].Comportamento
        X = df_amostras[fracao].drop('Comportamento', axis=1)
        y = y.replace(to_replace=['Ocio', 'Ruminando'], value='Não Pastejando')
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0)
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print("Parcela usada:", fracao*100,"%", " Acurácia:",accuracy_score(y_test, preds)*100)
        print(classification_report(y_test, preds, digits=4))
        print('\n')

#### Random Forest

In [108]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=0, n_jobs=-1)
amostrarEvalidar(df_amostras, rf_model)

Nome do modelo:  RandomForestClassifier
Parcela usada: 0.1 %  Acurácia: 54.166666666666664
                precision    recall  f1-score   support

Não Pastejando     0.5556    0.4167    0.4762        12
    Pastejando     0.5333    0.6667    0.5926        12

      accuracy                         0.5417        24
     macro avg     0.5444    0.5417    0.5344        24
  weighted avg     0.5444    0.5417    0.5344        24



Parcela usada: 0.5 %  Acurácia: 72.26890756302521
                precision    recall  f1-score   support

Não Pastejando     0.7037    0.8636    0.7755        66
    Pastejando     0.7632    0.5472    0.6374        53

      accuracy                         0.7227       119
     macro avg     0.7334    0.7054    0.7064       119
  weighted avg     0.7302    0.7227    0.7140       119



Parcela usada: 1.0 %  Acurácia: 74.68354430379746
                precision    recall  f1-score   support

Não Pastejando     0.7958    0.7847    0.7902       144
    Pastejando

#### LGBMClassifier

In [109]:
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(n_jobs=-1, random_state=0)
amostrarEvalidar(df_amostras, lgb_model)

Nome do modelo:  LGBMClassifier
Parcela usada: 0.1 %  Acurácia: 70.83333333333334
                precision    recall  f1-score   support

Não Pastejando     0.7778    0.5833    0.6667        12
    Pastejando     0.6667    0.8333    0.7407        12

      accuracy                         0.7083        24
     macro avg     0.7222    0.7083    0.7037        24
  weighted avg     0.7222    0.7083    0.7037        24



Parcela usada: 0.5 %  Acurácia: 76.47058823529412
                precision    recall  f1-score   support

Não Pastejando     0.7436    0.8788    0.8056        66
    Pastejando     0.8049    0.6226    0.7021        53

      accuracy                         0.7647       119
     macro avg     0.7742    0.7507    0.7538       119
  weighted avg     0.7709    0.7647    0.7595       119



Parcela usada: 1.0 %  Acurácia: 74.68354430379746
                precision    recall  f1-score   support

Não Pastejando     0.8000    0.7778    0.7887       144
    Pastejando     0.67

#### KNN

In [110]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_jobs=-1)
amostrarEvalidar(df_amostras, knn_model)

Nome do modelo:  KNeighborsClassifier
Parcela usada: 0.1 %  Acurácia: 54.166666666666664
                precision    recall  f1-score   support

Não Pastejando     0.5455    0.5000    0.5217        12
    Pastejando     0.5385    0.5833    0.5600        12

      accuracy                         0.5417        24
     macro avg     0.5420    0.5417    0.5409        24
  weighted avg     0.5420    0.5417    0.5409        24



Parcela usada: 0.5 %  Acurácia: 67.22689075630252
                precision    recall  f1-score   support

Não Pastejando     0.6709    0.8030    0.7310        66
    Pastejando     0.6750    0.5094    0.5806        53

      accuracy                         0.6723       119
     macro avg     0.6729    0.6562    0.6558       119
  weighted avg     0.6727    0.6723    0.6641       119



Parcela usada: 1.0 %  Acurácia: 67.9324894514768
                precision    recall  f1-score   support

Não Pastejando     0.7297    0.7500    0.7397       144
    Pastejando   

#### Redes Neurais

In [111]:
%%capture --no-stdout

from sklearn.neural_network import MLPClassifier

rn_model = MLPClassifier(random_state=0)
amostrarEvalidar(df_amostras, rn_model)

Nome do modelo:  MLPClassifier
Parcela usada: 0.1 %  Acurácia: 62.5
                precision    recall  f1-score   support

Não Pastejando     0.6154    0.6667    0.6400        12
    Pastejando     0.6364    0.5833    0.6087        12

      accuracy                         0.6250        24
     macro avg     0.6259    0.6250    0.6243        24
  weighted avg     0.6259    0.6250    0.6243        24



Parcela usada: 0.5 %  Acurácia: 62.18487394957983
                precision    recall  f1-score   support

Não Pastejando     0.6235    0.8030    0.7020        66
    Pastejando     0.6176    0.3962    0.4828        53

      accuracy                         0.6218       119
     macro avg     0.6206    0.5996    0.5924       119
  weighted avg     0.6209    0.6218    0.6043       119



Parcela usada: 1.0 %  Acurácia: 65.40084388185655
                precision    recall  f1-score   support

Não Pastejando     0.7095    0.7292    0.7192       144
    Pastejando     0.5618    0.5376  

#### Gradient Boosting

In [112]:
from xgboost import XGBClassifier

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

df_oe = df.replace(to_replace=['Ocio', 'Ruminando'], value='Não Pastejando')
df_oe['Comportamento'] = ordinal_encoder.fit_transform(df_oe['Comportamento'].array.reshape(-1,1))

df_amostras_oe = {}
for fracao in fracoes:
    df_amostras_oe[fracao] = df_oe.sample(frac=fracao, random_state=0)

xgb_model = XGBClassifier(use_label_encoder=False, random_state=0, n_jobs=-1, verbosity = 0)
amostrarEvalidar(df_amostras_oe,  xgb_model)

Nome do modelo:  XGBClassifier
Parcela usada: 0.1 %  Acurácia: 62.5
              precision    recall  f1-score   support

         0.0     0.6667    0.5000    0.5714        12
         1.0     0.6000    0.7500    0.6667        12

    accuracy                         0.6250        24
   macro avg     0.6333    0.6250    0.6190        24
weighted avg     0.6333    0.6250    0.6190        24



Parcela usada: 0.5 %  Acurácia: 68.90756302521008
              precision    recall  f1-score   support

         0.0     0.6986    0.7727    0.7338        66
         1.0     0.6739    0.5849    0.6263        53

    accuracy                         0.6891       119
   macro avg     0.6863    0.6788    0.6800       119
weighted avg     0.6876    0.6891    0.6859       119



Parcela usada: 1.0 %  Acurácia: 75.10548523206751
              precision    recall  f1-score   support

         0.0     0.8058    0.7778    0.7915       144
         1.0     0.6735    0.7097    0.6911        93

    accura

### Treinamento e validação variando o tamanho do conjunto de dados de treino, com teste fixo (20% dos dados)

In [113]:
df_train = pd.concat([X_train, y_train], axis=1)

fracoes = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] # Frações testadas

df_train_amostras = {}
for fracao in fracoes:
    df_train_amostras[fracao] = df_train.sample(frac=fracao, random_state=0)

In [114]:
def treinar_e_validar_amostras(df_train_amostras, X_test, y_test, model):
    print("Nome do modelo: ", type(model).__name__)
    for fracao in df_train_amostras:
        y_train = df_train_amostras[fracao].Comportamento
        X_train = df_train_amostras[fracao].drop('Comportamento', axis=1)

        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        print("Parcela usada:", fracao*100,"%", " Acurácia:",accuracy_score(y_test, preds)*100)
        print(classification_report(y_test, preds, digits=4))
        print('\n')

#### Random Forest

In [115]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=0, n_jobs=-1)
treinar_e_validar_amostras(df_train_amostras, X_test, y_test, rf_model)

Nome do modelo:  RandomForestClassifier
Parcela usada: 0.1 %  Acurácia: 68.45691552395463
                precision    recall  f1-score   support

Não Pastejando     0.6923    0.8351    0.7570     13905
    Pastejando     0.6655    0.4693    0.5505      9723

      accuracy                         0.6846     23628
     macro avg     0.6789    0.6522    0.6538     23628
  weighted avg     0.6813    0.6846    0.6720     23628



Parcela usada: 0.5 %  Acurácia: 74.4159471813103
                precision    recall  f1-score   support

Não Pastejando     0.7676    0.8106    0.7886     13905
    Pastejando     0.7056    0.6491    0.6762      9723

      accuracy                         0.7442     23628
     macro avg     0.7366    0.7299    0.7324     23628
  weighted avg     0.7421    0.7442    0.7423     23628



Parcela usada: 1.0 %  Acurácia: 76.69713898764178
                precision    recall  f1-score   support

Não Pastejando     0.7905    0.8219    0.8059     13905
    Pastejando  

#### LGBMClassifier

In [116]:
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(n_jobs=-1, random_state=0)
treinar_e_validar_amostras(df_train_amostras, X_test, y_test, lgb_model)

Nome do modelo:  LGBMClassifier
Parcela usada: 0.1 %  Acurácia: 66.69206026747926
                precision    recall  f1-score   support

Não Pastejando     0.6896    0.7894    0.7361     13905
    Pastejando     0.6202    0.4918    0.5486      9723

      accuracy                         0.6669     23628
     macro avg     0.6549    0.6406    0.6423     23628
  weighted avg     0.6610    0.6669    0.6589     23628



Parcela usada: 0.5 %  Acurácia: 73.25630607753513
                precision    recall  f1-score   support

Não Pastejando     0.7509    0.8164    0.7823     13905
    Pastejando     0.7000    0.6127    0.6534      9723

      accuracy                         0.7326     23628
     macro avg     0.7254    0.7145    0.7179     23628
  weighted avg     0.7300    0.7326    0.7293     23628



Parcela usada: 1.0 %  Acurácia: 75.65600135432537
                precision    recall  f1-score   support

Não Pastejando     0.7817    0.8134    0.7973     13905
    Pastejando     0.71

#### KNN

In [117]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_jobs=-1)
treinar_e_validar_amostras(df_train_amostras, X_test, y_test, knn_model)

Nome do modelo:  KNeighborsClassifier
Parcela usada: 0.1 %  Acurácia: 65.78635517183004
                precision    recall  f1-score   support

Não Pastejando     0.6744    0.8095    0.7358     13905
    Pastejando     0.6181    0.4410    0.5148      9723

      accuracy                         0.6579     23628
     macro avg     0.6463    0.6253    0.6253     23628
  weighted avg     0.6512    0.6579    0.6448     23628



Parcela usada: 0.5 %  Acurácia: 67.22532588454376
                precision    recall  f1-score   support

Não Pastejando     0.7032    0.7667    0.7336     13905
    Pastejando     0.6169    0.5372    0.5743      9723

      accuracy                         0.6723     23628
     macro avg     0.6600    0.6519    0.6539     23628
  weighted avg     0.6677    0.6723    0.6680     23628



Parcela usada: 1.0 %  Acurácia: 69.57846622651091
                precision    recall  f1-score   support

Não Pastejando     0.7209    0.7883    0.7531     13905
    Pastejando   

#### Redes Neurais

In [118]:
%%capture --no-stdout

from sklearn.neural_network import MLPClassifier

rn_model = MLPClassifier(random_state=0)
treinar_e_validar_amostras(df_train_amostras, X_test, y_test, rn_model)

Nome do modelo:  MLPClassifier
Parcela usada: 0.1 %  Acurácia: 65.49432876248518
                precision    recall  f1-score   support

Não Pastejando     0.6778    0.7885    0.7290     13905
    Pastejando     0.6053    0.4640    0.5253      9723

      accuracy                         0.6549     23628
     macro avg     0.6416    0.6262    0.6271     23628
  weighted avg     0.6480    0.6549    0.6452     23628



Parcela usada: 0.5 %  Acurácia: 65.60859996614187
                precision    recall  f1-score   support

Não Pastejando     0.6968    0.7357    0.7157     13905
    Pastejando     0.5892    0.5422    0.5648      9723

      accuracy                         0.6561     23628
     macro avg     0.6430    0.6390    0.6402     23628
  weighted avg     0.6526    0.6561    0.6536     23628



Parcela usada: 1.0 %  Acurácia: 67.0348738784493
                precision    recall  f1-score   support

Não Pastejando     0.6979    0.7755    0.7347     13905
    Pastejando     0.6182

#### Gradient Boosting

In [119]:
%%capture --no-stdout

from xgboost import XGBClassifier

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

y_test_df = pd.DataFrame(y_test)

df_train_oe = df_train
df_train_oe['Comportamento'] = ordinal_encoder.fit_transform(df_train_oe['Comportamento'].array.reshape(-1,1))
y_test_oe =  ordinal_encoder.transform(y_test_df)

df_train_amostras_oe = {}
for fracao in fracoes:
    df_train_amostras_oe[fracao] = df_train_oe.sample(frac=fracao, random_state=0)

xgb_model = XGBClassifier(use_label_encoder=False, random_state=0, n_jobs=-1, verbosity = 0)
treinar_e_validar_amostras(df_train_amostras_oe, X_test, y_test_oe, xgb_model)

Nome do modelo:  XGBClassifier
Parcela usada: 0.1 %  Acurácia: 65.4139156932453
              precision    recall  f1-score   support

         0.0     0.6752    0.7944    0.7300     13905
         1.0     0.6067    0.4536    0.5191      9723

    accuracy                         0.6541     23628
   macro avg     0.6410    0.6240    0.6245     23628
weighted avg     0.6470    0.6541    0.6432     23628



Parcela usada: 0.5 %  Acurácia: 73.70492635855764
              precision    recall  f1-score   support

         0.0     0.7534    0.8224    0.7864     13905
         1.0     0.7077    0.6149    0.6581      9723

    accuracy                         0.7370     23628
   macro avg     0.7306    0.7187    0.7222     23628
weighted avg     0.7346    0.7370    0.7336     23628



Parcela usada: 1.0 %  Acurácia: 75.22854240731336
              precision    recall  f1-score   support

         0.0     0.7762    0.8137    0.7945     13905
         1.0     0.7138    0.6644    0.6882      9723