In [8]:
import pandas as pd
import numpy as np

## Leave One Group Out

In [9]:
from sklearn.model_selection import LeaveOneGroupOut
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder

In [30]:
# Modelos PRO
lgb_model = LGBMClassifier(learning_rate=0.1, num_leaves=512, min_child_samples=1, subsample=1.0, 
                           colsample_bytree=0.662, random_state=0, subsample_freq=1, n_estimators=100, n_jobs=-1)

rf_model = RandomForestClassifier(n_estimators=200, max_features='sqrt', max_depth=None, 
                                  criterion='entropy', min_samples_split=10, min_samples_leaf=2 , bootstrap=True, random_state=0, n_jobs=-1)

knn_model = KNeighborsClassifier(n_neighbors=37, weights='distance', leaf_size=20, p=1, n_jobs=-1)

xgb_model = XGBClassifier(learning_rate=0.1, max_depth=16, min_child_weight=1, gamma=0.0, colsample_bytree=0.5370264280766771, use_label_encoder=False, random_state=0, n_jobs=-1)

models = [xgb_model]

In [26]:
def executarLogo(models, groups, X, y):
    logo = LeaveOneGroupOut()
    for model in models:
        if(model == xgb_model):
            y = enc.transform(y.array.reshape(-1, 1))
        cv = logo.split(X, y, groups)
        resultado = cross_validate(model, X, y, cv=cv, n_jobs=-1, return_estimator=True)
        print(resultado['estimator'][0], '-> Acurácia: ' , resultado['test_score'].mean()*100, '% Tempo de treino: ',
             resultado['fit_time'].sum())

In [13]:
caminho_dados_lazaro = 'Datasets/Lazaro/dados_pre_processados_lazaro.csv'

In [14]:
df = pd.read_csv(caminho_dados_lazaro)

In [15]:
# Remove as linhas com o sensor 434138
df = df[df.SensorID != 434138]

In [16]:
df.columns

Index(['Piquete', 'Altura real', 'Altura esperada', 'Animal', 'SensorID',
       'Sensor Name', 'Date', 'X', 'Y', 'Z', 'Comportamento', 'Season_fall',
       'Season_spring', 'Season_summer', 'magn', 'sma', 'ent', 'pit', 'rol',
       'icl'],
      dtype='object')

#### Divisão entre variáveis preditoras(X) e variáveis de resposta(y)

In [19]:
y = df.Comportamento
X = df[['X', 'Y' ,'Z', 'Season_fall', 'Season_spring', 'Season_summer', 'magn', 'sma', 'ent', 'pit', 'rol', 'icl']]

enc = OrdinalEncoder()
enc.fit(y.array.reshape(-1, 1))

OrdinalEncoder()

In [35]:
# Conjunto de dados PNP: Pastejando x Não Pastejando
y = y.replace(to_replace=['Ocio', 'Ruminando'], value='Não Pastejando')

In [21]:
# Para reescalar os dados
from sklearn.preprocessing import StandardScaler

X_season = X[['Season_fall', 'Season_spring', 'Season_summer']]

X = X.drop(['Season_fall', 'Season_spring', 'Season_summer'], axis=1)

columns = X.columns
index= X.index

sc = StandardScaler()
X = sc.fit_transform(X)

X = pd.DataFrame(X, columns=columns, index=index)

X = pd.concat([X, X_season], axis=1)

#### Leave Fall Out

In [22]:
groups = np.array(X.Season_fall.values)

In [23]:
lfo = LeaveOneGroupOut()
lfo.get_n_splits(X, y, groups)

2

In [24]:
cv = lfo.split(X, y, groups)

In [27]:
executarLogo(models, groups, X, y)

LGBMClassifier(colsample_bytree=0.662, min_child_samples=1, num_leaves=512,
               random_state=0, subsample_freq=1) -> Acurácia:  56.05743227758413 % Tempo de treino:  19.188790798187256
RandomForestClassifier(criterion='entropy', max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=200, n_jobs=-1, random_state=0) -> Acurácia:  55.42669762180896 % Tempo de treino:  143.33033275604248
KNeighborsClassifier(leaf_size=20, n_jobs=-1, n_neighbors=37, p=1,
                     weights='distance') -> Acurácia:  55.052964369606826 % Tempo de treino:  2.4779584407806396
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5370264280766771,
              enable_categorical=False, gamma=0.0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=16,
              min_child_

#### Leave Summer Out

In [31]:
groups = np.array(X.Season_summer.values)

In [32]:
executarLogo(models, groups, X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5370264280766771,
              enable_categorical=False, gamma=0.0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=16,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None) -> Acurácia:  54.491073504413514 % Tempo de treino:  329.0474798679352


#### Leave Spring Out

In [33]:
groups = np.array(X.Season_spring.values)

In [34]:
executarLogo(models, groups, X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5370264280766771,
              enable_categorical=False, gamma=0.0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=16,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None) -> Acurácia:  47.62834623107763 % Tempo de treino:  301.28445196151733
