# <center> [INDICIUM] - Processo Seletivo - Lighthouse Programa de Formação em Dados (Remoto) </center>
    
# <center> Treino e Teste do Modelo</center>

### Objetivo: Identificar quais máquinas apresentam potencial de falha tendo como base dados extraídos através de sensores durante o processo de manufatura. ###

In [1]:
#Import das libs necessárias no projeto
import pandas as pd
import seaborn as sn
import numpy as np

#bliblotecas para a preparação do modelo
from sklearn.model_selection import RepeatedKFold, KFold, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

#from lightgbm import LGBMRegressor

import lightgbm as lgb


#import label encoder
from sklearn import preprocessing

In [2]:
#Ler os dados do csv e transformar em um dataframe para possibilitar a sua manipulação
df_treino = pd.read_csv("./desafio_manutencao_preditiva_treino.csv")
df_teste = pd.read_csv("./desafio_manutencao_preditiva_teste.csv")

In [3]:
print("Shape train: %s, test: %s" % (df_treino.shape, df_teste.shape))

Shape train: (6667, 9), test: (3333, 8)


In [4]:
df_treino.head()

Unnamed: 0,udi,product_id,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,1,M14860,M,298.1,308.6,1551,42.8,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,No Failure
2,5,L47184,L,298.2,308.7,1408,40.0,9,No Failure
3,6,M14865,M,298.1,308.6,1425,41.9,11,No Failure
4,7,L47186,L,298.1,308.6,1558,42.4,14,No Failure


## Parte 1 - Transformação e Preprocessamento dos dados

In [5]:
#Criando outro dataframe com base no original para poder manipular os dados 
#Importante: Só serão copiados as colunas necessárias para o processo de transformação e treino do modelo. 
df_treino_tf = df_treino[["air_temperature_k", "process_temperature_k", "rotational_speed_rpm", "torque_nm", "tool_wear_min", "failure_type"]].copy()

In [6]:
#o novo dataframe criado
df_treino_tf.head()

Unnamed: 0,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,298.1,308.6,1551,42.8,0,No Failure
1,298.2,308.7,1408,46.3,3,No Failure
2,298.2,308.7,1408,40.0,9,No Failure
3,298.1,308.6,1425,41.9,11,No Failure
4,298.1,308.6,1558,42.4,14,No Failure


In [7]:
df_treino_tf['failure_type'].unique()

array(['No Failure', 'Power Failure', 'Tool Wear Failure',
       'Overstrain Failure', 'Random Failures',
       'Heat Dissipation Failure'], dtype=object)

In [8]:
#transformar o y  
#make an instance of Label Encoder
label_encoder = preprocessing.LabelEncoder()
df_treino_tf["failure_type"] = label_encoder.fit_transform(df_treino_tf["failure_type"])

In [9]:
df_treino_tf['failure_type'].unique()

array([1, 3, 5, 2, 4, 0])

In [10]:
#O dataframe com os dados da coluna failure_type transformados
df_treino_tf.head()

Unnamed: 0,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,298.1,308.6,1551,42.8,0,1
1,298.2,308.7,1408,46.3,3,1
2,298.2,308.7,1408,40.0,9,1
3,298.1,308.6,1425,41.9,11,1
4,298.1,308.6,1558,42.4,14,1


In [11]:
#definindo o y
y = df_treino_tf["failure_type"]
y.sample(3)

5341    1
3043    1
5154    1
Name: failure_type, dtype: int64

In [12]:
#retirando o y do conjunto de treino e as colunas que não seram utilizadas durante o processo de treinamento
df_treino_tf.drop(["failure_type"], axis=1, inplace=True)

In [13]:
df_treino_tf

Unnamed: 0,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
0,298.1,308.6,1551,42.8,0
1,298.2,308.7,1408,46.3,3
2,298.2,308.7,1408,40.0,9
3,298.1,308.6,1425,41.9,11
4,298.1,308.6,1558,42.4,14
...,...,...,...,...,...
6662,298.8,308.3,1634,27.9,12
6663,298.8,308.4,1604,29.5,14
6664,298.9,308.4,1632,31.8,17
6665,299.0,308.7,1408,48.5,25


In [14]:
#definindo o conjunto de treino
X_train, X_test, y_train, y_test = train_test_split(df_treino_tf, y, test_size=0.2, random_state=42)

## Parte 2 - Criação dos modelos

In [15]:
#create model
def model_lgbmr():
    model = LGBMRegressor(objective='regression',
                          num_leaves=166,
                          learning_rate=0.05, 
                          n_estimators=120,
                          max_bin = 55, 
                          bagging_fraction = 0.8,
                          bagging_freq = 5, 
                          feature_fraction = 0.2319,
                          feature_fraction_seed=9, 
                          bagging_seed=9,
                          min_data_in_leaf =6, 
                          min_sum_hessian_in_leaf = 11)
    return model

In [16]:
gbm = model_lgbmr()

In [17]:
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1')

[1]	valid_0's l1: 0.0901653	valid_0's l2: 0.116488
[2]	valid_0's l1: 0.089679	valid_0's l2: 0.115775
[3]	valid_0's l1: 0.089217	valid_0's l2: 0.115133
[4]	valid_0's l1: 0.0891168	valid_0's l2: 0.115048
[5]	valid_0's l1: 0.0888455	valid_0's l2: 0.114352
[6]	valid_0's l1: 0.0889382	valid_0's l2: 0.114241
[7]	valid_0's l1: 0.0890263	valid_0's l2: 0.114154
[8]	valid_0's l1: 0.0891141	valid_0's l2: 0.113561
[9]	valid_0's l1: 0.0886885	valid_0's l2: 0.112785
[10]	valid_0's l1: 0.0887659	valid_0's l2: 0.112273
[11]	valid_0's l1: 0.0887702	valid_0's l2: 0.112267
[12]	valid_0's l1: 0.0882188	valid_0's l2: 0.111713
[13]	valid_0's l1: 0.0881691	valid_0's l2: 0.111745
[14]	valid_0's l1: 0.0880297	valid_0's l2: 0.1114
[15]	valid_0's l1: 0.0880628	valid_0's l2: 0.111404
[16]	valid_0's l1: 0.0879208	valid_0's l2: 0.111118
[17]	valid_0's l1: 0.0877908	valid_0's l2: 0.110874
[18]	valid_0's l1: 0.0878518	valid_0's l2: 0.110874
[19]	valid_0's l1: 0.0877538	valid_0's l2: 0.110668
[20]	valid_0's l1: 0.0872

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              feature_fraction=0.2319, feature_fraction_seed=9,
              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
              min_sum_hessian_in_leaf=11, n_estimators=120, num_leaves=166,
              objective='regression')

In [None]:
#criando o cross validation

# 5 Fold Cross validation
kf = KFold(n_splits=5, shuffle=True)
cv_scores, cv_std = [], []

In [19]:
y_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)

In [20]:
# Basic RMSE
print('The rmse of prediction is:', round(mean_squared_log_error(y_pred, y_train) ** 0.5, 5))

The rmse of prediction is: 0.12563


In [22]:
test_pred = np.expm1(gbm.predict(df_teste, num_iteration=gbm.best_iteration_))

ValueError: Number of features of the model must match the input. Model n_features_ is 5 and input n_features is 8

In [None]:
df_teste["failure_type"] = test_pred
df_test.to_csv("resultado.csv", columns=["Id", "failure_type"], index=False)

In [18]:
# evaluate model
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

NameError: name 'model' is not defined

In [None]:
# Creation of the RMSE metric:    
def rmse(model):
    return np.sqrt(-cross_val_score(model, data, target, scoring="neg_mean_squared_error", cv=kf))

In [None]:
def apply_learning_algorithm(model):
    score = rmse(model)
    cv_scores.append(score.mean())
    cv_std.append(score.std())

In [None]:
model = model_lgbmr()
model.fit()