## Stack - Projeto Human Resources Analytics

### Machine Learning.

In [1]:
import pandas as pd
import datetime
import glob
from minio import Minio
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Connect to the MinIO object store that backs the data lake.
# Endpoint and credentials can be overridden through environment variables so
# real secrets never have to live in the notebook; the fallbacks are the
# original local-development values.
import os

client = Minio(
    os.environ.get("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    secure=False,
)

### Baixando o Dataset do Data Lake.

In [3]:
# Download the employees dataset from the "processing" bucket to a local
# temporary file, then load that file into a DataFrame.
local_parquet = "temp_.parquet"
client.fget_object(
    bucket_name="processing",
    object_name="employees_dataset.parquet",
    file_path=local_parquet,
)
df = pd.read_parquet(local_parquet)

In [None]:
# Preview the first rows of `df`.
df.head()

#### Organizando o dataset.

In [None]:
# Keep only the columns used downstream, in this order.
selected_columns = [
    'department',
    'salary',
    'mean_work_last_3_months',
    'number_projects',
    'satisfaction_level',
    'last_evaluation',
    'time_in_company',
    'work_accident',
    'left',
]
df = df[selected_columns]

In [None]:
# Preview the first rows after the column selection.
df.head()

#### Verificando os registros missing.

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Show only the rows that contain at least one missing value.
# Indexing with the full `df.notnull()` mask returns every row with NaN left
# in place, so it does not isolate the incomplete records.
df[df.isnull().any(axis=1)]

In [None]:
# Drop every row that has a missing value instead of truncating to a
# hard-coded row count, which silently breaks when the dataset size changes.
# `.copy()` yields an independent frame so later column assignments do not
# raise SettingWithCopyWarning.
df = df.dropna().copy()

#### Alterando os tipos de dados.

In [None]:
# Cast the count and flag columns to integers in a single call.
# Building a new frame with `astype` avoids assigning columns one by one on a
# sliced DataFrame, which raises SettingWithCopyWarning.
integer_columns = [
    "number_projects",
    "mean_work_last_3_months",
    "time_in_company",
    "work_accident",
    "left",
]
df = df.astype({column: int for column in integer_columns})

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows after the dtype conversion.
df.head()

In [None]:
# Keep at most the first 14998 rows.
# NOTE(review): the row count is hard-coded; presumably it matches the size of
# the cleaned dataset - confirm.
df = df[:14998]

#### Renomeando atributos

In [None]:
# Map the raw column names to the shorter names used in the rest of the
# notebook; columns not listed here keep their name.
column_aliases = {
    'satisfaction_level': 'satisfaction',
    'last_evaluation': 'evaluation',
    'number_projects': 'projectCount',
    'mean_work_last_3_months': 'averageMonthlyHours',
    'time_in_company': 'yearsAtCompany',
    'work_accident': 'workAccident',
    'left': 'turnover',
}
df = df.rename(columns=column_aliases)

In [None]:
# Preview the first rows with the renamed columns.
df.head()

### Importância de Features

#### Convertendo os atributos categóricos em valores numéricos.

In [None]:
# Replace each categorical column by its integer category code.
for categorical_column in ("department", "salary"):
    df[categorical_column] = df[categorical_column].astype('category').cat.codes

In [None]:
# Preview the first rows with the encoded categorical columns.
df.head()

#### Separando os conjuntos de dados.

In [None]:
# Split the frame into the target vector y and the feature matrix X
# (every column except the target).
target_name = 'turnover'
y = df[target_name]
X = df.drop(columns=[target_name])

#### Transformando os dados.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# MinMaxScaler created with its default settings.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on X and replace X with the array of scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later, so held-out rows contribute to the scaling statistics.
X = scaler.fit_transform(X)

In [None]:
# Display the current feature matrix.
X

#### Separando os conjuntos.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions equal in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

#### Treinando o algoritmo de árvore de decisão.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree on the training split to rank the features.
# The tree is seeded because scikit-learn permutes the candidate features at
# every split; without a fixed random_state the importances can change from
# run to run. 123 matches the seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=123)
dtree = dtree.fit(X_train, y_train)

In [None]:
# Column labels of the features (every column except the target) and the
# importance score the fitted tree assigned to each of them.
feat_names = df.columns.drop('turnover')
importances = dtree.feature_importances_

In [None]:
# Bar chart of the feature importances, sorted from highest to lowest.
indices = np.argsort(importances)[::-1]
bar_positions = range(len(indices))

fig, ax = plt.subplots(figsize=(12, 4))
ax.set_title("Feature importances by DecisionTreeClassifier")
ax.bar(bar_positions, importances[indices], color='lightblue', align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(feat_names[indices], rotation='vertical', fontsize=14)
ax.set_xlim([-1, len(indices)])
plt.show()

#### Filtrando apenas os atributos relevantes.

In [None]:
# Restrict the feature matrix to the four columns retained for modelling.
relevant_features = [
    "satisfaction",
    "evaluation",
    "averageMonthlyHours",
    "yearsAtCompany",
]
X = df[relevant_features]

#### Separando os conjuntos de dados.

In [None]:
scaler = MinMaxScaler()

In [None]:
X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
        X
       ,y
       ,test_size = 0.2
       ,random_state = 123
       ,stratify = y
)

In [None]:
# Display the training feature matrix.
X_train

#### Função do modelo de base.

In [None]:
def base_rate_model(X):
    """Return an all-zero prediction for every row of `X`.

    Args:
        X: Object exposing a `shape` attribute whose first entry is the
            number of rows.

    Returns:
        A 1-D NumPy array of zeros with one entry per row of `X`.
    """
    n_rows = X.shape[0]
    return np.zeros(n_rows)

#### Importando métodos de métrica de avaliação.

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
def accuracy_result(y_test, y_predict):
    """Print the accuracy of `y_predict` against `y_test`, two decimals.

    Args:
        y_test: True target values.
        y_predict: Predicted values compared against `y_test`.
    """
    print("Accuracy = %2.2f" % accuracy_score(y_test, y_predict))

In [None]:
def roc_classification_report_results(model, y_test, y_predict):
    """Print the ROC AUC score and the classification report for a model.

    Args:
        model: Label shown in the printed AUC line.
        y_test: True target values.
        y_predict: Predicted values compared against `y_test`.

    NOTE(review): the callers in this notebook pass hard class labels from
    `predict`, so the AUC is computed from labels rather than from scores.
    """
    auc_value = roc_auc_score(y_test, y_predict)
    report_text = classification_report(y_test, y_predict)

    print("\n{} AUC = {}\n".format(model, auc_value))
    print(report_text)

#### Análise do modelo de baseline

In [None]:
# Baseline predictions: base_rate_model returns a zero for every test row.
y_predict = base_rate_model(X_test)

In [None]:
# Print the accuracy of the current predictions on the test labels.
accuracy_result(y_test, y_predict)

In [None]:
# Print the AUC and classification report, labelled "Base Model".
roc_classification_report_results("Base Model", y_test, y_predict)

### Modelo de Regressão Logística.

#### Instanciando o algoritmo.

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression created with scikit-learn's default hyperparameters.
logis = LogisticRegression()

#### Realizando o treinamento.

In [None]:
# Fit the logistic regression on the training split.
logis.fit(X_train, y_train)

#### Calculando as predições.

In [None]:
# Predict class labels for the test rows.
y_predict = logis.predict(X_test)

#### Avaliando o resultado.

In [None]:
# Print the accuracy of the current predictions on the test labels.
accuracy_result(y_test, y_predict)

In [None]:
# Print the AUC and classification report, labelled "Logistic Regression".
roc_classification_report_results("Logistic Regression", y_test, y_predict)

### Modelo de Árvore de Decisão.

#### Instanciando o algoritmo.

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so repeated runs yield the same model and metrics; 123 matches
# the seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=123)

#### Realizando o treinamento.

In [None]:
# Fit the decision tree on the training split.
dtree = dtree.fit(X_train,y_train)

#### Calculando as predições.

In [None]:
# Predict class labels for the test rows.
y_predict = dtree.predict(X_test)

#### Avaliando o resultado.

In [None]:
# Print the accuracy of the current predictions on the test labels.
accuracy_result(y_test, y_predict)

In [None]:
# Print the AUC and classification report, labelled "Decision Tree".
roc_classification_report_results("Decision Tree", y_test, y_predict)

### Modelo de Floresta Aleatória (Random Forest)

#### Instanciando o algoritmo.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so repeated runs yield the same model and metrics (bootstrap
# samples and feature subsets are drawn at random); 123 matches the seed used
# for the train/test split.
rf = RandomForestClassifier(random_state=123)

#### Realizando o treinamento.

In [None]:
# Fit the random forest on the training split.
rf = rf.fit(X_train,y_train)

#### Calculando as predições.

In [None]:
# Predict class labels for the test rows.
y_predict = rf.predict(X_test)

#### Avaliando o resultado.

In [None]:
# Print the accuracy of the current predictions on the test labels.
accuracy_result(y_test, y_predict)

In [None]:
# Print the AUC and classification report, labelled "Random Forest".
roc_classification_report_results("Random Forest", y_test, y_predict)

### Pycaret

In [None]:
pip install pycaret

#### Importando os métodos.

In [None]:
from pycaret.classification import *

#### Definindo o Setup.

In [None]:
# Configure the PyCaret experiment on the four selected features plus the
# target: min-max normalisation, stratified train/test split and class
# rebalancing are enabled.
# `session_id` seeds the experiment so results are reproducible between runs;
# 123 matches the seed used for the scikit-learn split earlier in the notebook.
experiment_columns = [
    "satisfaction",
    "evaluation",
    "averageMonthlyHours",
    "yearsAtCompany",
    "turnover",
]
s = setup(
    df[experiment_columns],
    target="turnover",
    numeric_features=["yearsAtCompany"],
    normalize=True,
    normalize_method="minmax",
    data_split_stratify=True,
    fix_imbalance=True,
    session_id=123,
)

#### Comparando diferentes modelos.

In [None]:
# Cross-validate the candidate models with 5 folds and rank them by AUC;
# `best` holds the value returned by compare_models.
best = compare_models(fold = 5,sort = 'AUC')

#### Criando o modelo.

In [None]:
# Train the model registered under the PyCaret id 'gbc' with 5-fold
# cross-validation.
gbc = create_model('gbc', fold = 5)

#### Realizando o tuning do modelo.

In [None]:
# Search the hyperparameter grid with 5-fold cross-validation, optimising AUC.
# `min_samples_split` starts at 2: scikit-learn rejects an integer value of 1
# (a node needs at least two samples to be split), so keeping 1 in the grid
# only produces failed fits.
tuned_gbc = tune_model(
    gbc,
    fold=5,
    custom_grid={
        "learning_rate": [0.1, 0.2, 0.5],
        "n_estimators": [100, 500, 1000],
        "min_samples_split": [2, 5, 10],
        "max_depth": [1, 3, 9],
    },
    optimize='AUC',
)

#### Finalizando o modelo.

In [None]:
# Refit the TUNED model on the complete dataset. Passing the untuned `gbc`
# here would discard the result of the hyperparameter search.
final_model = finalize_model(tuned_gbc)

In [None]:
# Persist the finalized model to disk under the name 'model'.
# NOTE(review): the upload step expects a file called "model.pkl", so
# save_model presumably appends the ".pkl" extension - confirm.
save_model(final_model,'model')

#### Transferindo os arquivos para o Data Lake.

#### Modelo de Classificação.

In [None]:
# Upload the local model.pkl file to the "curated" bucket under the same name.
client.fput_object(
    bucket_name="curated",
    object_name="model.pkl",
    file_path="model.pkl",
)

#### Exportando o conjunto de dados para o disco.

In [None]:
# Write the DataFrame to dataset.csv without the index column.
df.to_csv("dataset.csv",index=False)

In [None]:
# Upload the local dataset.csv file to the "curated" bucket under the same
# name.
client.fput_object(
    bucket_name="curated",
    object_name="dataset.csv",
    file_path="dataset.csv",
)