# Modelagem Preditiva (Random Forest)

notebook responsável por realizar o treinando do modelo, utilizando o algoritmo de random forest, para predizer (diabetes_flag = 1).

## 1.0. Leitura do dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from google.colab import files

path = 'dataset_ajustado.csv'
df = pd.read_csv(path)
df.shape

(253680, 18)

In [3]:
df.head()

Unnamed: 0,pressao_alta,colesterol_alto,imc,fumante,avc,ataque_cardiaco,atividade_fisica,frutas,vegetais,alto_consumo_alcool,convenio_medico,dificuldade_subir_escadas,sexo,idade,faixa_etaria,diabetes_flag,risco_total,categoria_risco
0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,9.0,60-64,0,6,Alto
1,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,50-54,0,2,Moderado
2,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,9.0,60-64,0,5,Alto
3,1.0,0.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,11.0,70-74,0,3,Moderado
4,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,11.0,70-74,0,3,Moderado


## 2.0. Seleção dos dados

In [4]:
features = []
for c in ['imc','idade','pressao_alta','colesterol_alto','atividade_fisica','fumante','ataque_cardiaco','avc', 'frutas', 'vegetais','alto_consumo_alcool', 'sexo']:
    if c in df.columns:
        features.append(c)

X = df[features].fillna(0)
y = df['diabetes_flag']

In [5]:
X

Unnamed: 0,imc,idade,pressao_alta,colesterol_alto,atividade_fisica,fumante,ataque_cardiaco,avc,frutas,vegetais,alto_consumo_alcool,sexo
0,40.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,25.0,7.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28.0,9.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,27.0,11.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,24.0,11.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
253675,45.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
253676,18.0,11.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
253677,28.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
253678,23.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


In [6]:
y

Unnamed: 0,diabetes_flag
0,0
1,0
2,0
3,0
4,0
...,...
253675,0
253676,1
253677,0
253678,0


## 3.0. Data Splitting

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape)

(190260, 12) (63420, 12)


## 4.0. Random Forest

In [8]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

## 4.1. Avaliando a perfomance do modelo

In [9]:
y_pred = rf.predict(X_test)
print('Acurácia:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print('ROC AUC (prob):', roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))

Acurácia: 0.843519394512772
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     54657
           1       0.37      0.19      0.25      8763

    accuracy                           0.84     63420
   macro avg       0.63      0.57      0.58     63420
weighted avg       0.81      0.84      0.82     63420

ROC AUC (prob): 0.7266346066141977


## 4.2. Features Importances

In [10]:
importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
importances

Unnamed: 0,0
imc,0.408703
idade,0.230079
pressao_alta,0.090904
colesterol_alto,0.045105
ataque_cardiaco,0.036375
frutas,0.034289
sexo,0.033895
fumante,0.03192
vegetais,0.030037
atividade_fisica,0.026524


## 4.3. Salvar previsões no dataset

In [13]:
probs = rf.predict_proba(X)[:,1]
df['Probabilidade_RF'] = probs


In [14]:
df.head()

Unnamed: 0,pressao_alta,colesterol_alto,imc,fumante,avc,ataque_cardiaco,atividade_fisica,frutas,vegetais,alto_consumo_alcool,convenio_medico,dificuldade_subir_escadas,sexo,idade,faixa_etaria,diabetes_flag,risco_total,categoria_risco,Probabilidade_RF
0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,9.0,60-64,0,6,Alto,0.157405
1,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,50-54,0,2,Moderado,0.0
2,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,9.0,60-64,0,5,Alto,0.035
3,1.0,0.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,11.0,70-74,0,3,Moderado,0.192143
4,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,11.0,70-74,0,3,Moderado,0.090569


In [16]:
out='dataset_com_previsoes.csv'
df.to_csv(out,index=False)
files.download(out)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>