In [1]:
import urllib.request
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
file_name = 'dataR2.csv'

def download_file(file_name):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv'
    urllib.request.urlretrieve(url, file_name)

download_file(file_name)

In [3]:
data = pd.read_csv(file_name)
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [4]:
data["Classification"].value_counts()

2    64
1    52
Name: Classification, dtype: int64

In [5]:
data.shape

(116, 10)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    int64  
dtypes: float64(7), int64(3)
memory usage: 9.2 KB


**Insights**:

* O conjunto de dados é composto por 10 colunas, 9 das quais correspondem a variáveis ​​independentes, que usaremos para prever o alvo.

* A classificação é a variável a prever. Todas as variáveis ​​são numéricas, inteiras ou reais, e não possuem valores nulos.

In [7]:
X = data.drop(["Classification"], axis=1)
y = data["Classification"]

In [8]:
# Vamos separar em 70/30
(X_train, X_test,
y_train, y_test) = train_test_split(X,
                                    y,
                                    stratify=y,
                                    test_size = 0.30,
                                    random_state = 11)

# Primeiro uma decision tree

In [9]:
#Criamos a função e ajustamos
dstree = DecisionTreeClassifier(random_state=11).fit(X_train, y_train)

y_test_pred = dstree.predict(X_test)

In [10]:
test_accuracy = accuracy_score(y_test, y_test_pred)

print('% Acurácia:', test_accuracy)

% Acurácia: 0.6


# Agora Radom Forest

In [11]:
#Criamos um random forest!
rf = RandomForestClassifier(random_state=11).fit(X_train, y_train)

y_test_pred = rf.predict(X_test) #Prediccion en Test

In [12]:
#Acurácia
test_accuracy = accuracy_score(y_test, y_test_pred)

print('% Acurácia:', test_accuracy)

% Acurácia: 0.6857142857142857


Conclusão!

Simplesmente usando o algoritmo de floresta aleatória, obtemos uma precisão melhor do que usando um DecisionTreeClassifier 😉

# Indo mais fundo na floresta

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [13]:
# n_estimators = 50, max_depth = 2, max_features = 3

rf = RandomForestClassifier(random_state = 11, 
                            n_estimators = 40, 
                            max_depth = 2, 
                            max_features = 2).fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

In [14]:
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print('% Acurácia:', train_accuracy, test_accuracy)

% Acurácia: 0.8518518518518519 0.8285714285714286


In [15]:
train_AUC = roc_auc_score(y_train, y_train_pred)
test_AUC  = roc_auc_score(y_test, y_test_pred)

print('% AUC:', train_AUC, test_AUC)

% AUC: 0.8444444444444443 0.8273026315789473


In [16]:
def gini_score(y_actual, y_pred):
    gini = lambda a, p: 2 * roc_auc_score(a, p) - 1
    return gini(y_actual, y_pred) / gini(y_actual, y_actual)

In [17]:
train_gini = gini_score(y_train, y_train_pred)
test_gini  = gini_score(y_test, y_test_pred)

print('Gini:', train_gini, test_gini)

Gini: 0.6888888888888887 0.6546052631578947
