In [41]:
import numpy as np 
import pandas as pd 
import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [32]:
data = "/Users/felipeoliveira/Documents/test-cases/casas-bahia-assingment/data/liver_cirrhosis.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,2221,C,Placebo,18499,F,N,Y,N,N,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256.0,9.9,1
1,1230,C,Placebo,19724,M,Y,N,Y,N,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2
2,4184,C,Placebo,11839,F,N,N,N,N,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2
3,2090,D,Placebo,16467,F,N,N,N,N,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2
4,2105,D,Placebo,21699,F,N,Y,N,N,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1


In [33]:
df["Status"].value_counts()

Status
C     13516
D      9456
CL     2028
Name: count, dtype: int64

In [34]:
df.shape

(25000, 19)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         25000 non-null  int64  
 1   Status         25000 non-null  object 
 2   Drug           25000 non-null  object 
 3   Age            25000 non-null  int64  
 4   Sex            25000 non-null  object 
 5   Ascites        25000 non-null  object 
 6   Hepatomegaly   25000 non-null  object 
 7   Spiders        25000 non-null  object 
 8   Edema          25000 non-null  object 
 9   Bilirubin      25000 non-null  float64
 10  Cholesterol    25000 non-null  float64
 11  Albumin        25000 non-null  float64
 12  Copper         25000 non-null  float64
 13  Alk_Phos       25000 non-null  float64
 14  SGOT           25000 non-null  float64
 15  Tryglicerides  25000 non-null  float64
 16  Platelets      25000 non-null  float64
 17  Prothrombin    25000 non-null  float64
 18  Stage 

In [36]:
# removendo features
df.drop(columns=["Alk_Phos", "SGOT", "Tryglicerides", "Platelets"], axis=1, inplace=True)

In [37]:
# Separando features e target
X = df.drop('Status', axis=1)
y = df['Status']

# Definindo as colunas numéricas e categóricas
numeric_features = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Prothrombin', 'Stage']
categorical_features = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

# Criando os transformadores para colunas numéricas e categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Aplicando as transformações
X_preprocessed = preprocessor.fit_transform(X)

# Aplicando LabelEncoder no target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Separando os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

In [38]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20000, 21), (5000, 21), (20000,), (5000,))

In [47]:
type(X_test)

numpy.ndarray

In [39]:
# Treinando o modelo RandomForest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Fazendo previsões no conjunto de teste
y_pred = rf_model.predict(X_test)

# Avaliando o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9836
Classification Report:
               precision    recall  f1-score   support

           C       0.98      0.99      0.99      2729
          CL       0.99      0.96      0.98       415
           D       0.98      0.98      0.98      1856

    accuracy                           0.98      5000
   macro avg       0.99      0.98      0.98      5000
weighted avg       0.98      0.98      0.98      5000



In [42]:
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)

In [43]:
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Prothrombin,Stage
0,2221,C,Placebo,18499,F,N,Y,N,N,0.5,149.0,4.04,227.0,9.9,1
1,1230,C,Placebo,19724,M,Y,N,Y,N,0.5,219.0,3.93,22.0,10.8,2
2,4184,C,Placebo,11839,F,N,N,N,N,0.5,320.0,3.54,51.0,10.0,2
3,2090,D,Placebo,16467,F,N,N,N,N,0.7,255.0,3.74,23.0,10.2,2
4,2105,D,Placebo,21699,F,N,Y,N,N,1.9,486.0,3.54,74.0,11.5,1


In [44]:
df["Drug"].value_counts()

Drug
Placebo            15827
D-penicillamine     9173
Name: count, dtype: int64

In [49]:
df["Status"].value_counts()

Status
C     13516
D      9456
CL     2028
Name: count, dtype: int64

In [46]:
[
    {
        "N_Days": 1500,
        "Age": 45,
        "Bilirubin": 1.2,
        "Cholesterol": 220.0,
        "Albumin": 3.5,
        "Cooper": 105.0,
        "Prothrombin": 12.5,
        "Stage": 2,
        "Drug": "D-penicillamine",
        "Sex": "M",
        "Ascites": "no",
        "Hepatomegaly": "yes",
        "Spiders": "no",
        "Edema": "no"
    }
]

[{'N_Days': 1500,
  'Age': 45,
  'Bilirubin': 1.2,
  'Cholesterol': 220.0,
  'Albumin': 3.5,
  'Cooper': 105.0,
  'Prothrombin': 12.5,
  'Stage': 2,
  'Drug': 'D-penicillamine',
  'Sex': 'M',
  'Ascites': 'no',
  'Hepatomegaly': 'yes',
  'Spiders': 'no',
  'Edema': 'no'}]

<br>
<br>
<hr>
<br>