In [1]:
# IMPORTS
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [113]:
# DATASET
df = pd.read_csv("../datasets/stroke/healthcare-dataset-stroke-data_train.csv")
df.drop(["id", "Unnamed: 0"], axis=1, inplace=True)

### BMI por edad (medianas)
* Hasta:  10 años - BMI: 18.30  
* Hasta:  20 años - BMI: 23.80  
* Hasta:  30 años - BMI: 26.70  
* Hasta:  40 años - BMI: 29.70  
* Hasta:  50 años - BMI: 30.00  
* Hasta:  60 años - BMI: 30.90  
* Hasta:  70 años - BMI: 30.00  
* Mayor 70 años - BMI: 28.30  
* Mayor 30 años - BMI: 29.80  

Hago grupos hasta 10, hasta 20, hasta 30 y mayor a 30 para rellenar NaNs.

In [114]:
bmi_fill = {
    "hasta_10": 18.3,
    "hasta_20": 23.8,
    "hasta_30": 26.7,
    "mayor_30": 29.8,
}

In [115]:
# LLENO NaNs SEGUN EDAD
df.bmi.fillna(0, inplace=True)
for i in range(len(df)):
    if df.bmi.iloc[i] == 0:
        if (df.age.iloc[i]<=10):
            df.bmi.iloc[i] = bmi_fill["hasta_10"]
        elif (df.age.loc[i]<=20)&(df.age.loc[i]>10):
            df.bmi.iloc[i] = bmi_fill["hasta_20"]
        elif (df.age.loc[i]<=30)&(df.age.loc[i]>20):
            df.bmi.iloc[i] = bmi_fill["hasta_30"]
        else:
            df.bmi.iloc[i] = bmi_fill["mayor_30"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## ENCODING

In [54]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

## La idea ahora sería hacerle OHE a work_type y smoking_status; y label encoding a gender y Residence_type, y después probar de nuevo los modelos.  
Quizás habría que usar el ColumnTransformer?

In [79]:
ohe_cols = ["work_type", "smoking_status"]
label_cols = ["gender", "ever_married", "Residence_type"]
cat_cols = ohe_cols+label_cols
num_cols = ["avg_glucose_level", "bmi"]

In [117]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

class ModifiedLabelEncoder(LabelEncoder):

    def fit_transform(self, y, *args, **kwargs):
        return super().fit_transform(y).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        return super().transform(y).reshape(-1, 1)

# num_pipeline = Pipeline(["std_scaler", StandardScaler()])

# label_pipeline = Pipeline(["label_enc", LabelEncoder()])

# ohe_pipeline = Pipeline(["ohe", OneHotEncoder()])

full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("ohe", OneHotEncoder(), ohe_cols),
    ("label_1", ModifiedLabelEncoder(), label_cols[0]),
    ("label_2", ModifiedLabelEncoder(), label_cols[1]),
    ("label_3", ModifiedLabelEncoder(), label_cols[2]),
])

In [118]:
df_prepared = full_pipeline.fit_transform(df)

In [119]:
df_processed = pd.DataFrame(df_prepared)
# df_processed.columns = num_cols + ohe_cols + label_cols

In [97]:
df_processed[0].to_numpy().std()

1.0

In [120]:
df_listo = pd.concat([df_processed, df[["hypertension", "heart_disease"]]], axis=1)

In [121]:
df_listo

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,hypertension,heart_disease
0,-0.896710,-0.390718,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0
1,-1.039486,0.074254,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0
2,-0.886402,-0.933185,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0
3,-0.742529,-0.067821,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0
4,-0.509613,0.100085,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4083,-0.143132,-1.372325,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0
4084,0.958504,-0.468213,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0,0
4085,-0.086109,0.048422,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0,0
4086,-0.129973,2.747841,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0,0


## MODELOS

In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

lr = LogisticRegression()
lr.fit(df_listo, df.stroke)
ypred = cross_val_predict(lr, df_listo, df.stroke)
y_score_log = cross_val_predict(lr, df_listo, df.stroke, method="predict_proba")
confusion_matrix(df.stroke, ypred)

array([[3884,    0],
       [ 204,    0]], dtype=int64)

In [127]:
# Con pesos
pesos = {0:1, 1:10}
lr = LogisticRegression(class_weight=pesos)
lr.fit(df_listo, df.stroke)
ypred = cross_val_predict(lr, df_listo, df.stroke)
y_score_log = cross_val_predict(lr, df_listo, df.stroke, method="predict_proba")
confusion_matrix(df.stroke, ypred)

array([[3415,  469],
       [ 131,   73]], dtype=int64)

In [112]:
df_listo

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,hypertension,heart_disease
0,-0.896710,-0.386927,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0
1,-1.039486,0.070569,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0
2,-0.886402,-0.920673,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0
3,-0.742529,-0.069221,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0
4,-0.509613,0.095986,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4083,-0.143132,,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0
4084,0.958504,-0.463177,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0,0
4085,-0.086109,0.045153,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0,0
4086,-0.129973,2.701174,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0,0
