In [1]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [34]:
# Load the training data and split it into the feature matrix X and the target y

data = pd.read_csv('data/data_tr.csv')
y = data['Outcome']
X = data.drop(columns=['Outcome'])

In [35]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,,33.6,0.627,50
1,1,85.0,66.0,29.0,,26.6,0.351,31
2,8,183.0,64.0,,,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [27]:
# Baseline: mean-impute missing values, standardize the features, then fit a
# logistic regression. Scored with accuracy under cross-validation (cv=5).

imp = SimpleImputer(strategy='mean')
std_sc = StandardScaler()
log_reg = LogisticRegression()

# Step order matters: imputation must come before scaling and the classifier.
steps = [
    ('imputer', imp),
    ('scaler', std_sc),
    ('log_reg', log_reg),
]
pipeline = Pipeline(steps=steps)

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Accuracy mean: {scores.mean()}, std= {scores.std()}')

Accuracy mean: 0.7721585603938544, std= 0.01446254764680255


# NaN

In [13]:
# Number of missing values (NaN) in each column

data.isna().sum(axis=0)

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [5]:
# Rows grouped by how many missing values (NaN) each one contains.
# `nan_per_row` is not defined in any visible cell, so it is computed here;
# without it this cell raises NameError on a fresh kernel (Restart & Run All).

nan_per_row = data.isna().sum(axis=1)  # axis=1 -> one NaN count per row
nan_per_row.value_counts()

0    392
2    199
1    142
3     28
4      7
dtype: int64

In [14]:
# Percentage of rows for each per-row NaN count

nan_per_row.value_counts().pipe(lambda counts: counts * 100 / counts.sum())

0    51.041667
2    25.911458
1    18.489583
3     3.645833
4     0.911458
dtype: float64

In [28]:
# Ablation: drop 'Insulin' (the column with the most NaN) and re-score the pipeline

y = data['Outcome']
X = data.drop(columns=['Outcome', 'Insulin'])

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Accuracy mean: {scores.mean()}, std= {scores.std()}')

Accuracy mean: 0.7682539682539682, std= 0.01689891026164006


In [29]:
# Ablation: drop 'SkinThickness' (second-highest NaN count) and re-score the pipeline

y = data['Outcome']
X = data.drop(columns=['Outcome', 'SkinThickness'])

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Accuracy mean: {scores.mean()}, std= {scores.std()}')

Accuracy mean: 0.7734657499363381, std= 0.0172800005238109


In [30]:
# Ablation: drop 'DiabetesPedigreeFunction' and re-score the pipeline.
# Note: this column has no NaN, so this tests feature relevance, not missingness.

y = data['Outcome']
X = data.drop(columns=['Outcome', 'DiabetesPedigreeFunction'])

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Accuracy mean: {scores.mean()}, std= {scores.std()}')

Accuracy mean: 0.7734657499363381, std= 0.02238288242592355


In [33]:
# Ablation: drop 'BloodPressure' (only 35 NaN) and re-score the pipeline

y = data['Outcome']
X = data.drop(columns=['Outcome', 'BloodPressure'])

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Accuracy mean: {scores.mean()}, std= {scores.std()}')

Accuracy mean: 0.7760716407775231, std= 0.01829712567289078


In [43]:
# Keep only a hand-picked subset of four features and re-score the pipeline

y = data['Outcome']
X = data.loc[:, ['Glucose', 'BMI', 'Age', 'BloodPressure']]

scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Accuracy mean: {scores.mean()}, std= {scores.std()}')

Accuracy mean: 0.7721755368814192, std= 0.01926044052066881


In [39]:
# Display the loaded frame (feature columns plus 'Outcome'); the rendered output elides the middle rows
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,,,30.1,0.349,47,1


In [None]:
# TODO: systematize the search for which columns to drop