In [12]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [13]:
set_config(display="diagram")

In [14]:
seed = 42

In [15]:
df = pd.read_csv('../data/diabetes_with_nan.csv')
print(df.sample(5))

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
297          0.0    126.0           84.0           29.0    215.0  30.7   
707          2.0    127.0           46.0           21.0    335.0  34.4   
334          1.0     95.0           60.0           18.0     58.0  23.9   
591          2.0    112.0           78.0           50.0    140.0  39.4   
242          3.0    139.0           54.0            0.0      0.0  25.6   

     DiabetesPedigreeFunction   Age  Outcome  
297                     0.520  24.0        0  
707                     0.176  22.0        0  
334                     0.260  22.0        0  
591                     0.175  24.0        0  
242                     0.402  22.0        1  


In [16]:
df.isnull().sum()

Pregnancies                 1
Glucose                     1
BloodPressure               1
SkinThickness               1
Insulin                     1
BMI                         1
DiabetesPedigreeFunction    0
Age                         1
Outcome                     0
dtype: int64

In [17]:
X = df.drop("Outcome", axis=1).values
y = df["Outcome"].values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [19]:
steps = [
    ("Mean_imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("pca", PCA(n_components=2)),
    ("Classifier", LogisticRegression())
]

In [20]:
pipe = Pipeline(steps)

In [21]:
pipe.fit(X_train, y_train)

In [22]:
pipe.predict(X_test)

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)