In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import make_union,make_pipeline

In [3]:
# Toy matrix: the first column has a single missing entry (last row).
x = np.array([
    [10, 3],
    [0, 4],
    [5, 3],
    [np.nan, 3],
])

# Mean imputation: learn the per-column means from `x`, then fill its NaNs
# with them. Fitting and transforming are done as two explicit steps.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer.fit(x)
imputer.transform(x)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [4]:
# New data with a fully missing last row.
x_test = np.array([
    [12, 5],
    [40, 2],
    [5, 5],
    [np.nan, np.nan],
])

# Only `transform` is called: the already-fitted `imputer` fills the NaNs with
# the statistics it learned at fit time; nothing is re-learned from `x_test`.
imputer.transform(x_test)

array([[12.  ,  5.  ],
       [40.  ,  2.  ],
       [ 5.  ,  5.  ],
       [ 5.  ,  3.25]])

In [6]:
# Toy matrix: the last row is missing its first feature.
x = np.array([
    [1, 100],
    [3, 20],
    [1, 15],
    [np.nan, 20],
])

# With a single neighbour, the missing value is copied from the closest row,
# where closeness is measured on the features that are observed.
# Named `knn_imputer` (not `inputer`) so it is spelled correctly and does not
# get confused with the mean `imputer` fitted earlier in the notebook.
knn_imputer = KNNImputer(n_neighbors=1)
knn_imputer.fit_transform(x)


array([[  1., 100.],
       [  3.,  20.],
       [  1.,  15.],
       [  3.,  20.]])

In [None]:
# Toy matrix whose last row is entirely missing.
x = np.array([
    [1, 100],
    [3, 20],
    [1, 15],
    [np.nan, np.nan],
])

# Boolean mask with True wherever a value is missing.
missing_mask = MissingIndicator()
missing_mask.fit_transform(x)

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [11]:
# Two transformers applied side by side to the same input:
#   - replace every missing value with the sentinel -99
#   - flag which entries were missing
constant_fill = SimpleImputer(strategy="constant", fill_value=-99)
missing_flags = MissingIndicator()

# make_union concatenates the outputs of both transformers column-wise.
pipeline = make_union(constant_fill, missing_flags)

# `x` is the array defined in the previous cell.
pipeline.fit_transform(x)

array([[  1., 100.,   0.,   0.],
       [  3.,  20.,   0.,   0.],
       [  1.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

# Bonus: IterativeImputer

In [15]:
# IterativeImputer is an experimental scikit-learn estimator; it (and the
# `enable_iterative_imputer` flag it requires) is imported in the imports cell
# at the top of the notebook.

# Toy matrix: the last row is missing its first feature.
x = np.array([
    [1, 100],
    [3, 20],
    [1, 15],
    [np.nan, 20],
])

# Estimate the missing value from the other feature instead of using a fixed
# statistic. Named `iterative_imputer` (not `inputer`) so the variable is
# spelled correctly and is not confused with the other imputers in the notebook.
iterative_imputer = IterativeImputer()
iterative_imputer.fit_transform(x)

array([[  1.        , 100.        ],
       [  3.        ,  20.        ],
       [  1.        ,  15.        ],
       [  1.70496649,  20.        ]])

# Retour au cours

In [21]:
# Load seaborn's "titanic" example dataset and preview its first five rows.
titanic = sns.load_dataset("titanic")
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [23]:
# Features: passenger class and age. Target: the survival flag.
x = titanic[["pclass", "age"]]
y = titanic["survived"]

# 50/50 train/test split. The seed is fixed so that the split, and therefore
# every score and selected hyper-parameter computed from it, is identical on
# each "Restart & Run All".
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=0
)


In [24]:
# Impute missing values with KNN, then classify with a linear model trained by
# stochastic gradient descent. SGDClassifier shuffles the data between epochs,
# so its seed is fixed to make the fitted model reproducible.
# NOTE(review): SGD is sensitive to feature scale; consider adding a scaling
# step to this pipeline.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=0))

In [25]:
# Hyper-parameter grid. Each key follows the "<pipeline step>__<parameter>"
# naming pattern, so the values are routed to the imputer or the classifier.
params = dict(
    knnimputer__n_neighbors=[1, 2, 3, 4],
    sgdclassifier__alpha=[0.1, 0.5, 1],
    sgdclassifier__penalty=["l1", "l2"],
)

In [26]:

# Try every combination in `params` on `model`, scoring each with 5-fold
# cross-validation.
grid = GridSearchCV(estimator=model, param_grid=params, cv=5)

In [27]:
# Run the search on the training half only; the test half stays untouched.
grid.fit(X=x_train, y=y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'knnimputer__n_neighbors': [1, 2, ...], 'sgdclassifier__alpha': [0.1, 0.5, ...], 'sgdclassifier__penalty': ['l1', 'l2']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,missing_values,
,n_neighbors,4
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.1
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [28]:
# Parameter combination selected by the grid search fitted above.
grid.best_params_

{'knnimputer__n_neighbors': 4,
 'sgdclassifier__alpha': 0.1,
 'sgdclassifier__penalty': 'l2'}