# Imputasi

Imputasi adalah proses mengganti nilai/data yang hilang (missing value, NaN, atau sel kosong) dengan nilai pengganti.

## Data Numerikal

### Mean

In [1]:
import pandas as pd
import numpy as np

# Small numeric example: three rows, five columns, with NaN marking the
# missing cells (one each in col2 and col3, two in col5).
kolom = dict(
    col1=[2, 9, 19],
    col2=[5, np.nan, 17],
    col3=[3, 9, np.nan],
    col4=[6, 0, 9],
    col5=[np.nan, 7, np.nan],
)

data = pd.DataFrame(data=kolom)

In [2]:
# Show the frame as built, with the missing cells still unfilled.
data

Unnamed: 0,col1,col2,col3,col4,col5
0,2,5.0,3.0,6,
1,9,,9.0,0,7.0
2,19,17.0,,9,


In [3]:
# Fill every missing cell with the mean of its own column.
# The result is only displayed; it is not assigned back to `data`.
data.fillna(value=data.mean())

Unnamed: 0,col1,col2,col3,col4,col5
0,2,5.0,3.0,6,7.0
1,9,11.0,9.0,0,7.0
2,19,17.0,6.0,9,7.0


### Arbitrary (Nilai Suka-Suka)

In [5]:
# Single age column with two missing entries (rows 2 and 5).
umur = dict(umur=[29, 43, np.nan, 25, 34, np.nan, 50])

data = pd.DataFrame(data=umur)
data

Unnamed: 0,umur
0,29.0
1,43.0
2,
3,25.0
4,34.0
5,
6,50.0


In [6]:
# Replace each missing age with the arbitrary placeholder 99.
# The result is only displayed; it is not assigned back to `data`.
data.fillna(value=99)

Unnamed: 0,umur
0,29.0
1,43.0
2,99.0
3,25.0
4,34.0
5,99.0
6,50.0


### End of Tail

In [7]:
# Rebuild the age frame for this section: seven rows, two of them missing.
umur = {
    'umur': [29, 43, np.nan, 25, 34, np.nan, 50],
}
data = pd.DataFrame.from_dict(umur)
data

Unnamed: 0,umur
0,29.0
1,43.0
2,
3,25.0
4,34.0
5,
6,50.0


In [9]:
#install library feature-engine
pip install feature-engine

Collecting feature-engineNote: you may need to restart the kernel to use updated packages.
  Downloading feature_engine-1.2.0-py2.py3-none-any.whl (205 kB)

Installing collected packages: feature-engine
Successfully installed feature-engine-1.2.0


In [10]:
#import EndTailImputer
from feature_engine.imputation import EndTailImputer

# Configure the imputer: Gaussian rule, replacement value taken from the right tail
imputer = EndTailImputer(tail='right', imputation_method='gaussian')

# Fit on the age frame, then fill its missing entries with the learned value
# (for this data the result equals mean + 3 * std of the observed ages)
test_data = imputer.fit(data).transform(data)

# Show the imputed frame
test_data

Unnamed: 0,umur
0,29.0
1,43.0
2,66.896905
3,25.0
4,34.0
5,66.896905
6,50.0


## Data Kategorikal

### Modus

In [13]:
from sklearn.impute import SimpleImputer
# Car brands with three missing entries; 'Toyota' is the most frequent observed value.
mobil = {
    'mobil': [
        'Ford', 'Ford', 'Toyota', 'Honda', np.nan,
        'Toyota', 'Honda', 'Toyota', np.nan, np.nan,
    ],
}

data = pd.DataFrame(data=mobil)

In [14]:
# Show the car-brand frame before imputation.
data

Unnamed: 0,mobil
0,Ford
1,Ford
2,Toyota
3,Honda
4,
5,Toyota
6,Honda
7,Toyota
8,
9,


In [15]:
# Mode imputer: NaN entries are replaced by the column's most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [16]:
# Fit the imputer on the frame and fill the gaps in one call.
# The result comes back as a NumPy object array, not a DataFrame.
imp.fit_transform(data)

array([['Ford'],
       ['Ford'],
       ['Toyota'],
       ['Honda'],
       ['Toyota'],
       ['Toyota'],
       ['Honda'],
       ['Toyota'],
       ['Toyota'],
       ['Toyota']], dtype=object)

### Random Sample

In [18]:
#import Random Sample
from feature_engine.imputation import RandomSampleImputer

# Example records with three gaps in each column (one categorical, one numeric)
data = {
    'Jenis Kelamin': [
        'Laki-laki', 'Perempuan', 'Laki-laki', np.nan, 'Laki-laki',
        'Perempuan', 'Perempuan', np.nan, 'Laki-laki', np.nan,
    ],
    'Umur': [29, np.nan, 32, 43, 50, 22, np.nan, 52, np.nan, 17],
}

df = pd.DataFrame(data=data)

In [19]:
# Show the frame before imputation, with gaps in both columns.
df

Unnamed: 0,Jenis Kelamin,Umur
0,Laki-laki,29.0
1,Perempuan,
2,Laki-laki,32.0
3,,43.0
4,Laki-laki,50.0
5,Perempuan,22.0
6,Perempuan,
7,,52.0
8,Laki-laki,
9,,17.0


In [20]:
#membuat imputer
imputer = RandomSampleImputer(random_state=29)

#fit-kan
imputer.fit(df)

#ubah data
testing_df = imputer.transform(df)

In [21]:
# Show the frame after random-sample imputation.
testing_df

Unnamed: 0,Jenis Kelamin,Umur
0,Laki-laki,29.0
1,Perempuan,52.0
2,Laki-laki,32.0
3,Perempuan,43.0
4,Laki-laki,50.0
5,Perempuan,22.0
6,Perempuan,50.0
7,Laki-laki,52.0
8,Laki-laki,22.0
9,Perempuan,17.0
