In [3]:
import pandas as pd
import numpy as np
from io import StringIO
import sys

In [4]:
from IPython.display import Image
%matplotlib inline

In [5]:
# Build the sample data: a small CSV document held in memory.
# One field in column C and one in column D are left empty on purpose so
# that the parsed frame contains missing values.
csv_dataset = (
    'A,B,C,D\n'
    '4.0,3.0,2.0,1.0\n'
    '5.0,6.0,,7.0\n'
    '8.0,9.0,10.0,'
)

# Wrap the text in a file-like buffer and parse it into the DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_dataset))

In [6]:
# Display the parsed frame; the empty CSV fields show up as missing values.
df

Unnamed: 0,A,B,C,D
0,4.0,3.0,2.0,1.0
1,5.0,6.0,,7.0
2,8.0,9.0,10.0,


In [7]:
# Check for missing data: the result is a boolean frame in which True
# marks a cell that holds NaN.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [8]:
# Count the missing values in each column.
df.isna().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [9]:
# Removing missing values

# Drop every row that contains at least one NaN.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,4.0,3.0,2.0,1.0


In [10]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,4.0,3.0
1,5.0,6.0
2,8.0,9.0


In [11]:
# Drop only the rows in which every column is NaN.
# No row of this frame is entirely empty, so all rows are kept.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,4.0,3.0,2.0,1.0
1,5.0,6.0,,7.0
2,8.0,9.0,10.0,


In [12]:
# drop rows that have fewer than 4 real (non-NaN) values
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,4.0,3.0,2.0,1.0


In [13]:
# Drop only the rows that have NaN in the listed columns (here: 'C');
# NaN in any other column is ignored.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,4.0,3.0,2.0,1.0
2,8.0,9.0,10.0,


In [14]:
# View the frame's contents as a plain NumPy array.
df.to_numpy()

array([[ 4.,  3.,  2.,  1.],
       [ 5.,  6., nan,  7.],
       [ 8.,  9., 10., nan]])

In [15]:
# Fill the missing values with a computed value instead of dropping them.

# Mean imputation: every NaN is replaced by the mean of its column.
from sklearn.impute import SimpleImputer

# Fit learns the per-column means; transform writes them into the NaN slots.
imr = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df.values)
data_imputed = imr.transform(df.values)

In [16]:
# Inspect the imputed array: the former NaN entries now hold their column means.
data_imputed

array([[ 4.,  3.,  2.,  1.],
       [ 5.,  6.,  6.,  7.],
       [ 8.,  9., 10.,  4.]])

In [17]:
# pandas-only mean imputation: fill each NaN with the mean of its column.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,4.0,3.0,2.0,1.0
1,5.0,6.0,6.0,7.0
2,8.0,9.0,10.0,4.0


In [18]:
# Handling categorical data. Example frame with four features:
# warna (colour), ukuran (size), harga (price) and kelas (class label).
# NOTE: this rebinds `df`, replacing the numeric frame used so far.
df = pd.DataFrame(
    [
        ['merah', 'XL', 10, 'kelas1'],
        ['kuning', 'M', 15, 'kelas2'],
        ['biru', 'S', 7, 'kelas3'],
    ],
    columns=['warna', 'ukuran', 'harga', 'kelas'],
)

In [19]:
# Ordinal encoding for the size feature: a larger size gets a larger integer.
size_mapping = dict(XL=3, M=2, S=1)

# Overwrite the size strings in `ukuran` with their integer ranks.
# Re-running this cell on the already-mapped column would yield NaN,
# because the integers are not keys of size_mapping.
df['ukuran'] = df['ukuran'].map(size_mapping)

In [20]:
# Display the frame after the size column was replaced by integer ranks.
df

Unnamed: 0,warna,ukuran,harga,kelas
0,merah,3,10,kelas1
1,kuning,2,15,kelas2
2,biru,1,7,kelas3


In [21]:
# Invert the size mapping (integer rank -> size string) and apply it to
# recover the original labels. The result is only displayed; `df` itself
# is not modified here.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['ukuran'].map(inv_size_mapping)

0    XL
1     M
2     S
Name: ukuran, dtype: object

In [22]:
# Build a lookup table from class-label string to integer code.
# np.unique yields the distinct labels in sorted order, and each label is
# numbered by its position in that order.
class_mapping = dict(
    (label, code) for code, label in enumerate(np.unique(df['kelas']))
)
class_mapping

{'kelas1': 0, 'kelas2': 1, 'kelas3': 2}

In [23]:
# Convert the class labels from strings to integers: the `kelas` column is
# overwritten with the codes looked up in class_mapping, then the frame is shown.
df['kelas'] = df['kelas'].map(class_mapping)
df

Unnamed: 0,warna,ukuran,harga,kelas
0,merah,3,10,0
1,kuning,2,15,1
2,biru,1,7,2


In [24]:
# reverse the class label mapping

inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['kelas'] = df['kelas'].map(inv_class_mapping)
df

Unnamed: 0,warna,ukuran,harga,kelas
0,merah,3,10,kelas1
1,kuning,2,15,kelas2
2,biru,1,7,kelas3


In [25]:
# Label encoding with scikit-learn: LabelEncoder turns the class-label
# strings into integer codes in a single fit_transform call.

from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
y = class_le.fit_transform(df['kelas'].to_numpy())
y

array([0, 1, 2])

In [26]:
# Reverse mapping: turn the integer codes in `y` back into the original
# class-label strings using the fitted encoder.
class_le.inverse_transform(y)

array(['kelas1', 'kelas2', 'kelas3'], dtype=object)

In [27]:
# Extract the three feature columns (colour, size, price) as a NumPy array.
X = df[['warna', 'ukuran', 'harga']].values
color_le = LabelEncoder()
# Overwrite column 0 (colour strings) in place with integer codes.
# NOTE(review): these codes carry no real order between colours — presumably
# why the following cells switch to one-hot encoding; confirm intent.
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[2, 3, 10],
       [1, 2, 15],
       [0, 1, 7]], dtype=object)

In [28]:
from sklearn.preprocessing import OneHotEncoder

# Start again from the raw feature values (colour is a string column here).
X = df[['warna', 'ukuran', 'harga']].values

# One-hot encode the colour column only. The encoder expects 2-D input, so
# the column is selected with a list index to keep shape (n_samples, 1);
# .toarray() densifies the encoder's sparse result for display.
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, [0]]).toarray()

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [29]:
from sklearn.compose import ColumnTransformer

X = df[['warna', 'ukuran', 'harga']].values

# One-hot encode column 0 (colour) and pass columns 1 and 2 (size, price)
# through unchanged, all in a single transformer.
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[ 0.,  0.,  1.,  3., 10.],
       [ 0.,  1.,  0.,  2., 15.],
       [ 1.,  0.,  0.,  1.,  7.]])

In [30]:
# One-hot encoding via pandas: get_dummies expands the string column
# (warna) into one indicator column per colour and leaves the numeric
# columns as they are.
pd.get_dummies(data=df[['harga', 'warna', 'ukuran']])

Unnamed: 0,harga,ukuran,warna_biru,warna_kuning,warna_merah
0,10,3,0,0,1
1,15,2,0,1,0
2,7,1,1,0,0


In [31]:
# Multicollinearity guard in get_dummies: drop_first=True omits the first
# indicator column (warna_biru), which is implied by the remaining ones.
pd.get_dummies(data=df[['harga', 'warna', 'ukuran']], drop_first=True)

Unnamed: 0,harga,ukuran,warna_kuning,warna_merah
0,10,3,0,1
1,15,2,1,0
2,7,1,0,0


In [32]:
# Multicollinearity guard for OneHotEncoder: drop='first' leaves out the
# indicator column of the first category.
color_ohe = OneHotEncoder(drop='first', categories='auto')
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', color_ohe, [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[ 0.,  1.,  3., 10.],
       [ 1.,  0.,  2., 15.],
       [ 0.,  0.,  1.,  7.]])

In [33]:
# Load the Wine dataset; the file has no header row, hence header=None.

# Remote alternative to the local copy:
# df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)

# Local copy, given as a relative path.
df_wine = pd.read_csv('Datasets/wine.data', header=None)

# Name the columns: the class label first, followed by the 13 features.
df_wine.columns = [
    'Class Label',
    'Alcohol',
    'Malic Acid',
    'Ash',
    'Alcalinity of Ash',
    'Magnesium',
    'Total Phenols',
    'Flavanoids',
    'Nonflavanoid Phenols',
    'Proanthocyanins',
    'Color Intensity',
    'Hue',
    'OD280/OD315 of Diluted Wines',
    'Proline',
]

# Print the distinct class labels, then preview the first five rows.
print('Class Label', np.unique(df_wine['Class Label']))
df_wine.head()

Class Label [1 2 3]


Unnamed: 0,Class Label,Alcohol,Malic Acid,Ash,Alcalinity of Ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color Intensity,Hue,OD280/OD315 of Diluted Wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [34]:
from sklearn.model_selection import train_test_split

# Column 0 holds the class label (target); all remaining columns are features.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Hold out 30% of the samples for testing. The labels are passed as the
# stratification target, and random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [35]:
# Min-max normalization

from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the training data only,
# then apply that same scaling to both the training and the test set.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [36]:
# Standardization (StandardScaler)

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
# Learn each feature's mean and standard deviation from the training set
# and standardize the training data with them.
X_train_std = stdsc.fit_transform(X_train)
# Reuse the training-set statistics for the test set. Calling fit_transform
# here would re-fit the scaler on the test data, so train and test would be
# scaled differently and test-set information would leak into preprocessing.
X_test_std = stdsc.transform(X_test)

In [37]:
# Worked example of standardization vs. min-max normalization on a tiny array

ex = np.arange(6)  # the integers 0 through 5

# Standardization: subtract the mean, divide by the standard deviation.
print('standarized: ', (ex - ex.mean()) / ex.std())

# Min-max normalization: shift by the minimum, divide by the value range.
print('normalized: ', (ex - ex.min()) / (ex.max() - ex.min()))

standarized:  [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
normalized:  [0.  0.2 0.4 0.6 0.8 1. ]
