# **EGCO 425: Chapter 3 (Preprocessing)**

## Google Colab



In [None]:
## If using Colab: mount Google Drive and switch the working directory to the
## course folder, so that the relative './data/...' paths used by the later
## cells resolve against that folder.

from google.colab import drive
drive.mount('/content/drive')
## Alternative call that forces a remount (presumably for when the drive is already mounted)
#drive.mount('/content/drive', force_remount=True)

import os
os.chdir('/content/drive/MyDrive/Workspace/425')          ## replace Workspace/425 with your folder
## Same directory change through the IPython magic, which also echoes the new
## working directory; keep this path in sync with the os.chdir() call above.
%cd /content/drive/MyDrive/Workspace/425

In [None]:
import pandas as pd
import numpy as np

## Golf Data

In [None]:
## Read the semicolon-delimited golf CSV, then print a column/dtype summary
GolfDF = pd.read_csv("./data/golf.csv", sep=";")
GolfDF.info()

## Titanic Data

In [None]:
## Read the Titanic workbook
TitanicDF = pd.read_excel("./data/TitanicExcel.xlsx")

### Rename column names
## Columns are picked by position (1, 3, 4, 5) and given short identifiers.
mapping = {
    TitanicDF.columns[position]: shortName
    for position, shortName in [
        (1, 'PassengerClass'),
        (3, 'SiblingsSpouses'),
        (4, 'ParentsChildren'),
        (5, 'Fare'),
    ]
}
TitanicDF = TitanicDF.rename(columns=mapping)

### Set categorical type
TitanicDF = TitanicDF.astype(
    {name: 'category' for name in ('PassengerClass', 'Sex', 'Survived')}
)

TitanicDF.info()

In [None]:
## Peek at 10 randomly chosen rows (unseeded, so the rows differ on every run)
TitanicDF.sample(n=10)

## Preprocessing by Scikit-Learn
- Standardization --> **[Manual: sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)**
- Column Normalization --> **[Manual: sklearn.preprocessing.MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler)**
- Nominal to numeric (one-hot encoding) --> **[Manual: sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)**
- Discretization --> **[Manual: sklearn.preprocessing.KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer)**

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## One-column frame holding only Humidity; the derived columns are added to
## this frame, not to GolfDF.
humidityDF = GolfDF[['Humidity']].copy()

## fit(): calculate the transformation parameters from the given data
scaler1 = StandardScaler()
scaler2 = MinMaxScaler(feature_range=(0, 1))
scaler1.fit(humidityDF)
scaler2.fit(humidityDF)

## transform(): apply the fitted transformation to the given data.
## Both calls run while the frame still holds only the Humidity column.
out1, out2 = scaler1.transform(humidityDF), scaler2.transform(humidityDF)

humidityDF['standardized'] = out1
humidityDF['ranged 0-1'] = out2
humidityDF

### Note
- StandardScaler uses the population SD (divides by $n$, i.e. `ddof=0`)
- RapidMiner uses the sample SD (divides by $n-1$, i.e. `ddof=1`)

In [None]:
## Standardize Humidity by hand with both SD definitions to compare the results
h = humidityDF['Humidity'].to_numpy()
mean = np.mean(h)
std1 = np.std(h)             ## population SD (ddof=0)
std2 = np.std(h, ddof=1)     ## sample SD (-1 df)

centered = h - mean          ## shared numerator of both z-scores
x1 = centered / std1
x2 = centered / std2

print("Using population SD \n", x1, "\n")
print("Using sample SD \n", x2)

In [None]:
from sklearn.preprocessing import OneHotEncoder

## 20 randomly chosen passengers, restricted to the two nominal attributes
subTitanicDF = TitanicDF[['Sex', 'PassengerClass']].sample(n=20)

## fit_transform() is fit() followed by transform() on the same data, i.e.
##   encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(subTitanicDF)
##   out3    = encoder.transform(subTitanicDF)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
out3 = encoder.fit_transform(subTitanicDF)

features = encoder.get_feature_names_out(['Sex', 'PassengerClass'])
print("New features = ", features, "\n\n")

## Append the indicator columns to the right of the original attributes
encoded = pd.DataFrame(out3, index=subTitanicDF.index, columns=features)
subTitanicDF = pd.concat([subTitanicDF, encoded], axis=1)
subTitanicDF

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

## 30 randomly chosen ages, kept as a one-column frame for the discretizers
subTitanicDF = TitanicDF[['Age']].sample(n=30)

## Three ordinal-coded bins each
discretizer1 = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3)      ## equal width
discretizer2 = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=3)     ## equal freq

## Both discretizers are fitted on the Age-only frame, before any bin column is added
out1 = discretizer1.fit_transform(subTitanicDF)
out2 = discretizer2.fit_transform(subTitanicDF)

subTitanicDF['equal_width'] = out1
subTitanicDF['equal_freq'] = out2

## Display ordered by Age so the bin boundaries are easy to read off
sortedDF = subTitanicDF.sort_values('Age')
sortedDF

## Imputation
- Simple imputer --> **[Manual: sklearn.impute.SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer)**
- KNN imputer --> **[Manual: sklearn.impute.KNNImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html#sklearn.impute.KNNImputer)**

In [None]:
## Golf variant that serves as input to the imputation examples below
missingDF = pd.read_csv("./data/golf_missing.csv")
missingDF

In [None]:
from sklearn.impute import SimpleImputer

### Numeric attribute
numericCols = ['Temperature', 'Humidity']

imputer1 = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
## Alternative strategies (swap in one of these to compare):
#imputer1 = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True)
#imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)
#imputer1 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1, add_indicator=True)

out1 = imputer1.fit_transform(missingDF[numericCols])

## add_indicator=True appends one flag column per feature that actually had
## missing values during fit, which is not necessarily one per input column.
## The flag names are therefore derived from the fitted indicator instead of
## being hard-coded, so the column count always matches the output width.
numericFlagCols = [numericCols[i] + '_missing' for i in imputer1.indicator_.features_]
pd.DataFrame(out1, columns=numericCols + numericFlagCols)

In [None]:
from sklearn.impute import SimpleImputer

### Nominal attribute
nominalCols = ['Wind']

imputer2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)
## Alternative: fill with a fixed placeholder value instead of the mode
#imputer2 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="value", add_indicator=True)

out2 = imputer2.fit_transform(missingDF[nominalCols])

## A flag column is only emitted for features that had missing values during
## fit, so the flag names come from the fitted indicator rather than a fixed list.
nominalFlagCols = [nominalCols[i] + '_missing' for i in imputer2.indicator_.features_]
pd.DataFrame(out2, columns=nominalCols + nominalFlagCols)

In [None]:
from sklearn.impute import KNNImputer

knnCols = ['Temperature', 'Humidity', 'Wind']

imputer3 = KNNImputer(missing_values=np.nan, n_neighbors=5, weights='uniform', add_indicator=True)
## Alternative: weight neighbours by inverse distance instead of equally
#imputer3 = KNNImputer(missing_values=np.nan, n_neighbors=5, weights='distance', add_indicator=True)

## NOTE(review): Wind is handled as a nominal attribute by the SimpleImputer
## example; here it is averaged over neighbours like the numeric columns, so
## imputed Wind values may be fractional -- confirm this is intended.
out3 = imputer3.fit_transform(missingDF[knnCols])

## A flag column is only emitted for features that had missing values during
## fit, so the flag names come from the fitted indicator rather than a fixed list.
knnFlagCols = [knnCols[i] + '_missing' for i in imputer3.indicator_.features_]
pd.DataFrame(out3, columns=knnCols + knnFlagCols)