# Connect to G-Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Work from the course folder so the relative data file names used below resolve.
# NOTE(review): the "..." segment looks like a placeholder — confirm the real Drive path.
default_dir = "/content/drive/MyDrive/.../Data Preprocessing I"
os.chdir(default_dir)

# Recap: exploring data with pandas

## Hiking Dataset

In [None]:
import pandas as pd

# Load the hiking records from the JSON file in the working directory.
hiking = pd.read_json(path_or_buf="hiking.json")
# Preview the first five rows.
hiking.head(n=5)

In [None]:
# Column dtypes and non-null counts for the hiking table.
hiking.info()

# Missing Value Handling

## College Dataset

### Load Dataset

In [None]:
# First load: read the file as-is, without declaring any missing-value markers.
college = pd.read_csv(filepath_or_buffer="college.csv")
college.head(n=5)

In [None]:
# Column dtypes and non-null counts for the raw college table.
college.info()

### Detect Missing Values

In [None]:
# Distinct values present in the csat column.
csat_unique = college["csat"].unique()

In [None]:
import numpy as np
# Sorted view of the distinct csat values; the reload below treats '.' as missing.
np.sort(csat_unique)

### Replace Missing Values with NaN

In [None]:
# Reload, telling pandas to parse the '.' marker as NaN.
college = pd.read_csv(filepath_or_buffer="college.csv", na_values=["."])
college.head(n=5)

In [None]:
# Re-inspect dtypes and non-null counts after the reload with na_values='.'.
college.info()

## Diabetes Dataset

### Load Dataset

In [None]:
# Load the Pima Indians diabetes table from the working directory.
diabetes = pd.read_csv(filepath_or_buffer="pima-indians-diabetes.csv")
diabetes.head(n=5)

In [None]:
# Column dtypes and non-null counts for the diabetes table.
diabetes.info()

### Detect Missing Values

In [None]:
# Summary statistics per numeric column; the cells below go on to treat BMI == 0 as missing.
diabetes.describe()

In [None]:
# BMI entries recorded as exactly 0.
diabetes.loc[diabetes["BMI"] == 0, "BMI"]

### Replace Missing Values with NaN

In [None]:
# Overwrite every zero BMI with NaN so it is handled as a missing value.
diabetes.loc[diabetes["BMI"].eq(0), "BMI"] = np.nan

#### Re-check that the replacement has been done

In [None]:
# Select the BMI entries that are NaN, using NumPy's element-wise check.
diabetes.BMI[np.isnan(diabetes.BMI)]

In [None]:
# Same selection using pandas' own isna().
diabetes.BMI[diabetes.BMI.isna()]

In [None]:
# Same selection again; isnull() is the pandas alias of isna().
diabetes.BMI[diabetes.BMI.isnull()]

## Imputations: Basic techniques

### Mean Imputation

In [None]:
from sklearn.impute import SimpleImputer

# Fill every NaN with the mean of its column.
mean_imputer = SimpleImputer(strategy='mean')
# fit_transform returns a bare array, so re-attach both the column labels
# and the original row index to keep the result aligned with `diabetes`.
diabetes_mean = pd.DataFrame(
    data=mean_imputer.fit_transform(diabetes),
    columns=diabetes.columns,
    index=diabetes.index,
)

In [None]:
# Display the mean-imputed frame.
diabetes_mean

In [None]:
# Dtypes and non-null counts of the mean-imputed frame.
diabetes_mean.info()

### Median Imputation

In [None]:
# Fill every NaN with the median of its column.
median_imputer = SimpleImputer(strategy='median')
# Re-attach column labels and the original row index to the returned array.
diabetes_median = pd.DataFrame(
    data=median_imputer.fit_transform(diabetes),
    columns=diabetes.columns,
    index=diabetes.index,
)

### Mode Imputation

In [None]:
# Fill every NaN with the most frequent value of its column.
mode_imputer = SimpleImputer(strategy='most_frequent')
# Re-attach column labels and the original row index to the returned array.
diabetes_mode = pd.DataFrame(
    data=mode_imputer.fit_transform(diabetes),
    columns=diabetes.columns,
    index=diabetes.index,
)

### Imputing a constant

In [None]:
# Fill every NaN with the fixed value 0.
constant_imputer = SimpleImputer(strategy='constant', fill_value=0)
# Re-attach column labels and the original row index to the returned array.
diabetes_constant = pd.DataFrame(
    data=constant_imputer.fit_transform(diabetes),
    columns=diabetes.columns,
    index=diabetes.index,
)

### Scatterplot of imputation

In [None]:
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')

In [None]:
# True for rows where Serum_Insulin or Glucose was missing before imputation.
# `|` states the element-wise OR explicitly instead of relying on boolean addition.
nullity = diabetes['Serum_Insulin'].isnull() | diabetes['Glucose'].isnull()

In [None]:
# Scatter of the mean-imputed values, coloured by whether the row was originally missing.
# The trailing semicolon suppresses the notebook's text echo of the Axes object.
diabetes_mean.plot.scatter(
    x='Serum_Insulin',
    y='Glucose',
    c=nullity,
    cmap='rainbow',
    alpha=0.5,
    title='Mean Imputation',
);

### Visualizing Imputations

In [None]:
# 2x2 grid comparing the four imputation strategies on the same pair of columns.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
# True for rows where either plotted column was missing before imputation.
nullity = diabetes['Serum_Insulin'].isnull() | diabetes['Glucose'].isnull()
imputations = {
    'Mean Imputation': diabetes_mean,
    'Median Imputation': diabetes_median,
    'Most Frequent Imputation': diabetes_mode,
    'Constant Imputation': diabetes_constant
}

# Pair each subplot with one (title, imputed frame) entry, in insertion order.
for ax, (title, imputed) in zip(axes.flatten(), imputations.items()):
    imputed.plot(
        x='Serum_Insulin',
        y='Glucose',
        kind='scatter',
        alpha=0.5,
        c=nullity,
        cmap='rainbow',
        ax=ax,
        colorbar=False,
        title=title,
    )

# Label Encoder
Encoding Categorical Variables


In [None]:
# Toy table with one binary, one nominal and one ordinal categorical column.
users = pd.DataFrame({
    'user': [1, 2, 3, 4],
    'subscribed': ['y', 'n', 'n', 'y'],
    'fav_color': ['blue', 'green', 'orange', 'green'],
    'loyalty_badge': ['Gold', 'Silver', 'Silver', 'Platinum'],
})

users

## Encoding Binary Variables

### with Pandas

In [None]:
# The raw y/n subscription flags.
users['subscribed']

In [None]:
# Encode the flag as 1 for "y" and 0 for anything else, using a vectorised
# comparison instead of a Python-level function call per row.
users["sub_enc"] = (users["subscribed"] == "y").astype("int64")

In [None]:
# Original flag next to its 0/1 encoding.
users[["subscribed", "sub_enc"]]

### with Scikit-learn

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels first, then map each value to its integer code.
le = LabelEncoder()
le.fit(users["subscribed"])
users["sub_enc_le"] = le.transform(users["subscribed"])

print(users[["subscribed", "sub_enc_le"]])

## Encoding Categorical - Nominal Variable

In [None]:
# A separate encoder for the colour column: fit on its labels, then map them to integer codes.
le_color = LabelEncoder()
le_color.fit(users["fav_color"])
users["color_enc_le"] = le_color.transform(users["fav_color"])

print(users[["fav_color", "color_enc_le"]])

## Encoding Categorical - Ordinal Variable

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Categories listed from lowest to highest rank; the encoder assigns codes in this order.
badge_cat = ['Silver', 'Gold', 'Platinum']
oe = OrdinalEncoder(categories=[badge_cat]).fit(users[["loyalty_badge"]])

# The encoder works on 2-D input, hence the double brackets; its output is
# float, so the column is cast to integer afterwards.
users["badge_enc_oe"] = oe.transform(users[["loyalty_badge"]])
users["badge_enc_oe"] = users["badge_enc_oe"].astype('int')

print(users[["loyalty_badge", "badge_enc_oe"]])

# One-Hot Encoding
For Categorical-Nominal Features

In [None]:
# The nominal colour column that is one-hot encoded below.
users["fav_color"]

In [None]:
# One indicator column per distinct colour (result is only displayed, not stored).
pd.get_dummies(users["fav_color"])

In [None]:
# Replace fav_color in the full table with its indicator columns.
users = pd.get_dummies(data=users, columns=["fav_color"])
users

# Standardization

## Log Normalization

In [None]:
# Two numeric columns on very different scales.
df = pd.DataFrame({
    "col1": [1, 1.20, 0.75, 1.60],
    "col2": [3, 45.5, 28.0, 100],
})

df

In [None]:
# Per-column variance before any transformation.
print(df.var())

In [None]:
# Natural logarithm of col2, stored as a new column.
df["log_2"] = np.log(df["col2"].to_numpy())
print(df)

In [None]:
# Variance of col1 next to that of the log-transformed col2.
print(df[['col1', 'log_2']].var())

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation, then rescale the same frame.
scaler = StandardScaler().fit(df)
# transform returns a bare array, so the column labels are re-attached.
df_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)

In [None]:
# The standardized values.
print(df_scaled)

In [None]:
# StandardScaler divides by the population standard deviation (ddof=0), whereas
# pandas' var() defaults to the sample variance (ddof=1). Use ddof=0 so the
# check shows unit variance instead of n/(n-1) on this small frame.
print(df_scaled.var(ddof=0))

# Train-test Splitting
Case: Standardizing data and modeling with K-Nearest Neighbors


## Checking Missing Values

In [None]:
# Dtypes and non-null counts of the median-imputed frame used for modelling.
diabetes_median.info()

## Dataset Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Select the target by name and use every other column as a feature, so the
# split does not depend on "Class" happening to be the last column.
X = diabetes_median.drop(columns="Class")
y = diabetes_median["Class"]

# Hold out 20% of the rows; stratify on y to keep the class proportions in
# both parts, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# (rows, columns) of the train and test feature sets.
X_train.shape, X_test.shape

In [None]:
# Lengths of the matching train and test target vectors.
y_train.shape, y_test.shape

## Modeling with KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()

# Fit the scaler on the training features only, then apply those same
# statistics to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Evaluation Before & After Scaling

In [None]:
# Train on the raw features and report accuracy on the held-out split.
print("Accuracy - Before Scaled:", knn.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Refit the same classifier on the standardized features and score it on the standardized test split.
print("Accuracy - After Scaled:", knn.fit(X_train_scaled, y_train).score(X_test_scaled, y_test))

In [None]:
# Relative accuracy gain from scaling, computed from the models rather than
# from hand-copied scores. Fresh default classifiers are used so the fitted
# state of `knn` is left untouched.
acc_before = KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test)
acc_after = KNeighborsClassifier().fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
(acc_after - acc_before) / acc_before