In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Social Network Ads CSV, read directly from GitHub (needs network access).
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/classfication/Social_Network_Ads.csv'
df = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows as the cell output.
df.head(5)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# model selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Target is the 'Purchased' column; features are every remaining column except
# the 'User ID' column.
y = df['Purchased']
X = df.drop(['User ID', 'Purchased'], axis='columns')

In [None]:
# Partition the feature names by dtype: numeric columns vs. everything else.
numeric_part = X.select_dtypes(include='number')
non_numeric_part = X.select_dtypes(exclude='number')
num_cols = list(numeric_part.columns)
cat_cols = list(non_numeric_part.columns)
print(f'Categorical columns: {cat_cols}')
print(f'Numerical columns: {num_cols}')

In [None]:
# Per-dtype preprocessing: ordinal-encode the categorical columns and
# standardize the numerical ones.
cat_pipe = Pipeline([
    ('ordinal', OrdinalEncoder())
])
num_pipe = Pipeline([
    ('scale', StandardScaler())
])


def make_col_transform():
    """Return a new, unfitted ColumnTransformer over ``cat_cols`` / ``num_cols``."""
    return ColumnTransformer([
        ('categorical', cat_pipe, cat_cols),
        ('numerical', num_pipe, num_cols)
    ])


# Standalone preprocessor, kept under its original name for any cell that uses it.
col_transform = make_col_transform()

# Pipeline.fit fits its steps in place (no cloning when caching is off), so a
# single ColumnTransformer instance shared by several pipelines would be refit
# by every .fit() call. Each pipeline therefore gets its own instance.

# logistic regression
lr_pipe = Pipeline([
    ('transform', make_col_transform()),
    ('model', LogisticRegression())
])

# knn
knn_pipe = Pipeline([
    ('transform', make_col_transform()),
    ('model', KNeighborsClassifier())
])

# svm
svm_pipe = Pipeline([
    ('transform', make_col_transform()),
    ('model', SVC())
])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Fit the first two pipelines in a loop ...
for pipe in (lr_pipe, knn_pipe):
    pipe.fit(X_train, y_train)
# ... and keep the SVM fit as the final expression so the cell still displays it.
svm_pipe.fit(X_train, y_train)

In [None]:
import seaborn as sns

def plot_confusion_matrix(xtest, ytest, model, title=None):
    """Draw the confusion matrix of ``model`` on held-out data as a small heatmap.

    Parameters
    ----------
    xtest
        Features passed to ``model.predict``.
    ytest
        True labels corresponding to ``xtest``.
    model
        Fitted estimator (or pipeline) exposing ``predict``.
    title : str, optional
        Heading for the plot. When omitted, the class name of the model is
        used; for an object with a non-empty ``steps`` list (e.g. a Pipeline)
        the class name of its last step is used instead, so plots of different
        pipelines can be told apart.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    ypred = model.predict(xtest)
    cm = confusion_matrix(ytest, ypred)
    if title is None:
        steps = getattr(model, 'steps', None)
        final_estimator = steps[-1][1] if steps else model
        title = type(final_estimator).__name__
    fig, ax = plt.subplots(figsize=(2,2))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cbar=False)
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(title, fontsize=9)

In [None]:
# One confusion-matrix heatmap per fitted pipeline, all on the same test split.
for fitted_pipe in (lr_pipe, knn_pipe, svm_pipe):
    plot_confusion_matrix(X_test, y_test, fitted_pipe)

In [None]:
# precision, recall, f1-score of the logistic-regression pipeline on the test split
ypred_1 = lr_pipe.predict(X_test)
for metric_name, metric_fn in (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-score', f1_score),
):
    print(f'{metric_name}= {metric_fn(y_test, ypred_1)}')

In [None]:
# Same three scores for the k-nearest-neighbours pipeline on the test split.
ypred_2 = knn_pipe.predict(X_test)
knn_precision = precision_score(y_test, ypred_2)
knn_recall = recall_score(y_test, ypred_2)
knn_f1 = f1_score(y_test, ypred_2)
print(f'Precision= {knn_precision}')
print(f'Recall= {knn_recall}')
print(f'F1-score= {knn_f1}')

In [None]:
# Text classification report for the KNN pipeline on the test split.
ypred_3 = knn_pipe.predict(X_test)
knn_report = classification_report(y_test, ypred_3)
print(knn_report)

In [None]:
# Text classification report for the logistic-regression pipeline on the test split.
ypred_1 = lr_pipe.predict(X_test)
lr_report = classification_report(y_test, ypred_1)
print(lr_report)

In [None]:
# Text classification report for the SVM pipeline on the test split.
# Note: this reuses the name ypred_2, which an earlier cell assigned to the KNN predictions.
ypred_2 = svm_pipe.predict(X_test)
svm_report = classification_report(y_test, ypred_2)
print(svm_report)

In [None]:
# visualize distribution of classes in dataset

# make classification on diabetes dataset

To be covered:
- visualizing classification ✅
- class imbalance problem
- column selection

In [None]:
# simple pipeline for easy visualization: only the numerical columns are scaled
# and passed on to the model
num_pipe = Pipeline([
    ('scale', StandardScaler())
])


def make_numeric_transform():
    """Return a new, unfitted ColumnTransformer that scales ``num_cols``."""
    return ColumnTransformer([
        ('numerical', num_pipe, num_cols)
    ])


# Standalone preprocessor, kept under its original name for any cell that uses it.
col_transform = make_numeric_transform()

# Pipeline.fit fits its steps in place (no cloning when caching is off), so a
# single ColumnTransformer instance shared by several pipelines would be refit
# by every .fit() call. Each pipeline therefore gets its own instance.

# svm
svm_pipe = Pipeline([
    ('transform', make_numeric_transform()),
    ('model', SVC())
])

# logistic regression
lr_pipe = Pipeline([
    ('transform', make_numeric_transform()),
    ('model', LogisticRegression())
])

# knn
knn_pipe = Pipeline([
    ('transform', make_numeric_transform()),
    ('model', KNeighborsClassifier())
])

In [None]:
# Rebuild the feature matrix without 'Gender' (in addition to 'User ID' and the
# 'Purchased' target), then redo the 80/20 split with the same seed.
y = df['Purchased']
X = df.drop(['User ID', 'Purchased', 'Gender'], axis='columns')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Fit in the original order; the KNN fit stays the final expression so the cell
# still displays it.
for pipe in (svm_pipe, lr_pipe):
    pipe.fit(X_train, y_train)
knn_pipe.fit(X_train, y_train)

In [None]:
from utils.visualize import plot_classification

In [None]:
# plot_classification comes from the project module utils.visualize; presumably
# it plots the SVM pipeline's classification over the two named columns -- confirm
# against the helper. The return value is discarded so nothing is echoed.
_ = plot_classification(
    svm_pipe,
    X_train,
    y_train,
    'Age',
    'EstimatedSalary',
)

In [None]:
# plot_classification comes from the project module utils.visualize; presumably
# it plots the logistic-regression pipeline's classification over the two named
# columns -- confirm against the helper. The return value is discarded.
_ = plot_classification(
    lr_pipe,
    X_train,
    y_train,
    'Age',
    'EstimatedSalary',
)

In [None]:
# plot_classification comes from the project module utils.visualize; presumably
# it plots the KNN pipeline's classification over the two named columns -- confirm
# against the helper. The return value is discarded.
_ = plot_classification(
    knn_pipe,
    X_train,
    y_train,
    'Age',
    'EstimatedSalary',
)

In [None]:
# Share of each 'Purchased' class in the full dataset, drawn as a pie chart.
class_counts = df['Purchased'].value_counts()
class_counts.plot.pie(
    autopct='%.2f%%',
    explode=[0,0.1],  # one offset per slice, so this expects exactly two classes
    shadow=True,
    figsize=(4,4),
)

# handling class imbalance problem
`pip install imbalanced-learn` (the package is imported as `imblearn`)

- under-sampling
    - NearMiss
- over-sampling
    - SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [None]:
# Over-sampler: SMOTE generates synthetic minority samples using randomness, so
# the seed is fixed to make the resampled data identical across runs.
im1 = SMOTE(random_state=0)
# Under-sampler: NearMiss, constructed with its default settings.
im2 = NearMiss()

In [None]:
# Over-sample with SMOTE and compare the row counts before and after.
# This resamples the whole of X / y for illustration; when training a model,
# resample only the training split so synthetic rows never reach the test set.
smote_result = im1.fit_resample(X, y)
Xi, yi = smote_result
print(f'Original dataset shape {X.shape}')
print(f'Resampled dataset shape {Xi.shape}')


In [None]:
# Class shares after SMOTE over-sampling.
smote_counts = yi.value_counts()
smote_counts.plot.pie(
    autopct='%.2f%%',
    explode=[0,0.1],
    shadow=True,
    figsize=(4,4),
)

In [None]:
# Under-sample with NearMiss and compare the row counts before and after.
# As with SMOTE, the whole of X / y is resampled here for illustration only.
nearmiss_result = im2.fit_resample(X, y)
Xi2, yi2 = nearmiss_result
print(f'Original dataset shape {X.shape}')
print(f'Resampled dataset shape {Xi2.shape}')

In [None]:
# Class shares after NearMiss under-sampling.
nearmiss_counts = yi2.value_counts()
nearmiss_counts.plot.pie(
    autopct='%.2f%%',
    explode=[0,0.1],
    shadow=True,
    figsize=(4,4),
)