<a href="https://colab.research.google.com/github/gabrielnichio/hyperparameter-optimization-classification/blob/main/classification_hyperparameter_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Optimization

Dataset from Kaggle: https://www.kaggle.com/datasets/taweilo/mba-admission-dataset

The focus here is only to document and learn about hyperparameter optimization for classification.

In [70]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [71]:
# Load the MBA admission CSV into a DataFrame. The path sits under /content,
# presumably the Colab working directory -- adjust when running elsewhere.
data = pd.read_csv(filepath_or_buffer='/content/MBA.csv')

In [72]:
# Preview the first rows to get a feel for the columns and their values.
data.head()

Unnamed: 0,application_id,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission
0,1,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,2,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,3,Female,True,3.3,Business,,710.0,5.0,Technology,Admit
3,4,Male,False,3.47,STEM,Black,690.0,6.0,Technology,
4,5,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,


In [73]:
# Inspect column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   application_id  6194 non-null   int64  
 1   gender          6194 non-null   object 
 2   international   6194 non-null   bool   
 3   gpa             6194 non-null   float64
 4   major           6194 non-null   object 
 5   race            4352 non-null   object 
 6   gmat            6194 non-null   float64
 7   work_exp        6194 non-null   float64
 8   work_industry   6194 non-null   object 
 9   admission       1000 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(5)
memory usage: 441.7+ KB


In [74]:
# Remove the identifier column and fill missing values. The original note says
# the interpretation of the missing values comes from the dataset page on Kaggle.
# `drop` is reassigned (instead of inplace=True) with errors='ignore' so this
# cell can be re-run without a KeyError once `application_id` is already gone.
data = data.drop(columns='application_id', errors='ignore')

# A missing admission status is treated as 'Deny'; a missing race is grouped
# under 'Other'.
data['admission'] = data['admission'].fillna('Deny')
data['race'] = data['race'].fillna('Other')


In [75]:
# Distribution of the race categories after filling the missing values.
data['race'].value_counts()

Unnamed: 0_level_0,count
race,Unnamed: 1_level_1
Other,2079
White,1456
Asian,1147
Black,916
Hispanic,596


In [76]:
# Re-check dtypes and non-null counts to confirm the cleaning step left no gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         6194 non-null   object 
 1   international  6194 non-null   bool   
 2   gpa            6194 non-null   float64
 3   major          6194 non-null   object 
 4   race           6194 non-null   object 
 5   gmat           6194 non-null   float64
 6   work_exp       6194 non-null   float64
 7   work_industry  6194 non-null   object 
 8   admission      6194 non-null   object 
dtypes: bool(1), float64(3), object(5)
memory usage: 393.3+ KB


In [77]:
# Class balance of the target: shows how many rows fall into each admission outcome.
data['admission'].value_counts()

Unnamed: 0_level_0,count
admission,Unnamed: 1_level_1
Deny,5194
Admit,900
Waitlist,100


In [78]:
# Row counts per gender, with bars colored by admission outcome.
fig = px.histogram(data_frame=data, x='gender', color='admission')
fig.show()

In [79]:
# Row counts per undergraduate major.
fig = px.histogram(data_frame=data, x='major')
fig.show()

In [80]:
# GPA distribution for each race group. Plotly Express builds and renders its
# own figure, so no matplotlib figure is created here: a separate plt.figure()
# call only emits an empty "<Figure ... with 0 Axes>" output.
fig = px.box(data, x="gpa", color='race')
fig.show()

<Figure size 1000x1000 with 0 Axes>

## Data Transformation

In [81]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [82]:
# Separate the target column ('admission') from the feature columns.
y = data['admission']
x = data.drop(columns=['admission'])

In [83]:
# Remember the original feature names; they are needed later to label the
# encoded columns via get_feature_names_out.
columns = x.columns

# One-hot encode the categorical features. drop='if_binary' keeps a single
# column for two-category features; the remaining (numeric) columns pass
# through unchanged. sparse_threshold=0 makes the transformer return a dense array.
one_hot = make_column_transformer(
    (
        OneHotEncoder(drop='if_binary'),
        ['gender', 'international', 'major', 'race', 'work_industry'],
    ),
    remainder='passthrough',
    sparse_threshold=0,
)

# NOTE: `x` and `y` are overwritten with their encoded versions below, so this
# cell is meant to run once, right after the feature/target split cell.
x = one_hot.fit_transform(x)

# Encode the admission labels as integers; keep the encoder to map them back.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [84]:
# Show the encoded feature matrix with the column names generated by the transformer.
pd.DataFrame(data=x, columns=one_hot.get_feature_names_out(columns))

Unnamed: 0,onehotencoder__gender_Male,onehotencoder__international_True,onehotencoder__major_Business,onehotencoder__major_Humanities,onehotencoder__major_STEM,onehotencoder__race_Asian,onehotencoder__race_Black,onehotencoder__race_Hispanic,onehotencoder__race_Other,onehotencoder__race_White,...,onehotencoder__work_industry_Media/Entertainment,onehotencoder__work_industry_Nonprofit/Gov,onehotencoder__work_industry_Other,onehotencoder__work_industry_PE/VC,onehotencoder__work_industry_Real Estate,onehotencoder__work_industry_Retail,onehotencoder__work_industry_Technology,remainder__gpa,remainder__gmat,remainder__work_exp
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.30,620.0,3.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.28,680.0,5.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.30,710.0,5.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.47,690.0,6.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.35,590.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6189,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.49,640.0,5.0
6190,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.18,670.0,4.0
6191,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.22,680.0,5.0
6192,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.36,590.0,5.0


In [85]:
# Count how many samples belong to each encoded class. The call parentheses are
# required: without them `y_counts` would hold the bound method, not the counts.
y_counts = pd.Series(y).value_counts()

In [86]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
    random_state=5,
)

In [87]:
# Learn the min/max of every feature from the training split only, then apply
# that same scaling to both splits so the test set does not influence the fit.
scaler = MinMaxScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## KNN

For this notebook I'm using K-Nearest Neighbors (KNN) as the classification model.

In [88]:
from sklearn.neighbors import NearestNeighbors