In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [2]:
dataset= pd.read_csv('mba.csv')
dataset.shape

(6194, 10)

In [3]:
dataset.head(10)

Unnamed: 0,application_id,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission
0,1,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,2,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,3,Female,True,3.3,Business,,710.0,5.0,Technology,Admit
3,4,Male,False,3.47,STEM,Black,690.0,6.0,Technology,
4,5,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,
5,6,Male,False,3.18,Business,White,610.0,6.0,Consulting,
6,7,Female,False,2.93,STEM,Other,590.0,3.0,Technology,Admit
7,8,Male,True,3.02,Business,,630.0,6.0,Financial Services,
8,9,Male,False,3.24,Business,White,590.0,2.0,Nonprofit/Gov,
9,10,Male,False,3.27,Humanities,Asian,690.0,3.0,Consulting,


In [4]:
dataset.drop('application_id', axis=1, inplace=True)

In [5]:
dataset.isnull().sum()

gender              0
international       0
gpa                 0
major               0
race             1842
gmat                0
work_exp            0
work_industry       0
admission        5194
dtype: int64

In [6]:
dataset['race'].fillna('International', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['race'].fillna('International', inplace=True)


In [7]:
dataset.isnull().sum()

gender              0
international       0
gpa                 0
major               0
race                0
gmat                0
work_exp            0
work_industry       0
admission        5194
dtype: int64

In [8]:
dataset.drop('international', axis=1, inplace=True)

In [9]:
dataset.isnull().sum()

gender              0
gpa                 0
major               0
race                0
gmat                0
work_exp            0
work_industry       0
admission        5194
dtype: int64

In [10]:
dataset.replace('Admit', '1', inplace=True)
dataset.head(15)

Unnamed: 0,gender,gpa,major,race,gmat,work_exp,work_industry,admission
0,Female,3.3,Business,Asian,620.0,3.0,Financial Services,1
1,Male,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,Female,3.3,Business,International,710.0,5.0,Technology,1
3,Male,3.47,STEM,Black,690.0,6.0,Technology,
4,Male,3.35,STEM,Hispanic,590.0,5.0,Consulting,
5,Male,3.18,Business,White,610.0,6.0,Consulting,
6,Female,2.93,STEM,Other,590.0,3.0,Technology,1
7,Male,3.02,Business,International,630.0,6.0,Financial Services,
8,Male,3.24,Business,White,590.0,2.0,Nonprofit/Gov,
9,Male,3.27,Humanities,Asian,690.0,3.0,Consulting,


In [11]:
dataset.replace('Waitlist', '1', inplace=True)
dataset.head(15)

Unnamed: 0,gender,gpa,major,race,gmat,work_exp,work_industry,admission
0,Female,3.3,Business,Asian,620.0,3.0,Financial Services,1.0
1,Male,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,Female,3.3,Business,International,710.0,5.0,Technology,1.0
3,Male,3.47,STEM,Black,690.0,6.0,Technology,
4,Male,3.35,STEM,Hispanic,590.0,5.0,Consulting,
5,Male,3.18,Business,White,610.0,6.0,Consulting,
6,Female,2.93,STEM,Other,590.0,3.0,Technology,1.0
7,Male,3.02,Business,International,630.0,6.0,Financial Services,
8,Male,3.24,Business,White,590.0,2.0,Nonprofit/Gov,
9,Male,3.27,Humanities,Asian,690.0,3.0,Consulting,


In [12]:
dataset['admission'].fillna('0', inplace=True)
dataset.head(9)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['admission'].fillna('0', inplace=True)


Unnamed: 0,gender,gpa,major,race,gmat,work_exp,work_industry,admission
0,Female,3.3,Business,Asian,620.0,3.0,Financial Services,1
1,Male,3.28,Humanities,Black,680.0,5.0,Investment Management,0
2,Female,3.3,Business,International,710.0,5.0,Technology,1
3,Male,3.47,STEM,Black,690.0,6.0,Technology,0
4,Male,3.35,STEM,Hispanic,590.0,5.0,Consulting,0
5,Male,3.18,Business,White,610.0,6.0,Consulting,0
6,Female,2.93,STEM,Other,590.0,3.0,Technology,1
7,Male,3.02,Business,International,630.0,6.0,Financial Services,0
8,Male,3.24,Business,White,590.0,2.0,Nonprofit/Gov,0


In [13]:
label_encoders = {}
categorical_cols = ["major", "work_industry", "gender", "race"]
for col in categorical_cols:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])
    label_encoders[col] = le

In [14]:
dataset['admission'] = pd.to_numeric(dataset['admission'])
dataset.head(5)

Unnamed: 0,gender,gpa,major,race,gmat,work_exp,work_industry,admission
0,0,3.3,0,0,620.0,3.0,3,1
1,1,3.28,1,1,680.0,5.0,6,0
2,0,3.3,0,3,710.0,5.0,13,1
3,1,3.47,2,1,690.0,6.0,13,0
4,1,3.35,2,2,590.0,5.0,1,0


In [15]:
y = dataset["admission"]
x = dataset.drop(columns= ["admission"])

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [17]:
dataset.head(5)

Unnamed: 0,gender,gpa,major,race,gmat,work_exp,work_industry,admission
0,0,3.3,0,0,620.0,3.0,3,1
1,1,3.28,1,1,680.0,5.0,6,0
2,0,3.3,0,3,710.0,5.0,13,1
3,1,3.47,2,1,690.0,6.0,13,0
4,1,3.35,2,2,590.0,5.0,1,0


In [18]:
y.head(5)

0    1
1    0
2    1
3    0
4    0
Name: admission, dtype: int64

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [20]:
knn_model3 = KNeighborsClassifier(n_neighbors=3)
knn_model3.fit(X_train, y_train)
y_pred_knn3 = knn_model3.predict(X_test)
report_knn3 = classification_report(y_test, y_pred_knn3)
print(report_knn3)

              precision    recall  f1-score   support

           0       0.87      0.92      0.89      1563
           1       0.37      0.25      0.30       296

    accuracy                           0.81      1859
   macro avg       0.62      0.59      0.60      1859
weighted avg       0.79      0.81      0.80      1859



In [21]:
knn_model4 = KNeighborsClassifier(n_neighbors=4)
knn_model4.fit(X_train, y_train)
y_pred_knn4 = knn_model4.predict(X_test)
report_knn4 = classification_report(y_test, y_pred_knn4)
print(report_knn4)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1563
           1       0.45      0.14      0.21       296

    accuracy                           0.84      1859
   macro avg       0.65      0.55      0.56      1859
weighted avg       0.79      0.84      0.80      1859



In [22]:
knn_model5 = KNeighborsClassifier(n_neighbors=5)
knn_model5.fit(X_train, y_train)
y_pred_knn5 = knn_model5.predict(X_test)
report_knn5 = classification_report(y_test, y_pred_knn5)
print(report_knn5)

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      1563
           1       0.39      0.22      0.28       296

    accuracy                           0.82      1859
   macro avg       0.63      0.58      0.59      1859
weighted avg       0.79      0.82      0.80      1859



In [23]:
knn_model6 = KNeighborsClassifier(n_neighbors=6)
knn_model6.fit(X_train, y_train)
y_pred_knn6 = knn_model6.predict(X_test)
report_knn6 = classification_report(y_test, y_pred_knn6)
print(report_knn6)

              precision    recall  f1-score   support

           0       0.85      0.97      0.91      1563
           1       0.40      0.11      0.18       296

    accuracy                           0.83      1859
   macro avg       0.62      0.54      0.54      1859
weighted avg       0.78      0.83      0.79      1859



In [24]:
knn_model7 = KNeighborsClassifier(n_neighbors=7)
knn_model7.fit(X_train, y_train)
y_pred_knn7 = knn_model7.predict(X_test)
report_knn7 = classification_report(y_test, y_pred_knn7)
print(report_knn7)

              precision    recall  f1-score   support

           0       0.86      0.95      0.90      1563
           1       0.40      0.17      0.24       296

    accuracy                           0.83      1859
   macro avg       0.63      0.56      0.57      1859
weighted avg       0.79      0.83      0.80      1859



In [25]:
knn_model8 = KNeighborsClassifier(n_neighbors=8)
knn_model8.fit(X_train, y_train)
y_pred_knn8 = knn_model8.predict(X_test)
report_knn8 = classification_report(y_test, y_pred_knn8)
print(report_knn8)

              precision    recall  f1-score   support

           0       0.85      0.97      0.91      1563
           1       0.37      0.08      0.14       296

    accuracy                           0.83      1859
   macro avg       0.61      0.53      0.52      1859
weighted avg       0.77      0.83      0.78      1859



In [26]:
from sklearn.metrics import accuracy_score
print('accuracy ', accuracy_score(y_pred_knn7,y_test))

accuracy  0.8278644432490586
