# Study about handling imbalanced datasets using the SMOTE technique

* This notebook is based on the [video](https://www.youtube.com/watch?v=dkXB8HH_4-k&ab_channel=DataMites)
* The original dataset is from the [link](https://archive.ics.uci.edu/ml/datasets/car+evaluation).

In [60]:
import pandas as pd

data = pd.read_csv("car_evaluation.csv")
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [61]:
data.shape

(1728, 7)

In [62]:
data.outcome.value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: outcome, dtype: int64

In [63]:
X = data.iloc[:,:-1]
y = data.outcome
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [64]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X.loc[:,["buying", "maint", "lug_boot", "safety"]] = X.loc[:,["buying", "maint", "lug_boot", "safety"]].apply(encoder.fit_transform)
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2


In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [66]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [67]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_predict, y_test))
table = pd.crosstab(y_test, y_predict)
table

0.9441233140655106


col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,91,1,10,0
good,8,13,0,0
unacc,2,0,369,0
vgood,5,2,1,17


In [68]:
print('TP acc:', table.loc['acc'][0])
print('TP good:', table.loc['good'][1])
print('TP unacc:', table.loc['unacc'][2])
print('TP vgood:', table.loc['vgood'][3])

TP acc: 91
TP good: 13
TP unacc: 369
TP vgood: 17


In [69]:
b_acc = table.loc['acc'][0] / table.loc['acc'].sum()
b_good = table.loc['good'][1] / table.loc['good'].sum()
b_unacc = table.loc['unacc'][2] / table.loc['unacc'].sum()
b_vgood = table.loc['vgood'][3] / table.loc['vgood'].sum()

## Performance of the model

In [70]:
print("Accuracy before SMOTE")
print('acc:', int(b_acc * 100))
print('good:', int(b_good * 100))
print('unacc:', int(b_unacc * 100))
print('vgood:', int(b_vgood * 100))

Accuracy before SMOTE
acc: 89
good: 61
unacc: 99
vgood: 68


In [51]:
!pip install imblearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [71]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [72]:
from collections import Counter
print("Before SMOTE: ", Counter(y_train))
print("After SMOTE: ", Counter(y_train_smote))

Before SMOTE:  Counter({'unacc': 839, 'acc': 282, 'good': 48, 'vgood': 40})
After SMOTE:  Counter({'acc': 839, 'unacc': 839, 'vgood': 839, 'good': 839})


In [73]:
model.fit(X_train_smote, y_train_smote)
y_predict = model.predict(X_test)
print(accuracy_score(y_test, y_predict))
smote_table = pd.crosstab(y_test, y_predict)
smote_table

0.8169556840077071


col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,82,16,4,0
good,4,17,0,0
unacc,53,8,306,4
vgood,5,1,0,19


In [74]:
a_acc = smote_table.loc['acc'][0] / smote_table.loc['acc'].sum()
a_good = smote_table.loc['good'][1] / smote_table.loc['good'].sum()
a_unacc = smote_table.loc['unacc'][2] / smote_table.loc['unacc'].sum()
a_vgood = smote_table.loc['vgood'][3] / smote_table.loc['vgood'].sum()
print("Accuracy after SMOTE")
print('acc:', int(a_acc * 100))
print('good:', int(a_good * 100))
print('unacc:', int(a_unacc * 100))
print('vgood:', int(a_vgood * 100))

Accuracy after SMOTE
acc: 80
good: 80
unacc: 82
vgood: 76


In [75]:
print("Gain of performance by class")
print('acc:', int((b_acc - a_acc) * 100))
print('good:', int((b_good - a_good) * 100))
print('unacc:', int((b_unacc - a_unacc) * 100))
print('vgood:', int((b_vgood - a_vgood) * 100))

Gain of performance by class
acc: 8
good: -19
unacc: 16
vgood: -7


### The accuracy is better in classes with fewer examples!