# SMOTE 
### (Synthetic Minority Over-sampling Technique)

In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the car-evaluation dataset from the working directory and preview
# its first rows.
data = pd.read_csv('car_evaluation.csv')
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# (rows, columns) of the loaded frame.
data.shape

(1728, 7)

In [4]:
# Count missing values per column to check whether any imputation is needed.
data.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
outcome     0
dtype: int64

In [5]:
# Separate the features (every column except the last) from the target.
# .copy() gives X its own data: the feature columns are overwritten with
# encoded values later on, and assigning into a slice of `data` is the
# pattern that triggers pandas' SettingWithCopyWarning.
X = data.iloc[:, :-1].copy()
y = data.outcome

In [9]:
# Integer-encode the four string-valued feature columns in place.
# LabelEncoder numbers the categories in sorted (alphabetical) order, e.g.
# for safety: high -> 0, low -> 1, med -> 2.
# NOTE(review): that is not the natural low < med < high ranking, which may
# matter for a distance-based model such as KNN -- confirm this is intended.
enc = LabelEncoder()
for column in ['buying', 'maint', 'lug_boot', 'safety']:
    X[column] = enc.fit_transform(X[column])
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2


In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [12]:
# Baseline: a 5-nearest-neighbours classifier trained on the original
# training split, then used to label the held-out test rows.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_predict = model.predict(X_test)

In [44]:
# Score the baseline predictions: overall accuracy, per-class
# precision/recall/F1, and a confusion matrix (rows = actual class,
# columns = predicted class).
baseline_accuracy = accuracy_score(y_test, y_predict)
print(baseline_accuracy)
print(classification_report(y_test, y_predict))
pd.crosstab(y_test, y_predict)

0.9364161849710982
              precision    recall  f1-score   support

         acc       0.85      0.85      0.85       102
        good       0.67      0.95      0.78        21
       unacc       0.99      0.96      0.97       371
       vgood       0.89      0.96      0.92        25

    accuracy                           0.94       519
   macro avg       0.85      0.93      0.88       519
weighted avg       0.94      0.94      0.94       519



col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,87,8,5,2
good,1,20,0,0
unacc,14,1,355,1
vgood,0,1,0,24


## Handling the Imbalanced Dataset

In [17]:
!pip install imblearn

Note: you may need to restart the kernel to use updated packages.


'C:\Users\PARVEEZ' is not recognized as an internal or external command,
operable program or batch file.


In [46]:
from imblearn.over_sampling import SMOTE

# SMOTE draws random neighbours and interpolation steps. Fixing the seed
# makes the synthetic samples (and every score computed from them) the same
# on each run; 10 matches the seed used for the train/test split.
smote = SMOTE(random_state=10)

In [47]:
# Oversample the minority classes of the training split only; the test
# split is left untouched.
# fit_resample replaces fit_sample, a deprecated alias that newer
# imbalanced-learn releases no longer provide.
# The features are cast to float, presumably so that the interpolated
# synthetic values keep their fractional part (see the preview below).
# NOTE(review): the features are integer codes of categories, so fractional
# synthetic values have no direct category meaning -- confirm plain SMOTE is
# the intended variant.
X_train_smote, y_train_smote = smote.fit_resample(X_train.astype('float'), y_train)
X_train_smote.tail()  # tail, because the synthetic rows are appended at the end

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
3351,1.0,1.388331,4.0,5.0,1.0,0.0
3352,1.0,0.0,3.0,4.702488,0.0,0.0
3353,2.0,2.0,2.490094,5.0,0.0,0.0
3354,1.0,2.0,3.628685,5.0,0.0,0.0
3355,1.844997,2.0,4.0,4.0,1.0,0.0


In [48]:
# Class counts of the training labels before and after oversampling.
for labels in (y_train, y_train_smote):
    print(Counter(labels))

Counter({'unacc': 839, 'acc': 282, 'good': 48, 'vgood': 40})
Counter({'acc': 839, 'unacc': 839, 'vgood': 839, 'good': 839})


In [49]:
# As the counts above show, SMOTE has balanced the training set: every class now has as many samples as the majority class (839).

In [50]:
# Refit the same KNN model on the SMOTE-balanced training data and score it
# on the untouched test split.
model.fit(X_train_smote, y_train_smote)
y_predict = model.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))
# The confusion matrix has to be the cell's last expression to be rendered;
# placed mid-cell, its result was silently discarded.
pd.crosstab(y_test, y_predict)

0.9479768786127167
              precision    recall  f1-score   support

         acc       0.89      0.88      0.89       102
        good       0.67      0.95      0.78        21
       unacc       0.99      0.97      0.98       371
       vgood       0.88      0.92      0.90        25

    accuracy                           0.95       519
   macro avg       0.86      0.93      0.89       519
weighted avg       0.95      0.95      0.95       519

