### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

### Import the dataset and clean the dataset

In [2]:
data=pd.read_csv("car_evaluation.csv")
data.columns =['Buying', 'Maintenance', 'Doors', 'Person','Luggage','Safety','Outcome']
data["Doors"].replace({"5more": 6}, inplace=True)
data["Person"].replace({"more": 6}, inplace=True)
data.head()

Unnamed: 0,Buying,Maintenance,Doors,Person,Luggage,Safety,Outcome
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


### Shape of the data

In [3]:
data.shape

(1727, 7)

### Dividing the x and y axis

In [4]:
x=data.iloc[:,:-1]
y=data.Outcome
x.head()

Unnamed: 0,Buying,Maintenance,Doors,Person,Luggage,Safety
0,vhigh,vhigh,2,2,small,med
1,vhigh,vhigh,2,2,small,high
2,vhigh,vhigh,2,2,med,low
3,vhigh,vhigh,2,2,med,med
4,vhigh,vhigh,2,2,med,high


### Converting strings to numbers

In [5]:
from sklearn.preprocessing import LabelEncoder
enc=LabelEncoder()
x.loc[:,['Buying','Maintenance','Luggage','Safety']]=x.loc[:,['Buying','Maintenance','Luggage','Safety']].apply(enc.fit_transform)
x.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,Buying,Maintenance,Doors,Person,Luggage,Safety
0,3,3,2,2,2,2
1,3,3,2,2,2,0
2,3,3,2,2,1,1
3,3,3,2,2,1,2
4,3,3,2,2,1,0


### Function to predict the data using KNN algorithm

In [6]:
def naive(x_train_fn,y_train_fn,x_test_fn,y_test_fn):
    model=KNeighborsClassifier()
    model.fit(x_train_fn,y_train_fn)
    prediction=model.predict(x_test_fn)
    print("Accuracy is: ",accuracy_score(y_test_fn,prediction))
    print(pd.crosstab(y_test_fn,prediction))

### Predict the normal data using KNN algorithm

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=10)
naive(x_train,y_train,x_test,y_test)
# Accuracy is 85%

Accuracy is:  0.8516377649325626
col_0    acc  good  unacc  vgood
Outcome                         
acc       77     6     40      0
good      10     6      0      0
unacc      8     0    350      0
vgood     11     1      1      9


### Predict the oversampled data using KNN algorithm

In [8]:
oversampler=RandomOverSampler()
x_train_OS,y_train_OS=oversampler.fit_resample(x_train.astype('float'),y_train)
print("Before oversampler: ",Counter(y_train))
print("After oversampler: ",Counter(y_train_OS))

Before oversampler:  Counter({'unacc': 851, 'acc': 261, 'good': 53, 'vgood': 43})
After oversampler:  Counter({'unacc': 851, 'acc': 851, 'good': 851, 'vgood': 851})


##### Accuracy is 69% which is comparatively low

In [9]:
naive(x_train_OS,y_train_OS,x_test,y_test)

Accuracy is:  0.697495183044316
col_0    acc  good  unacc  vgood
Outcome                         
acc       79    27      4     13
good       6     8      0      2
unacc     77    14    260      7
vgood      5     2      0     15


### Predict the undersampled data using KNN algorithm

In [10]:
undersampler=RandomUnderSampler()
x_train_US,y_train_US=undersampler.fit_resample(x_train.astype('float'),y_train)
print("Before undersampler: ",Counter(y_train))
print("After undersampler: ",Counter(y_train_US))

Before undersampler:  Counter({'unacc': 851, 'acc': 261, 'good': 53, 'vgood': 43})
After undersampler:  Counter({'acc': 43, 'good': 43, 'unacc': 43, 'vgood': 43})


##### Accuracy is 48% which is lower than Oversampled data

In [11]:
naive(x_train_US,y_train_US,x_test,y_test)

Accuracy is:  0.4836223506743738
col_0    acc  good  unacc  vgood
Outcome                         
acc       30    63      4     26
good       2    14      0      0
unacc     66    79    189     24
vgood      1     3      0     18


### Predict the Synthetic Minority Over-sampled data using KNN algorithm

In [12]:
smote=SMOTE()
x_train_smote,y_train_smote=smote.fit_resample(x_train.astype('float'),y_train)
print("Before SMOTE: ",Counter(y_train))
print("Before SMOTE: ",Counter(y_train_smote))

Before SMOTE:  Counter({'unacc': 851, 'acc': 261, 'good': 53, 'vgood': 43})
Before SMOTE:  Counter({'unacc': 851, 'acc': 851, 'good': 851, 'vgood': 851})


##### Accuracy is 84% which is better than the other two sampling techniques, but lower than the actual data by 1%. But it is fine because the accuracy for the all the individual class is comparatively better for Synthetic Minority Over-sampled data

In [13]:
naive(x_train_smote,y_train_smote,x_test,y_test)

Accuracy is:  0.8420038535645472
col_0    acc  good  unacc  vgood
Outcome                         
acc       88    16     11      8
good       1    15      0      0
unacc     35     6    316      1
vgood      3     1      0     18
