# K-fold cross validation

## importing required libraries:

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## loading dataset:

In [2]:
## reading file:

df = pd.read_csv('Social_Network_Ads.csv')
df

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


In [3]:
## loading first 5 dataset:

df.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [4]:
## to check numeric columns:

df.describe()

Unnamed: 0,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [5]:
## to check number of columns and rows:

df.shape

(400, 3)

In [6]:
## to check column summary:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Age              400 non-null    int64
 1   EstimatedSalary  400 non-null    int64
 2   Purchased        400 non-null    int64
dtypes: int64(3)
memory usage: 9.5 KB


## Data Preparation:

In [7]:
x = df.drop('Purchased',axis=1)
y = df['Purchased']

In [8]:
x.head()

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000


In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

## Feature scaling:

In [14]:
from sklearn.preprocessing import StandardScaler

st = StandardScaler()
X = st.fit_transform(x)

## Train Test Split:

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state= 35)

## Train SVM model:

In [17]:
from sklearn.svm import SVC

cf = SVC(kernel= 'rbf',random_state=42)
cf.fit(x_train,y_train)

SVC(random_state=42)

## Predicting the Model:

In [18]:
y_pred = cf.predict(x_test)
y_pred

array([0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

## confusion matrix:

In [19]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,y_pred)
cm

array([[61,  3],
       [20, 16]], dtype=int64)

## evaluate the model:

In [20]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test,y_pred)
acc

0.77

## Applying K-Fold cross validation:

In [21]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(estimator=cf , X = x_train, y = y_train, cv=10)
score

array([0.8       , 0.8       , 0.86666667, 0.73333333, 0.8       ,
       0.86666667, 0.7       , 0.7       , 0.66666667, 0.73333333])

In [22]:
#accuracy score:

score.mean()

0.7666666666666667

In [23]:
# standard deviation:

score.std()

0.0666666666666667