# Overfitting
In this notebook we will see the effect of overfitting using an SVM. We will see that the gap between training error and generalization error grows as we increase the degree of the polynomial kernel.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load scikit-learn's bundled breast cancer dataset; the returned object
# exposes the feature matrix (.data), labels (.target) and column names (.feature_names).
dataset = load_breast_cancer()

In [4]:
# Put the features into a DataFrame with readable column names, then append
# the target as the last column ('class') so it can be explored alongside them.
df = pd.DataFrame(dataset.data,columns=dataset.feature_names)
df['class'] = dataset.target

In [5]:
# Sanity check: (rows, columns) -- the feature columns plus the appended 'class' column.
df.shape

(569, 31)

In [7]:
# Per-column summary statistics; the features live on very different scales,
# which is why they are standardized before fitting the models.
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


In [8]:
# Mean of every feature within each class, to see how the two classes differ.
df.groupby('class').mean()

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


In [9]:
# Class balance: number of samples per label.
df['class'].value_counts()

1    357
0    212
Name: class, dtype: int64

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [27]:
# Select features and target by column name instead of by position, so the
# arrays stay correct even if more columns are ever added after 'class'.
X = df.drop(columns='class').values
y = df['class'].values

In [28]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
# NOTE(review): no stratify=y is passed, so class proportions may differ between the two splits.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [29]:
# Learn the standardization statistics from the training split only, then
# apply that same fitted transform to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [30]:
# Logistic regression baseline. fit() returns the estimator itself, so the
# bare name on the last line still shows its repr as the cell output.
lr = LogisticRegression(C=1, max_iter=200, solver='lbfgs').fit(X_train, y_train)
lr

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
def print_score(model, X_train, y_train, X_test, y_test, train=True):
    """Print classification metrics for a fitted model on one data split.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``; it is also handed to ``cross_val_score``
        when ``train`` is truthy.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.
    train : bool, default True
        If truthy, report on the training split and additionally print the
        mean and standard deviation of 10-fold cross-validated accuracy
        computed on the training data. Otherwise report on the test split.

    Returns
    -------
    None
        All results are written to stdout: accuracy, classification report
        and confusion matrix.
    """
    if train:
        # Predict once and reuse the result for all three metrics.
        y_pred = model.predict(X_train)

        print("Train Result:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_train, y_pred)))
        print("Classification Report: \n {} \n".format(classification_report(y_train, y_pred)))
        print("Confusion Matrix:")
        print(confusion_matrix(y_train, y_pred))

        # NOTE(review): if X_train was scaled before this call, the scaling
        # statistics were computed across all folds; wrap scaler + model in a
        # Pipeline if a strictly leak-free CV estimate is needed.
        res = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    else:
        # Plain else (instead of `elif train == False`) so that any falsy
        # value, e.g. None, reports test results rather than printing nothing.
        y_pred = model.predict(X_test)

        print("Test Results:\n")
        print("Accuracy Score: {0:.4f}".format(accuracy_score(y_test, y_pred)))
        print("Classification Report: \n {} \n".format(classification_report(y_test, y_pred)))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

In [32]:
# Logistic regression: training-split metrics plus 10-fold CV accuracy on the training data.
print_score(lr, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.9890

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       165
           1       0.99      1.00      0.99       290

   micro avg       0.99      0.99      0.99       455
   macro avg       0.99      0.99      0.99       455
weighted avg       0.99      0.99      0.99       455
 

Confusion Matrix:
[[161   4]
 [  1 289]]
Average Accuracy: 	 0.9781
Accuracy SD: 		 0.0198


In [33]:
# Logistic regression: metrics on the held-out test split.
print_score(lr, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9649
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        47
           1       0.97      0.97      0.97        67

   micro avg       0.96      0.96      0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114
 

Confusion Matrix:
[[45  2]
 [ 2 65]]


In [35]:
from sklearn.svm import SVC

# Linear-kernel SVM. gamma only affects the 'rbf', 'poly' and 'sigmoid'
# kernels, so it has no influence here. fit() returns the estimator, so the
# bare name on the last line still shows its repr as the cell output.
svm = SVC(kernel='linear', gamma='scale').fit(X_train, y_train)
svm

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
# Linear-kernel SVM (the model fitted in the preceding cell): training metrics plus 10-fold CV accuracy.
print_score(svm, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.9890

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       165
           1       0.99      1.00      0.99       290

   micro avg       0.99      0.99      0.99       455
   macro avg       0.99      0.99      0.99       455
weighted avg       0.99      0.99      0.99       455
 

Confusion Matrix:
[[161   4]
 [  1 289]]
Average Accuracy: 	 0.9715
Accuracy SD: 		 0.0198


In [38]:
# Linear-kernel SVM: metrics on the held-out test split.
print_score(svm, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9825
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        47
           1       0.99      0.99      0.99        67

   micro avg       0.98      0.98      0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114
 

Confusion Matrix:
[[46  1]
 [ 1 66]]


In [39]:
# RBF-kernel SVM. SVC is already imported in the linear-kernel cell above, so
# it is not re-imported here. This rebinds `svm`, so the print_score calls
# that follow report on this model.
svm = SVC(gamma='scale', kernel='rbf')
svm.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
# RBF-kernel SVM (the model fitted in the preceding cell): training metrics plus 10-fold CV accuracy.
print_score(svm, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.9846

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.96      0.98       165
           1       0.98      1.00      0.99       290

   micro avg       0.98      0.98      0.98       455
   macro avg       0.99      0.98      0.98       455
weighted avg       0.98      0.98      0.98       455
 

Confusion Matrix:
[[159   6]
 [  1 289]]
Average Accuracy: 	 0.9759
Accuracy SD: 		 0.0153


In [41]:
# RBF-kernel SVM: metrics on the held-out test split.
print_score(svm, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9825
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98        47
           1       0.97      1.00      0.99        67

   micro avg       0.98      0.98      0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114
 

Confusion Matrix:
[[45  2]
 [ 0 67]]


In [42]:
# Degree-3 polynomial-kernel SVM. SVC is already imported in the linear-kernel
# cell above, so it is not re-imported here. This rebinds `svm`.
# NOTE(review): coef0 is left at its default of 0.0, so the kernel is the
# homogeneous (gamma * <x, x'>) ** 3 with no constant or lower-order terms;
# try coef0=1 before attributing the accuracy drop to the degree alone.
svm = SVC(gamma='scale', kernel='poly', degree=3)
svm.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [43]:
# Degree-3 polynomial SVM (the model fitted in the preceding cell): training metrics plus 10-fold CV accuracy.
print_score(svm, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.9099

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.75      0.86       165
           1       0.88      1.00      0.93       290

   micro avg       0.91      0.91      0.91       455
   macro avg       0.94      0.88      0.90       455
weighted avg       0.92      0.91      0.91       455
 

Confusion Matrix:
[[124  41]
 [  0 290]]
Average Accuracy: 	 0.8923
Accuracy SD: 		 0.0208


In [44]:
# Degree-3 polynomial SVM: metrics on the held-out test split.
print_score(svm, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9035
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.77      0.87        47
           1       0.86      1.00      0.92        67

   micro avg       0.90      0.90      0.90       114
   macro avg       0.93      0.88      0.90       114
weighted avg       0.92      0.90      0.90       114
 

Confusion Matrix:
[[36 11]
 [ 0 67]]


In [45]:
# Degree-4 polynomial-kernel SVM. SVC is already imported in the linear-kernel
# cell above, so it is not re-imported here. This rebinds `svm`.
# NOTE(review): with the default coef0=0.0 the kernel is (gamma * <x, x'>) ** 4.
# Because the degree is even, K(x, x') == K(-x, x'), so the decision function
# gives x and -x the same score; on standardized (zero-centred) features this
# limits what the model can separate. Try coef0=1 for a fairer comparison.
svm = SVC(gamma='scale', kernel='poly', degree=4)
svm.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=4, gamma='scale', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [46]:
# Degree-4 polynomial SVM (the model fitted in the preceding cell): training metrics plus 10-fold CV accuracy.
print_score(svm, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.8352

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.55      0.71       165
           1       0.79      1.00      0.89       290

   micro avg       0.84      0.84      0.84       455
   macro avg       0.90      0.77      0.80       455
weighted avg       0.87      0.84      0.82       455
 

Confusion Matrix:
[[ 90  75]
 [  0 290]]
Average Accuracy: 	 0.8087
Accuracy SD: 		 0.0392


In [47]:
# Degree-4 polynomial SVM: metrics on the held-out test split.
print_score(svm, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.7719
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.47      0.63        47
           1       0.73      0.99      0.84        67

   micro avg       0.77      0.77      0.77       114
   macro avg       0.84      0.73      0.73       114
weighted avg       0.82      0.77      0.75       114
 

Confusion Matrix:
[[22 25]
 [ 1 66]]


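Our data is almost linearly separable, so the linear models (logistic regression and the linear-kernel SVM) already fit it well. As we increase the degree of the polynomial kernel, test accuracy falls (0.98 for the linear kernel, 0.90 at degree 3, 0.77 at degree 4) and the gap between training and test accuracy widens (about 0.006 at degree 3 versus 0.063 at degree 4), which is the signature of overfitting. Note, however, that training accuracy drops as well (0.99, 0.91, 0.84) and that the RBF kernel generalizes as well as the linear one, so the polynomial results also reflect a poorly suited kernel (the default `coef0=0` removes all lower-order terms) rather than overfitting alone.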