In [None]:
"""
Objective:
• Employ SVM from scikit learn for binary classification.
• Impact of preprocessing data and hyper parameter search using grid search.
"""

In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score

In [2]:
"""
Load the data from “college.csv” that has attributes collected about private and 
public colleges for a particular year. We will try to predict the private/public 
status of the college from other attributes.
"""

CSV_PATH = r'D:\CourseWork\data-science-python-certification-course\Assignments\09 Supervised Learning - II\Case Study III\resources\College.csv'
df = pd.read_csv(CSV_PATH)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    object 
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: float64(1), int64(16), object(1)
memory usage: 

In [5]:
"""
Use LabelEncoder to encode the target variable into numerical form and split 
the data such that 20% of the data is set aside for testing.
"""

le = LabelEncoder()
df['Private'] = le.fit_transform(df['Private'])
df.head(5)

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [7]:
x = df.iloc[:, 1:]
y = df['Private']
x_tr, x_tt, y_tr, y_tt = train_test_split(x, y, test_size=0.2, random_state=57)

In [10]:
# Fit a linear SVM from scikit learn and observe the accuracy.

svc = LinearSVC()
svc.fit(x_tr, y_tr)
y_pred = svc.predict(x_tt)
print(accuracy_score(y_tt, y_pred))

0.9166666666666666




In [12]:
# Preprocess the data using StandardScalar and fit the same model again and observe the change in accuracy.

sdf = StandardScaler().fit_transform(x)
sdf = pd.DataFrame(sdf, columns=x.columns)

x = sdf
x_tr, x_tt, y_tr, y_tt = train_test_split(x, y, test_size=0.2, random_state=57)

svc.fit(x_tr, y_tr)
y_pred = svc.predict(x_tt)
print(accuracy_score(y_tt, y_pred))

0.9487179487179487




In [19]:
"""
Use scikit learns grid search to select the best hyperparameter for a non-linear 
SVM, and identify the model with the best score and its parameters.
"""

parameter_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['poly']},
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
]

clf = GridSearchCV(
    estimator=SVC(),
    param_grid=parameter_grid,
    cv=ShuffleSplit()
)

clf.fit(x_tr, y_tr)
print("Best Score:", clf.best_score_)
print('Best C:', clf.best_estimator_.C)
print('Best Kernel:', clf.best_estimator_.kernel)
print('Best Gamma:', clf.best_estimator_.gamma)

Best Score: 0.9507936507936507
Best C: 1000
Best Kernel: rbf
Best Gamma: 0.001
