In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import sklearn.model_selection as skm
import matplotlib.pyplot as plt
import seaborn as sns
from ISLP import confusion_table
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [37]:
import warnings

warnings.filterwarnings('ignore')

## 1. Load data

In [38]:
# Read the labelled training set from the working directory and display it.
raw = pd.read_csv(filepath_or_buffer="training.csv")
raw

Unnamed: 0,query_id,url_id,query_length,is_homepage,sig1,sig2,sig3,sig4,sig5,sig6,sig7,sig8,relevance,id
0,4631,28624,2,1,0.09,0.15,1288,352,376,13,0.46,0.35,0,4631.286240
1,4631,28625,2,1,0.20,0.35,4662,337,666,28,0.43,0.27,1,4631.286250
2,4631,28626,2,1,0.36,0.49,1121,385,270,15,0.34,0.20,1,4631.286260
3,4631,28627,2,1,0.21,0.45,2925,478,640,14,0.44,0.33,1,4631.286270
4,4631,28628,2,1,0.25,0.42,1328,429,412,27,0.40,0.57,1,4631.286280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80041,17038,102812,1,0,0.13,0.29,10,7,10,0,0.25,0.79,0,17038.102812
80042,17038,102813,1,0,0.12,0.20,121,0,46,1,0.30,0.50,0,17038.102813
80043,17038,102814,1,0,0.04,0.37,179,15,92,0,0.32,0.42,0,17038.102814
80044,17038,102815,1,0,0.22,0.50,643,223,114,10,0.32,0.86,1,17038.102815


## 2. Exploratory data analysis

**Check if there are missing data.**

In [39]:
# Count missing entries per column (isna is the canonical spelling of isnull).
raw.isna().sum()

query_id        0
url_id          0
query_length    0
is_homepage     0
sig1            0
sig2            0
sig3            0
sig4            0
sig5            0
sig6            0
sig7            0
sig8            0
relevance       0
id              0
dtype: int64

No missing data.

**Check duplicates**

In [40]:
# Boolean mask marking rows that exactly repeat an earlier row
# (the first occurrence is not flagged), followed by their count.
duplicates = raw.duplicated(keep='first')
num_duplicates = duplicates.sum()
num_duplicates

0

No duplicates.

## 3. Declare predictors and target variable.

In [41]:
# Feature matrix: everything except the label and the identifier columns.
X = raw.drop(columns=['relevance', 'query_id', 'url_id', 'id'])
# Target: the relevance label.
y = raw.loc[:, 'relevance']

## 4. Scale the predictors

In [42]:
# Standardise every predictor and keep the result as a DataFrame with the
# original column names.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_scaled.describe()

Unnamed: 0,query_length,is_homepage,sig1,sig2,sig3,sig4,sig5,sig6,sig7,sig8
count,80046.0,80046.0,80046.0,80046.0,80046.0,80046.0,80046.0,80046.0,80046.0,80046.0
mean,-5.254995000000001e-17,4.9709410000000006e-17,1.185924e-16,5.592308e-17,-8.521613e-18,-3.550672e-18,1.1362150000000001e-17,1.4912820000000002e-17,-2.2724300000000002e-17,1.768235e-16
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,-1.041878,-0.6065367,-1.243548,-2.010775,-0.2064047,-0.1540609,-0.291605,-0.1565392,-2.304098,-2.039932
25%,-0.3848838,-0.6065367,-0.7006331,-0.7936921,-0.20309,-0.14908,-0.2863082,-0.1565392,-0.7173708,-0.7861737
50%,-0.3848838,-0.6065367,-0.2255822,-0.04026011,-0.188684,-0.1084019,-0.2577053,-0.1565392,-0.06825495,-0.05121198
75%,0.27211,1.648705,0.3851975,0.7711282,-0.09856135,-0.0314043,-0.1136316,-0.1343338,0.7251088,0.7269828
max,10.12702,1.648705,5.542893,2.973468,28.42023,137.0178,24.60031,40.31295,4.042812,2.023974


## 5. Split the data

In [43]:
from sklearn.model_selection import train_test_split

# test_size=0.8 holds out 80% of the rows and leaves 20% for training.
# NOTE(review): confirm this ratio is intentional (e.g. to keep SVM fitting
# time manageable) rather than a swapped 0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.8,
    random_state=0,
)

## 6. Run SVM with default hyperparameters

In [44]:
# Baseline: an SVC with all hyperparameters left at their defaults,
# trained on the training split and scored on the held-out split.
svc = SVC().fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))

Model accuracy score with default hyperparameters: 0.6564


## 7. Run SVM with linear kernel

In [45]:
# Linear-kernel SVM whose regularisation strength C is tuned by 5-fold CV.
# No explicit pre-fit is needed: GridSearchCV clones the estimator and fits
# the clones itself.
svm_linear = SVC(C=0.01, kernel='linear')

# Ten log-spaced candidate values for C, from 10**-2 to 10**1.
# `para_grid` and `kfold` are reused by the RBF search further down.
para_grid = {'C': np.logspace(-2, 1, 10)}
# Shuffled folds with a fixed seed so the search is reproducible.
kfold = skm.KFold(5,
                  random_state=0,
                  shuffle=True)

# Search over the linear estimator. The original code passed `svc` (the
# default-kernel model from the previous section) here, so the C chosen for
# the "linear" model was actually tuned on a different kernel.
grid = skm.GridSearchCV(svm_linear,
                        para_grid,
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_train, y_train)
best_c = grid.best_params_['C']
best_c

10.0

In [46]:
# Train a linear-kernel SVM on the training split with the C value selected
# by the preceding grid search.
svm_optimalC = SVC(kernel='linear', C=best_c)
svm_optimalC.fit(X_train, y_train)

In [47]:
# In-sample predictions of the tuned linear model, cross-tabulated against
# the true training labels (predictions are passed first, truth second).
y_train_hat = svm_optimalC.predict(X_train)
confusion_table(y_train_hat, y_train)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7144,3775
1,1825,3265


In [48]:
# Held-out accuracy of the tuned linear-kernel model.
y_pred_test = svm_optimalC.predict(X_test)
print('Model accuracy score with linear kernel: {0:0.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))

Model accuracy score with linear kernel: 0.6547


## 8. Run SVM with rbf kernel

In [74]:
# RBF-kernel SVM whose C is tuned with the same candidate grid (`para_grid`)
# and CV folds (`kfold`) defined in the linear-kernel section.
# The estimator is not fitted beforehand: GridSearchCV clones it and fits the
# clones, so a separate fit here would only cost time.
svm_rbf = SVC(C=0.01, kernel='rbf')
grid = skm.GridSearchCV(svm_rbf,
                        para_grid,
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_train, y_train)
best_c = grid.best_params_['C']
best_c

10.0

In [75]:
# Train an RBF-kernel SVM on the training split with the C value selected by
# the preceding grid search. This model is the one used for the final export.
svm_optimalC_rbf = SVC(kernel='rbf', C=best_c)
svm_optimalC_rbf.fit(X_train, y_train)

In [52]:
# Held-out accuracy of the tuned RBF-kernel model.
y_pred_test_rbf = svm_optimalC_rbf.predict(X_test)
print('Model accuracy score with rbf kernel: {0:0.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred_test_rbf)))

Model accuracy score with rbf kernel: 0.6588


## 9. Run SVM with polynomial kernel

In [54]:
# Joint search over C and the polynomial degree for a poly-kernel SVM.
param_grid = {
    'C': np.logspace(-2, 1, 10),  # 10 log-spaced values of C from 0.01 to 10
    'degree': [2, 3, 4, 5]        # Polynomial degrees
}
# The estimator is not fitted beforehand: GridSearchCV clones it and fits the
# clones, so a separate fit here would only cost time. The C and degree given
# here are placeholders that the grid overrides.
svm_poly = SVC(C=0.01, kernel='poly', degree=2)
grid = skm.GridSearchCV(svm_poly,
                        param_grid,
                        refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_train, y_train)
best_c = grid.best_params_['C']
best_degree = grid.best_params_['degree']
print("Best C: ", best_c)
print("Best degree: ", best_degree)

Best C:  10.0
Best degree:  3


In [55]:
# Train a polynomial-kernel SVM with the C and degree chosen by the grid
# search, then report its accuracy on the held-out split.
svm_optimalC_poly = SVC(kernel='poly', degree=best_degree, C=best_c)
svm_optimalC_poly.fit(X_train, y_train)
y_pred_test_poly = svm_optimalC_poly.predict(X_test)
print('Model accuracy score with polynomial kernel: {0:0.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred_test_poly)))

Model accuracy score with polynomial kernel: 0.6370


## 10. Prediction

In [76]:
# Score the unlabelled test set with the tuned RBF model and export the result.
test_data = pd.read_csv("test.csv")

# Select exactly the predictors used for training, in the training order.
# Dropping a fixed list of identifier columns (as before) silently depends on
# test.csv containing no other extra columns and on its column order; an
# explicit selection fails loudly with a KeyError if a predictor is missing.
X_true_test = test_data[X.columns]

# Reuse the scaler fitted on the training data and keep the column names,
# because the model was fitted on a DataFrame with these names.
X_true_test_scaled = pd.DataFrame(
    scaler.transform(X_true_test),
    columns=X.columns,
    index=test_data.index,
)

predictions = svm_optimalC_rbf.predict(X_true_test_scaled)
results = pd.DataFrame({
    'prediction': predictions,
    'id': test_data['id']
})

results.to_csv("predictions_SVM_rbf.csv", index=False)