# **Support vector machine sigmoid kernel (SVM with sigmoid)**

## **1. Import necessary libraries**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import utils
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings

warnings.filterwarnings('ignore')

## **2. Insert and preprocess data**

### **2.1. Load data**
Load our data from path `../Data/processed_data.csv`.

In [3]:
# Load our data from path `../Data/processed_data.csv`
courses_df = pd.read_csv('../Data/processed_data.csv', sep=',',engine='python', encoding='utf-8')
courses_df.sample(5)

Unnamed: 0,name,general,specify,enrollment,language,rating,level,duration,instructor,instructor_rate,offered by
2986,Introduction to Mobile Development,Computer Science,Mobile and Web Development,21731,English,4.7,Beginner,12,Taught by Meta Staff,4.9,Meta
1889,ESG and Climate Change,Business,Business Strategy,10023,English,4.8,Other,7,Sarah Light,4.9,University of Pennsylvania
176,Advanced Writing,Language Learning,Learning English,213079,English,4.7,Other,19,"Tamy Chapman, Helen Nam, Brad Gilpin",4.8,"University of California, Irvine"
5516,Motores gráficos en videojuegos: game engine,Computer Science,Design and Product,7700,Spanish,3.9,Other,9,"Jordi Arnal Montoya, Enric Martí Gòdia",4.7,Universitat Autònoma de Barcelona
4419,Pro Tools Basics,Arts and Humanities,Music and Art,35747,English,4.7,Other,11,Chrissy Tignor,4.9,Berklee


### **2.2. Preprocess data**

- The `name` and `instructor` features needs to be removed because it is not useful in training the model.

In [4]:
data_ = courses_df.copy().drop(columns=['name', 'instructor'])
data_.sample(5)

Unnamed: 0,general,specify,enrollment,language,rating,level,duration,instructor_rate,offered by
4963,Language Learning,Learning English,8311,English,4.8,Intermediate,20,5.0,"University of California, Irvine"
2133,Arts and Humanities,Music and Art,44076,English,4.8,Beginner,9,4.9,University of London
3109,Data Science,Machine Learning,10361,English,3.3,Intermediate,39,2.8,University of Colorado Boulder
3958,Health,Nutrition,234223,Spanish,4.8,Beginner,10,4.8,Universidad Nacional Autónoma de México
1569,Physical Science and Engineering,Physics and Astronomy,2370,Chinese (Simplified),5.0,Other,18,4.7,Peking University


- Explore missing values in variables:
    + View summary of dataset.

In [5]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5718 entries, 0 to 5717
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   general          5717 non-null   object 
 1   specify          5717 non-null   object 
 2   enrollment       5718 non-null   int64  
 3   language         5718 non-null   object 
 4   rating           5718 non-null   float64
 5   level            5718 non-null   object 
 6   duration         5718 non-null   int64  
 7   instructor_rate  5718 non-null   float64
 8   offered by       5703 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 402.2+ KB


In [6]:
print('Number missing values in each column:\n',data_.isnull().sum())

Number missing values in each column:
 general             1
specify             1
enrollment          0
language            0
rating              0
level               0
duration            0
instructor_rate     0
offered by         15
dtype: int64


- Explore missing values in variables:
    + Drop rows with missing values: Because the number of missing values each variable is insignificant, we will remove rows containing missing data.

In [7]:
# Drop rows with missing values
data_.dropna(inplace=True)
print('Number missing values in each column:\n',data_.isnull().sum())

Number missing values in each column:
 general            0
specify            0
enrollment         0
language           0
rating             0
level              0
duration           0
instructor_rate    0
offered by         0
dtype: int64


## **3. Prepare for training model**

### **3.1. Define kind of features**

- Define selection and target features to prepare data for training model.

In [8]:
# Define selection and target features to prepare data for training model
target = ['rating']
specificities = list(set(data_.columns) - set(target))

- Define numerical and categorical features to transformer.

In [9]:
# Define numerical and categorical features
numeracy_ = list(set(specificities) - set(courses_df.select_dtypes(include=['object']).columns))
category_ = list(set(specificities) - set(courses_df.select_dtypes(exclude=['object']).columns))
print('Numerical features:', numeracy_)
print('Categorical features:', category_)

Numerical features: ['instructor_rate', 'enrollment', 'duration']
Categorical features: ['specify', 'language', 'level', 'general', 'offered by']


### **3.2. Split data**
Split data into 3 datasets: `Training dataset`, `Validation dataset` and `Testing dataset`. We’ll perform splitting on the following ratio 80-20.

In [10]:
# Define the constant variable random_state
random_state = 2112

# Select features and target variable
X = data_[specificities]
y = data_[target]

# Split data on the following ratio 80-20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=random_state)

### **3.3. Initialize transformer**

In [11]:
# Create transformer for numerical and categorical features by using Pipeline
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, numeracy_),
                                               ('cat', cat_transformer, category_)])

### **3.4. Search hyperparameter for available data fitting model**

In [29]:
# Use SVR for regression
pipe_svm = Pipeline([('preprocessor', preprocessor),
                      ('regressor', SVR())])

# Fine-tuning hyperparameters
param_kernel = ['rbf', 'sigmoid']
param_C = [0.01, 0.1, 1.0, 10.0, 100.0]
param_gamma = [0.001, 0.01, 0.1, 1.0]
param_grid = [{'regressor__C': param_C,
               'regressor__kernel': param_kernel,
               'regressor__gamma': param_gamma}]

# Use a regression-specific scoring metric
gs_spe = GridSearchCV(
    estimator=pipe_svm,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    cv=5)

gs_spe = gs_spe.fit(X_train, y_train)

In [13]:
# Use SVR for regression
pipe_svm = Pipeline([('preprocessor', preprocessor),
                      ('regressor', SVR())])

# Fine-tuning hyperparameters
param_kernel = ['linear', 'rbf', 'sigmoid', 'poly']
param_C = [0.01, 0.1, 1.0, 10.0, 100.0]
param_grid = [{'regressor__C': param_C,
               'regressor__kernel': param_kernel}]

# Use a regression-specific scoring metric
gs = GridSearchCV(
    estimator=pipe_svm,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5)

gs = gs.fit(X_train, y_train)

In [19]:
print('[SVR: grid search]')
print('Validation MSE: %.3f' % -gs.best_score_)  # Print as positive MSE # Common regression metric
print(gs.best_params_)

[SVR: grid search]
Validation MSE: 0.034
{'regressor__C': 0.1, 'regressor__kernel': 'linear'}


In [30]:
print('[SVR: grid search]')
print('Validation MSE: %.3f' % -gs.best_score_)  # Print as positive MSE # Common regression metric
print(gs_spe.best_params_)

[SVR: grid search]
Validation MSE: 0.034
{'regressor__C': 10.0, 'regressor__gamma': 0.01, 'regressor__kernel': 'rbf'}


In [26]:
gs.cv_results_

make_scorer(mean_squared_error, greater_is_better=False)

In [16]:

# # Fit and transform the training data using the preprocessor
# pd.DataFrame(preprocessor.fit_transform(X_train, y_train))

# # # Transform the validation and test data using the preprocessor
# # X_val_preprocessed = preprocessor.transform(X_val)
# # X_test_preprocessed = preprocessor.transform(X_test)

In [43]:
gs_spe.cv_results_

{'mean_fit_time': array([0.66329122, 0.86683869, 0.99862118, 1.06056557, 0.83785782,
        1.04285865, 0.91792021, 2.27676663, 0.82146668, 0.89081507,
        0.65052385, 0.70768952, 0.67044249, 1.5975112 , 0.87517056,
        2.16418986, 0.63090892, 0.68541198, 0.68714447, 0.8289794 ,
        0.81406374, 1.89202385, 0.93075609, 2.09376636, 0.70500278,
        0.77734528, 0.81964135, 1.2024528 , 1.13399301, 1.86927748,
        0.9658946 , 2.02928643, 0.95131965, 0.96316361, 2.08981991,
        1.96806955, 1.77055855, 2.0182971 , 1.17355103, 2.0859479 ]),
 'std_fit_time': array([0.05291069, 0.20147992, 0.07681086, 0.09374724, 0.10644811,
        0.10919248, 0.03046209, 0.19950874, 0.08055589, 0.07590743,
        0.0470283 , 0.01103903, 0.03232036, 0.12324772, 0.02799437,
        0.17463483, 0.01779965, 0.01952029, 0.08511775, 0.10568403,
        0.04243538, 0.16043332, 0.02970232, 0.10964079, 0.05917975,
        0.14076799, 0.03877578, 0.03207048, 0.06457139, 0.09723054,
        0.073

In [63]:
# Get scoring values of gs_spe
pd.DataFrame({'mean_test':['{:f}'.format(item) for item in -gs_spe.cv_results_['mean_test_score'].round(4)],
              'mean_train':['{:f}'.format(item) for item in -gs_spe.cv_results_['mean_train_score'].round(4)],
              'ranking':gs_spe.cv_results_['rank_test_score']},
             index=gs_spe.cv_results_['params'])

Unnamed: 0,mean_test,mean_train,ranking
"{'regressor__C': 0.01, 'regressor__gamma': 0.001, 'regressor__kernel': 'rbf'}",0.0579,0.0579,27
"{'regressor__C': 0.01, 'regressor__gamma': 0.001, 'regressor__kernel': 'sigmoid'}",0.061,0.061,28
"{'regressor__C': 0.01, 'regressor__gamma': 0.01, 'regressor__kernel': 'rbf'}",0.0417,0.0415,16
"{'regressor__C': 0.01, 'regressor__gamma': 0.01, 'regressor__kernel': 'sigmoid'}",0.0442,0.0441,19
"{'regressor__C': 0.01, 'regressor__gamma': 0.1, 'regressor__kernel': 'rbf'}",0.042,0.041,17
"{'regressor__C': 0.01, 'regressor__gamma': 0.1, 'regressor__kernel': 'sigmoid'}",0.0506,0.0504,22
"{'regressor__C': 0.01, 'regressor__gamma': 1.0, 'regressor__kernel': 'rbf'}",0.0623,0.0599,29
"{'regressor__C': 0.01, 'regressor__gamma': 1.0, 'regressor__kernel': 'sigmoid'}",2.5029,2.5842,30
"{'regressor__C': 0.1, 'regressor__gamma': 0.001, 'regressor__kernel': 'rbf'}",0.0396,0.0394,14
"{'regressor__C': 0.1, 'regressor__gamma': 0.001, 'regressor__kernel': 'sigmoid'}",0.0441,0.044,18


In [61]:
# Get scoring values of gs
pd.DataFrame({'mean_test':['{:f}'.format(item) for item in -gs.cv_results_['mean_test_score'].round(4)],
              'ranking':gs.cv_results_['rank_test_score']},
             index=gs.cv_results_['params'])

Unnamed: 0,mean_test,ranking
"{'regressor__C': 0.01, 'regressor__kernel': 'linear'}",0.0343,2
"{'regressor__C': 0.01, 'regressor__kernel': 'rbf'}",0.0431,11
"{'regressor__C': 0.01, 'regressor__kernel': 'sigmoid'}",0.0711,14
"{'regressor__C': 0.01, 'regressor__kernel': 'poly'}",0.0408,9
"{'regressor__C': 0.1, 'regressor__kernel': 'linear'}",0.0342,1
"{'regressor__C': 0.1, 'regressor__kernel': 'rbf'}",0.0351,8
"{'regressor__C': 0.1, 'regressor__kernel': 'sigmoid'}",10.3176,17
"{'regressor__C': 0.1, 'regressor__kernel': 'poly'}",0.0349,7
"{'regressor__C': 1.0, 'regressor__kernel': 'linear'}",0.0347,3
"{'regressor__C': 1.0, 'regressor__kernel': 'rbf'}",0.0348,6


In [48]:
print("Grid scores on development set:")

means = gs_spe.cv_results_['mean_test_score']
stds = gs_spe.cv_results_['std_test_score']
i = 0
for mean, std, params in zip(means, stds, gs_spe.cv_results_['params']):
    print("[%d]: %0.4f (+/-%0.03f) for %r"
          % (i, -mean, std * 2, params))
    i += 1

Grid scores on development set:
[0]: 0.0579 (+/-0.008) for {'regressor__C': 0.01, 'regressor__gamma': 0.001, 'regressor__kernel': 'rbf'}
[1]: 0.0610 (+/-0.008) for {'regressor__C': 0.01, 'regressor__gamma': 0.001, 'regressor__kernel': 'sigmoid'}
[2]: 0.0417 (+/-0.006) for {'regressor__C': 0.01, 'regressor__gamma': 0.01, 'regressor__kernel': 'rbf'}
[3]: 0.0442 (+/-0.005) for {'regressor__C': 0.01, 'regressor__gamma': 0.01, 'regressor__kernel': 'sigmoid'}
[4]: 0.0420 (+/-0.008) for {'regressor__C': 0.01, 'regressor__gamma': 0.1, 'regressor__kernel': 'rbf'}
[5]: 0.0506 (+/-0.007) for {'regressor__C': 0.01, 'regressor__gamma': 0.1, 'regressor__kernel': 'sigmoid'}
[6]: 0.0623 (+/-0.009) for {'regressor__C': 0.01, 'regressor__gamma': 1.0, 'regressor__kernel': 'rbf'}
[7]: 2.5029 (+/-0.381) for {'regressor__C': 0.01, 'regressor__gamma': 1.0, 'regressor__kernel': 'sigmoid'}
[8]: 0.0396 (+/-0.005) for {'regressor__C': 0.1, 'regressor__gamma': 0.001, 'regressor__kernel': 'rbf'}
[9]: 0.0441 (+/-0.