# **Support vector machine sigmoid kernel (SVM with sigmoid)**

## **1. Import necessary libraries**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import utils
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings

warnings.filterwarnings('ignore')

## **2. Insert and preprocess data**

### **2.1. Load data**
Load our data from path `../Data/processed_data.csv`.

In [3]:
# Load our data from path `../Data/processed_data.csv`
courses_df = pd.read_csv('../Data/processed_data.csv', sep=',',engine='python', encoding='utf-8')
courses_df.sample(5)

Unnamed: 0,name,general,specify,enrollment,language,rating,level,duration,instructor,instructor_rate,offered by
4545,Interfacing with the Raspberry Pi,Physical Science and Engineering,Electrical Engineering,78641,English,4.7,Other,11,Ian Harris,4.8,"University of California, Irvine"
2894,Intermediate Object-Oriented Programming for U...,Computer Science,Software Development,2235,English,4.8,Intermediate,15,"Dr. Tim ""Dr. T"" Chamillard",4.7,University of Colorado System
14,"Fundamentos de Negociação com o 10,000 Women d...",Business,Entrepreneurship,13926,Portuguese (Brazilian),4.9,Other,3,"Mori Taheripour, Goldman Sachs 10,000 Women, E...",4.8,Goldman Sachs
4283,Principles of Computing (Part 1),Computer Science,Software Development,32971,English,4.7,Intermediate,18,"Joe Warren, Luay Nakhleh, Scott Rixner",4.9,Rice University
348,Anti-Racism I,Social Sciences,Governance and Society,25395,English,4.6,Beginner,15,"Jennifer, Shawn",4.8,University of Colorado Boulder


### **2.2. Preprocess data**

- The `name` and `instructor` features needs to be removed because it is not useful in training the model.

In [4]:
data_ = courses_df.copy().drop(columns=['name', 'instructor'])
data_.sample(5)

Unnamed: 0,general,specify,enrollment,language,rating,level,duration,instructor_rate,offered by
360,Computer Science,Mobile and Web Development,7365,Spanish,3.7,Beginner,22,4.7,Universidad Austral
3784,Computer Science,Design and Product,7921,English,4.6,Intermediate,7,4.6,University of Colorado Boulder
1186,Business,Marketing,77833,Spanish,4.9,Intermediate,60,4.9,Universidad de Chile
218,Business,Business Strategy,1994,Russian,4.4,Beginner,6,4.7,DeepLearning.AI
651,Computer Science,Computer Security and Networks,6184,English,4.5,Intermediate,14,4.8,INSEAD


- Explore missing values in variables:
    + View summary of dataset.

In [5]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5718 entries, 0 to 5717
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   general          5717 non-null   object 
 1   specify          5717 non-null   object 
 2   enrollment       5718 non-null   int64  
 3   language         5718 non-null   object 
 4   rating           5718 non-null   float64
 5   level            5718 non-null   object 
 6   duration         5718 non-null   int64  
 7   instructor_rate  5718 non-null   float64
 8   offered by       5703 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 402.2+ KB


In [6]:
print('Number missing values in each column:\n',data_.isnull().sum())

Number missing values in each column:
 general             1
specify             1
enrollment          0
language            0
rating              0
level               0
duration            0
instructor_rate     0
offered by         15
dtype: int64


- Explore missing values in variables:
    + Drop rows with missing values: Because the number of missing values each variable is insignificant, we will remove rows containing missing data.

In [7]:
# Drop rows with missing values
data_.dropna(inplace=True)
print('Number missing values in each column:\n',data_.isnull().sum())

Number missing values in each column:
 general            0
specify            0
enrollment         0
language           0
rating             0
level              0
duration           0
instructor_rate    0
offered by         0
dtype: int64


## **3. Prepare for training model**

### **3.1. Define kind of features**

- Define selection and target features to prepare data for training model.

In [8]:
# Define selection and target features to prepare data for training model
target = ['rating']
specificities = list(set(data_.columns) - set(target))

- Define numerical and categorical features to transformer.

In [9]:
# Define numerical and categorical features
numeracy_ = list(set(specificities) - set(courses_df.select_dtypes(include=['object']).columns))
category_ = list(set(specificities) - set(courses_df.select_dtypes(exclude=['object']).columns))
print('Numerical features:', numeracy_)
print('Categorical features:', category_)

Numerical features: ['enrollment', 'duration', 'instructor_rate']
Categorical features: ['general', 'language', 'offered by', 'specify', 'level']


### **3.2. Split data**
Split data into 3 datasets: `Training dataset`, `Validation dataset` and `Testing dataset`. We’ll perform splitting on the following ratio 70-20-10.

In [10]:
# # Define the constant variable random_state
# random_state = 2112

# # Split data on the following ratio 70-20-10
# data_train, data_temp = train_test_split(data_, test_size=0.3,random_state=random_state)
# data_valid, data_test = train_test_split(data_temp, test_size=2/3,random_state=random_state)
# data_train.head(5)

In [11]:
# Define the constant variable random_state
random_state = 2112

# Select features and target variable
X = data_[specificities]
y = data_[target]

# Split data on the following ratio 70-20-10
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3,random_state=21)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3,random_state=21)

### **3.3. Initialize transformer**

In [12]:
# Create transformer for numerical and categorical features by using Pipeline
num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeracy_),
        ('cat', cat_transformer, category_)
    ])

# Fit and transform the training data using the preprocessor
pd.DataFrame(preprocessor.fit_transform(X_train, y_train))

# # Transform the validation and test data using the preprocessor
# X_val_preprocessed = preprocessor.transform(X_val)
# X_test_preprocessed = preprocessor.transform(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,378,379,380,381,382,383,384,385,386,387
0,0.084276,0.206349,0.848485,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.004554,0.071429,0.969697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.104741,0.055556,0.909091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.053172,0.055556,0.909091,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.060473,0.031746,0.969697,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3986,0.107239,0.182540,0.939394,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3987,0.005749,0.063492,0.909091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3988,0.016630,0.206349,0.969697,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3989,0.006087,0.063492,0.969697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
pd.DataFrame(preprocessor.fit_transform(X_train, y_train)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,378,379,380,381,382,383,384,385,386,387
count,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,...,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0,3991.0
mean,0.040603,0.116637,0.902295,0.060636,0.260586,0.146079,0.110499,0.104235,0.076422,0.025558,...,0.02656,0.00451,0.002756,0.020546,0.06665,0.007767,0.030569,0.519168,0.248309,0.201954
std,0.083917,0.083165,0.079384,0.238692,0.43901,0.353229,0.31355,0.305603,0.265705,0.157831,...,0.160813,0.067014,0.052434,0.141877,0.249446,0.087801,0.172168,0.499695,0.432086,0.401509
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.005125,0.063492,0.878788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.014078,0.103175,0.909091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.037182,0.150794,0.939394,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# # One-hot encoding for categorical features
# from sklearn.preprocessing import OneHotEncoder

# # Create an instance of the OneHotEncoder
# encoder = OneHotEncoder()

# # Fit and transform the categorical features
# X_encoded_array = encoder.fit_transform(X_train[category_]).toarray().astype(int)

# # Create a DataFrame with the encoded features
# X_encoded_df = pd.DataFrame(X_encoded_array, columns=encoder.get_feature_names_out(category_), index=X_train.index).astype('category')

# # Concatenate the encoded features with the numerical features
# X_encoded_combined = pd.concat([X_train[numeracy_], X_encoded_df], axis=1)
# X_encoded_combined.shape

### **3.4. Search hyperparameter for available data fitting model**

In [15]:
# # 1. Tạo model SVM với kernel sigmoid
# modelSVC = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('regressor', SVC())])

# # 2. Xác định không gian tìm kiếm của các tham số, bao gồm cả l2_regularization
# parameters = param_grid = {
#     'C': [0.1,1, 10, 100],
#     'gamma': [1,0.1,0.01,0.001],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
# }

# # 3. Sử dụng GridSearchCV để tìm bộ hyperparameter phù hợp
# cv = GridSearchCV(SVC(), parameters, cv=5);
# cv.fit(pd.DataFrame(preprocessor.fit_transform(X_train, y_train)));

# # 4. Lấy bộ hyperparameter phù hợp
# best_parameters = cv.best_params_;
# print(best_parameters)

In [19]:
pipe_svm = Pipeline([('preprocessor', preprocessor),
                     ('scl', MinMaxScaler()),
                     ('clf', SVC(random_state=random_state))])

param_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0]
param_C = [0.1, 1.0, 10.0, 100.0]

# here you can set parameter for different steps 
# by adding two underlines (__) between step name and parameter name
param_grid = [{'clf__C': param_C, 
               'clf__kernel': ['linear']},
              {'clf__C': param_C, 
               'clf__gamma': param_gamma, 
               'clf__kernel': ['rbf']},
              {'clf__C': param_C, 
               'clf__gamma': param_gamma, 
               'clf__kernel': ['sigmoid']},
              {'clf__C': param_C, 
               'clf__gamma': param_gamma, 
               'clf__kernel': ['poly']}]

# set pipe_svm as the estimator
gs = GridSearchCV(
    estimator = pipe_svm, 
    param_grid = param_grid, 
    scoring = "accuracy",
    cv = 3
)

gs = gs.fit(X_train, y_train)
print('[SVC: grid search]')
print('Validation accuracy: %.3f' % gs.best_score_)
print(gs.best_params_)

ValueError: 
All the 192 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
192 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/svm/_base.py", line 199, in fit
    y = self._validate_targets(y)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/svm/_base.py", line 743, in _validate_targets
    check_classification_targets(y)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 216, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
