# **Support vector machine sigmoid kernel (SVM with sigmoid)**

## **1. Import necessary libraries**

In [123]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import utils
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
import warnings

warnings.filterwarnings('ignore')

## **2. Insert and preprocess data**

### **2.1. Load data**
Load our data from path `../Data/processed_data.csv`.

In [51]:
# Load our data from path `../Data/processed_data.csv`
courses_df = pd.read_csv('../Data/processed_data.csv', sep=',',engine='python', encoding='utf-8')
courses_df.sample(5)

Unnamed: 0,name,general,specify,enrollment,language,rating,level,duration,instructor,instructor_rate,offered by
5661,A Voice of Their Own. Women's Spirituality in ...,Arts and Humanities,History,14395,English,4.4,Beginner,11,"Blanca Garí, Delfi I. Nieto-Isabel, Núria Jornet",4.0,Universitat de Barcelona
3493,Machine Translation,Data Science,Machine Learning,14722,English,4.5,Intermediate,27,"Jan Niehues, Alexander Waibel",4.3,Karlsruhe Institute of Technology
40,3-Axis Machining with Autodesk Fusion 360,Physical Science and Engineering,Mechanical Engineering,8112,English,4.9,Advanced,27,Autodesk,4.9,Autodesk
128,Administración de sistemas y servicios de infr...,Information Technology,Support and Operations,33820,Spanish,4.9,Beginner,26,"Google Career Certificates, Onlea support",4.8,Google
5197,Teaching Character and Creating Positive Class...,Social Sciences,Education,81350,English,4.8,Other,10,Dave Levin,4.8,Relay Graduate School of Education


### **2.2. Preprocess data**

- The `name` and `instructor` features needs to be removed because it is not useful in training the model.

In [52]:
data_ = courses_df.copy().drop(columns=['name', 'instructor'])
data_.sample(5)

Unnamed: 0,general,specify,enrollment,language,rating,level,duration,instructor_rate,offered by
5288,Personal Development,Personal Development,4613,English,4.9,Other,19,4.9,Yale University
3217,Computer Science,Mobile and Web Development,197353,English,4.7,Beginner,9,4.8,University of Michigan
1160,Arts and Humanities,Music and Art,22398,English,4.6,Other,11,4.5,The University of North Carolina at Chapel Hill
4509,Health,Health Informatics,6878,English,4.7,Beginner,13,4.8,Emory University
1063,Data Science,Probability and Statistics,3844,English,3.7,Beginner,12,3.5,Databricks


- Explore missing values in variables:
    + View summary of dataset.

In [53]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5718 entries, 0 to 5717
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   general          5717 non-null   object 
 1   specify          5717 non-null   object 
 2   enrollment       5718 non-null   int64  
 3   language         5718 non-null   object 
 4   rating           5718 non-null   float64
 5   level            5718 non-null   object 
 6   duration         5718 non-null   int64  
 7   instructor_rate  5718 non-null   float64
 8   offered by       5703 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 402.2+ KB


In [54]:
print('Number missing values in each column:\n',data_.isnull().sum())

Number missing values in each column:
 general             1
specify             1
enrollment          0
language            0
rating              0
level               0
duration            0
instructor_rate     0
offered by         15
dtype: int64


- Explore missing values in variables:
    + Drop rows with missing values: Because the number of missing values each variable is insignificant, we will remove rows containing missing data.

In [55]:
# Drop rows with missing values
data_.dropna(inplace=True)
print('Number missing values in each column:\n',data_.isnull().sum())

Number missing values in each column:
 general            0
specify            0
enrollment         0
language           0
rating             0
level              0
duration           0
instructor_rate    0
offered by         0
dtype: int64


## **3. Prepare for training model**

### **3.1. Define kind of features**

- Define selection and target features to prepare data for training model.

In [56]:
# Define selection and target features to prepare data for training model
target = ['rating']
specificities = list(set(data_.columns) - set(target))

- Define numerical and categorical features to transformer.

In [57]:
# Define numerical and categorical features
features = list(data_.columns)
numeracy_ = list(set(features) - set(courses_df.select_dtypes(include=['object']).columns))
category_ = list(set(features) - set(courses_df.select_dtypes(exclude=['object']).columns))
print('Numerical features:', numeracy_)
print('Categorical features:', category_)

Numerical features: ['instructor_rate', 'duration', 'enrollment', 'rating']
Categorical features: ['general', 'language', 'specify', 'offered by', 'level']


### **3.2. Split data**
Split data into 3 datasets: `Training dataset`, `Validation dataset` and `Testing dataset`. We’ll perform splitting on the following ratio 70-20-10.

In [58]:
# Define the constant variable random_state
random_state = 21

# Split data on the following ratio 70-20-10
data_train, data_temp = train_test_split(data_, test_size=0.3,random_state=random_state)
data_valid, data_test = train_test_split(data_temp, test_size=2/3,random_state=random_state)
data_train.head(5)

Unnamed: 0,general,specify,enrollment,language,rating,level,duration,instructor_rate,offered by
137,Computer Science,Algorithms,78370,English,4.6,Advanced,27,4.5,University of California San Diego
4886,Social Sciences,Governance and Society,5183,English,4.8,Other,10,4.9,Columbia University
3625,Physical Science and Engineering,Mechanical Engineering,97158,English,4.7,Other,8,4.7,"University of California, Davis"
3957,Business,Marketing,49816,English,4.8,Other,8,4.7,Northwestern University
796,Health,Basic Science,56518,English,4.9,Beginner,5,4.9,Johns Hopkins University


### **3.3. Initialize transformer**

In [87]:
# Create transformer for numerical and categorical features by using Pipeline
num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeracy_),
        ('cat', cat_transformer, category_)
    ])

# Keep header for dataframes after preprocessor
def keep_header_preprocessor(data: np.ndarray, preprocessor:ColumnTransformer=preprocessor, numeric_features:list=numeracy_) -> pd.DataFrame:
    enc_cat_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out()
    labels = np.concatenate([numeric_features, enc_cat_features])
    return pd.DataFrame(data, columns=labels)


# # Fit and transform the training data using the preprocessor
# data_train_preprocessed = keep_header_preprocessor(preprocessor.fit_transform(data_train))

# # Transform the validation and test data using the preprocessor
# data_valid_preprocessed = keep_header_preprocessor(preprocessor.transform(data_valid))
# data_test_preprocessed = keep_header_preprocessor(preprocessor.transform(data_test))

In [88]:
# data_test_preprocessed

Unnamed: 0,instructor_rate,duration,enrollment,rating,general_Arts and Humanities,general_Business,general_Computer Science,general_Data Science,general_Health,general_Information Technology,...,offered by_École Polytechnique,offered by_École Polytechnique Fédérale de Lausanne,offered by_École des Ponts ParisTech,offered by_École normale supérieure,offered by_上海戏剧学院,offered by_真格基金,level_Advanced,level_Beginner,level_Intermediate,level_Other
0,0.849162,-0.734558,-0.433203,-0.192261,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.085614,-0.639116,-0.282779,0.636821,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.085614,1.174299,-0.316130,1.051362,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.823254,0.124427,-0.022837,-1.021343,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.085614,-0.543673,-0.372278,-0.192261,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,-0.296159,-1.402658,-0.459687,1.051362,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1137,-1.441480,-0.830001,-0.286077,-0.606802,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1138,0.085614,-0.257344,0.143361,-0.192261,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1139,1.230935,0.410756,-0.461011,-0.192261,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [61]:
# # One-hot encoding for categorical features
# from sklearn.preprocessing import OneHotEncoder

# # Create an instance of the OneHotEncoder
# encoder = OneHotEncoder()

# # Fit and transform the categorical features
# X_encoded_array = encoder.fit_transform(X_train[category_]).toarray().astype(int)

# # Create a DataFrame with the encoded features
# X_encoded_df = pd.DataFrame(X_encoded_array, columns=encoder.get_feature_names_out(category_), index=X_train.index).astype('category')

# # Concatenate the encoded features with the numerical features
# X_encoded_combined = pd.concat([X_train[numeracy_], X_encoded_df], axis=1)
# X_encoded_combined.shape

KeyError: "['rating'] not in index"

### **3.4. Search hyperparameter for available data fitting model**

In [120]:
y_train = data_train_preprocessed[target].values.flatten()
X_train = data_train_preprocessed.drop(columns=target)
y_train

array([-0.19226144,  0.63682059,  0.22227957, ...,  1.0513616 ,
        0.63682059,  1.0513616 ])

In [122]:
# 1. Tạo bộ tham số
y_train = data_train_preprocessed[target].values.flatten()
X_train = data_train_preprocessed.drop(columns=target)

# 2. Xác định không gian tìm kiếm của các tham số, bao gồm cả l2_regularization
parameters = param_grid = {
    'C': [0.1,1, 10, 100],
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# 3. Sử dụng GridSearchCV để tìm bộ hyperparameter phù hợp
cv = GridSearchCV(SVC(), parameters, cv=5);
cv.fit(X_train, y_train);

# 4. Lấy bộ hyperparameter phù hợp
best_parameters = cv.best_params_;
print(best_parameters)

ValueError: 
All the 320 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/svm/_base.py", line 199, in fit
    y = self._validate_targets(y)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/svm/_base.py", line 743, in _validate_targets
    check_classification_targets(y)
  File "/home/dagngyen5462/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 216, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
