Support vector Regression (SVR)

In [9]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [11]:
# Import the dataset
math = pd.read_csv("./student-mat.csv", sep=';', header=0)
por = pd.read_csv("./student-por.csv", sep=';', header=0)

In [13]:
#Feature selection based on the correlation of the dependent variables with the independent variables

from scipy.stats import pearsonr, pointbiserialr
from sklearn.preprocessing import LabelEncoder

# Specify the target column
target_column = 'G3'  # Target column name

# Separate numerical and categorical columns
numerical_features = por.select_dtypes(include=['number']).columns
categorical_features = por.select_dtypes(include=['object', 'category']).columns

# Dictionary to store correlation results
correlation_results = {}

# Handle numerical columns
for col in numerical_features:
    if col != target_column:
        correlation, p_value = pearsonr(math[col], math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Handle categorical columns
for col in categorical_features:
    if col != target_column:
        # Encode categorical values
        encoded_col = LabelEncoder().fit_transform(math[col])
        correlation, p_value = pointbiserialr(encoded_col, math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Sort features by correlation
sorted_features = sorted(correlation_results.items(), key=lambda x: x[1]['correlation'], reverse=True)

# Display top features with p-values
print("Feature Correlations and P-values with Target Variable:")
for feature, stats in sorted_features:
    print(f"{feature}: Correlation = {stats['correlation']:.2f}, P-value = {stats['p_value']:.3e}")


Feature Correlations and P-values with Target Variable:
G2: Correlation = 0.90, P-value = 7.626e-148
G1: Correlation = 0.80, P-value = 9.001e-90
failures: Correlation = 0.36, P-value = 1.466e-13
Medu: Correlation = 0.22, P-value = 1.336e-05
higher: Correlation = 0.18, P-value = 2.668e-04
age: Correlation = 0.16, P-value = 1.271e-03
Fedu: Correlation = 0.15, P-value = 2.380e-03
goout: Correlation = 0.13, P-value = 8.229e-03
romantic: Correlation = 0.13, P-value = 9.713e-03
reason: Correlation = 0.12, P-value = 1.527e-02
traveltime: Correlation = 0.12, P-value = 1.987e-02
address: Correlation = 0.11, P-value = 3.563e-02
sex: Correlation = 0.10, P-value = 3.987e-02
Mjob: Correlation = 0.10, P-value = 4.259e-02
paid: Correlation = 0.10, P-value = 4.277e-02
internet: Correlation = 0.10, P-value = 5.048e-02
studytime: Correlation = 0.10, P-value = 5.206e-02
schoolsup: Correlation = 0.08, P-value = 1.004e-01
famsize: Correlation = 0.08, P-value = 1.062e-01
guardian: Correlation = 0.07, P-valu

# SVR with all the features with splitting into test set and train set

In [15]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Define the target column and features
target_column = "G3"
X = por.drop(columns=[target_column])
y = por[target_column]

# Apply the scaler only to numerical columns
numerical_cols = X.select_dtypes(include=['number']).columns
sc_X = StandardScaler()
X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])

# Encode categorical columns (if any)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])

# Scale the target column
sc_y = StandardScaler()
y = sc_y.fit_transform(y.values.reshape(-1, 1)).flatten()  # Flatten back to 1D array

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize Support Vector Regression (SVR) with RBF kernel
regressor = SVR(kernel='linear') # linear kernel was better than rbf in this case: 0.71 vs 0.78

# Fit the model
regressor.fit(X_train, y_train)


# Predict using the test data
y_pred = regressor.predict(X_test)


# Rescale predictions and actual values back to original scale
y_pred_original = sc_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_test_original = sc_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

np.set_printoptions(suppress=True, precision=2)

# Display predictions vs actual values
comparison = np.column_stack((y_pred_original, y_test_original))
print("Predictions vs Actual:")
print(comparison)

# Evaluating the model performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)


Predictions vs Actual:
[[ 7.7   8.  ]
 [14.54 15.  ]
 [16.22 16.  ]
 [10.5  10.  ]
 [ 9.45 10.  ]
 [12.58 12.  ]
 [12.98 13.  ]
 [18.31 17.  ]
 [12.11 12.  ]
 [11.41 12.  ]
 [10.5  11.  ]
 [10.64 10.  ]
 [13.2  13.  ]
 [ 8.69  8.  ]
 [18.19 18.  ]
 [12.44 12.  ]
 [12.75 13.  ]
 [12.55 13.  ]
 [10.68 10.  ]
 [10.34 10.  ]
 [12.16 12.  ]
 [10.53 10.  ]
 [17.38 17.  ]
 [13.37 15.  ]
 [12.59 14.  ]
 [ 1.53  0.  ]
 [12.71 12.  ]
 [13.52 14.  ]
 [11.36 12.  ]
 [12.69  9.  ]
 [13.61 13.  ]
 [16.28 16.  ]
 [13.25 13.  ]
 [16.15 16.  ]
 [12.41 12.  ]
 [ 9.21 10.  ]
 [ 9.52 10.  ]
 [11.38 11.  ]
 [12.54 13.  ]
 [11.41 10.  ]
 [15.46 15.  ]
 [17.37 18.  ]
 [11.5  11.  ]
 [13.66 13.  ]
 [12.54 13.  ]
 [ 9.71 10.  ]
 [12.67 14.  ]
 [ 9.2   9.  ]
 [11.45 11.  ]
 [ 9.56 10.  ]
 [ 6.21  8.  ]
 [14.71 17.  ]
 [ 9.43  9.  ]
 [12.29 13.  ]
 [ 7.28  8.  ]
 [11.12 11.  ]
 [11.65 12.  ]
 [11.02 12.  ]
 [14.61 15.  ]
 [14.64 15.  ]
 [13.4  13.  ]
 [ 7.61  7.  ]
 [11.63 12.  ]
 [ 9.3  10.  ]
 [13.48 12.  ]
 [

0.8775693192611538

r2 is 0.7878634076745303 for math, 
r2 is 0.8775693192610926 for portuguese

# SVR with splitting the data into test and train set and with feature selection

In [19]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.metrics import r2_score

# Define the target column and features
target_column = "G3"
# Select only the specified columns from the dataset
selected_columns = ['Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 
                    'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']
X = math[selected_columns]  # Use only the selected columns as features
y = math[target_column]

# Apply the scaler only to numerical columns
numerical_cols = X.select_dtypes(include=['number']).columns
sc_X = StandardScaler()
X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])

# Encode categorical columns (if any)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])

# Scale the target column
sc_y = StandardScaler()
y = sc_y.fit_transform(y.values.reshape(-1, 1)).flatten()  # Flatten back to 1D array

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize Support Vector Regression (SVR) with linear kernel
regressor = SVR(kernel='linear')  # Linear kernel was better than RBF in this case

# Fit the model
regressor.fit(X_train, y_train)

# Predict using the test data
y_pred = regressor.predict(X_test)

# Rescale predictions and actual values back to original scale
y_pred_original = sc_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_test_original = sc_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

np.set_printoptions(suppress=True, precision=2)

# Display predictions vs actual values
comparison = np.column_stack((y_pred_original, y_test_original))
print("Predictions vs Actual:")
print(comparison)

# Evaluate the model performance
r2 = r2_score(y_test_original, y_pred_original)
print(f"R² Score: {r2:.4f}")


Predictions vs Actual:
[[14.58 14.  ]
 [11.39 10.  ]
 [10.28  9.  ]
 [15.41 15.  ]
 [16.45 16.  ]
 [12.37 12.  ]
 [13.31 14.  ]
 [11.28 11.  ]
 [ 9.16  9.  ]
 [15.31 15.  ]
 [ 8.16 10.  ]
 [12.67 11.  ]
 [11.3  10.  ]
 [ 8.26  8.  ]
 [15.36 15.  ]
 [14.44 14.  ]
 [13.35 12.  ]
 [10.17  0.  ]
 [ 9.43 10.  ]
 [15.36 16.  ]
 [ 7.29  0.  ]
 [ 4.38  0.  ]
 [ 7.26  8.  ]
 [15.26 16.  ]
 [12.23 12.  ]
 [16.46 15.  ]
 [10.29 10.  ]
 [19.55 19.  ]
 [ 0.43  0.  ]
 [13.4  14.  ]
 [-0.44  0.  ]
 [ 7.43 10.  ]
 [ 8.24  8.  ]
 [ 5.87  4.  ]
 [13.43 13.  ]
 [10.3  10.  ]
 [13.17 13.  ]
 [18.67 18.  ]
 [ 9.25  8.  ]
 [12.3  11.  ]
 [ 5.31  6.  ]
 [18.54 18.  ]
 [ 8.13  8.  ]
 [ 8.89  8.  ]
 [10.35 11.  ]
 [-0.47  0.  ]
 [12.29 13.  ]
 [ 4.92  0.  ]
 [12.34 13.  ]
 [ 7.24  7.  ]
 [ 9.37 10.  ]
 [18.38 18.  ]
 [11.12 10.  ]
 [ 8.36  9.  ]
 [12.4  11.  ]
 [ 8.26  6.  ]
 [ 9.47  9.  ]
 [13.27 13.  ]
 [15.36 15.  ]
 [14.31 14.  ]
 [13.36 14.  ]
 [13.5  15.  ]
 [14.49 14.  ]
 [15.63 15.  ]
 [ 9.12  0.  ]
 [

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = LabelEncoder().fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = LabelEncoder().fit_transform(X[col])
A value is trying to be set on a copy of a slice from a

In [132]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Copy the DataFrame to avoid working on the original
df_encoded = por.copy()

# Define numerical and categorical features
numerical_features = ['Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2']
categorical_features = por.select_dtypes(include=['object', 'category']).columns

# Label encode categorical features
for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Define the target column and features
target_column = "G3"
X = df_encoded[['Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'sex', 
                'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']].copy()
y = df_encoded[target_column].copy()

# Scale the numerical features
sc_X = StandardScaler()
X.loc[:, numerical_features] = sc_X.fit_transform(X[numerical_features])  # Use `.loc` for explicit indexing

# Scale the target dependent variable
sc_y = StandardScaler()
y = sc_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize Support Vector Regression (SVR) with linear kernel
regressor = SVR(kernel='linear')

# Fit the model
regressor.fit(X_train, y_train)

# Predict using the test data
y_pred = regressor.predict(X_test)

# Rescale predictions and actual values back to original scale
y_pred_original = sc_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_test_original = sc_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Set NumPy print options to suppress scientific notation
np.set_printoptions(suppress=True, precision=2)

# Display predictions vs actual values
comparison = np.column_stack((y_pred_original, y_test_original))
print("Predictions vs Actual (in fixed-point notation):")
print(comparison)

# Evaluate the model performance
r2 = r2_score(y_test_original, y_pred_original)
mse = mean_squared_error(y_test_original, y_pred_original)
mae = mean_absolute_error(y_test_original, y_pred_original)

print(f"R² Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")



Predictions vs Actual (in fixed-point notation):
[[ 7.73  8.  ]
 [14.43 15.  ]
 [16.12 16.  ]
 [10.47 10.  ]
 [ 9.38 10.  ]
 [12.69 12.  ]
 [12.96 13.  ]
 [18.41 17.  ]
 [12.23 12.  ]
 [11.61 12.  ]
 [10.67 11.  ]
 [10.48 10.  ]
 [13.46 13.  ]
 [ 8.69  8.  ]
 [18.23 18.  ]
 [12.37 12.  ]
 [12.73 13.  ]
 [12.46 13.  ]
 [10.56 10.  ]
 [10.35 10.  ]
 [12.36 12.  ]
 [10.58 10.  ]
 [17.34 17.  ]
 [13.3  15.  ]
 [12.55 14.  ]
 [ 1.33  0.  ]
 [12.4  12.  ]
 [13.42 14.  ]
 [11.51 12.  ]
 [12.59  9.  ]
 [13.56 13.  ]
 [16.61 16.  ]
 [13.47 13.  ]
 [16.42 16.  ]
 [12.55 12.  ]
 [ 9.11 10.  ]
 [ 9.45 10.  ]
 [11.43 11.  ]
 [12.68 13.  ]
 [11.49 10.  ]
 [15.43 15.  ]
 [17.33 18.  ]
 [11.56 11.  ]
 [13.4  13.  ]
 [12.49 13.  ]
 [ 9.69 10.  ]
 [12.73 14.  ]
 [ 9.5   9.  ]
 [11.57 11.  ]
 [ 9.4  10.  ]
 [ 6.05  8.  ]
 [14.62 17.  ]
 [ 9.39  9.  ]
 [12.36 13.  ]
 [ 7.4   8.  ]
 [11.08 11.  ]
 [11.69 12.  ]
 [11.23 12.  ]
 [14.53 15.  ]
 [14.47 15.  ]
 [13.34 13.  ]
 [ 7.97  7.  ]
 [11.59 12.  ]
 [ 9.1

  1.31  1.31 -0.45  1.31  1.31  0.43  0.43  1.31  1.31  1.31  1.31 -0.45
 -0.45 -0.45 -0.45  1.31  0.43  1.31  1.31  1.31  1.31  0.43  0.43 -0.45
  1.31  1.31  0.43 -0.45 -0.45  1.31  1.31 -0.45 -0.45  1.31  0.43  1.31
  1.31  1.31 -0.45  1.31  1.31  1.31  0.43 -0.45  1.31  1.31 -1.34  1.31
  1.31 -1.34 -1.34  1.31  1.31  1.31  1.31  0.43 -0.45  0.43  0.43  1.31
 -1.34  0.43  0.43  1.31  1.31 -0.45 -0.45  0.43 -0.45 -0.45  0.43 -0.45
 -1.34  1.31 -0.45  1.31 -0.45  1.31  0.43  1.31  0.43  1.31 -0.45 -1.34
  1.31 -0.45  1.31  1.31  1.31  1.31  1.31  0.43  0.43  0.43 -0.45  0.43
  1.31  1.31  1.31  0.43 -0.45  1.31 -0.45  1.31  1.31  0.43 -1.34  0.43
 -1.34 -0.45 -0.45  1.31 -0.45  0.43  0.43 -2.22  1.31 -0.45  0.43 -0.45
 -0.45 -1.34 -0.45  0.43 -0.45 -0.45  0.43  1.31  0.43 -1.34  0.43  0.43
 -1.34  0.43 -1.34  1.31 -1.34  1.31  0.43  1.31 -0.45  1.31 -1.34 -0.45
 -1.34 -0.45  0.43 -1.34 -1.34  1.31 -0.45 -1.34 -0.45  0.43  0.43  1.31
 -0.45 -1.34 -1.34 -0.45  0.43 -1.34 -0.45  0.43 -1

# K-Fold cross validation and Hyperparameter tuning

In [1]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.metrics import r2_score

# Define the target column and features
target_column = "G3"
X = por.drop(columns=[target_column])
y = por[target_column]

# Apply the scaler only to numerical columns
numerical_cols = X.select_dtypes(include=['number']).columns
sc_X = StandardScaler()
X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])

# Encode categorical columns (if any)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])

# Scale the target column
sc_y = StandardScaler()
y = sc_y.fit_transform(y.values.reshape(-1, 1)).flatten()  # Flatten back to 1D array

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize Support Vector Regression (SVR) with a linear kernel
regressor = SVR(kernel='linear')

# 1. K-Fold Cross-Validation (5-fold)
cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='r2')  # Using R² score for regression
print(f"Cross-Validation R² scores for each fold: {cv_scores}")
print(f"Average Cross-Validation R² score: {cv_scores.mean()}")

# 2. Hyperparameter Tuning using RandomizedSearchCV
# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'C': np.logspace(-3, 3, 7),  # Regularization parameter
    'epsilon': np.linspace(0.01, 0.1, 10),  # Margin of tolerance
    'kernel': ['linear', 'rbf'],  # Kernel type: linear or RBF
}

# Perform RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=regressor, param_distributions=param_dist, n_iter=10, cv=5, scoring='r2', random_state=0)
random_search.fit(X_train, y_train)

# Best hyperparameters and best R² score
print(f"Best Hyperparameters from RandomizedSearchCV: {random_search.best_params_}")
print(f"Best R² score from RandomizedSearchCV: {random_search.best_score_}")

# 3. Fit the model with the best hyperparameters found by RandomizedSearchCV
best_regressor = random_search.best_estimator_
best_regressor.fit(X_train, y_train)

# Predict using the test data
y_pred = best_regressor.predict(X_test)

# Rescale predictions and actual values back to the original scale
y_pred_original = sc_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
y_test_original = sc_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Display predictions vs actual values
comparison = np.column_stack((y_pred_original, y_test_original))
print("Predictions vs Actual:")
print(comparison)

# Evaluating the model performance on the test set
r2_test_score = r2_score(y_test, y_pred)
print(f"R² score on the test set: {r2_test_score}")


NameError: name 'por' is not defined

SVR performance (r2): 

with all features 0.7878634076745303, 0.8775693192610926 por
with selected features 0.7736 math, 0.8737 por
with hyperparameter tuning  0.782517153964617,  0.8802525595059679 for por