Support vector Regression (SVR)

In [14]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [15]:
# Import the dataset
math = pd.read_csv("./student-mat.csv", sep=';', header=0)
por = pd.read_csv("./student-por.csv", sep=';', header=0)

In [16]:
from scipy.stats import pearsonr, pointbiserialr
from sklearn.preprocessing import LabelEncoder

# Specify the target column
target_column = 'G3'  # Target column name

# Separate numerical and categorical columns
numerical_features = math.select_dtypes(include=['number']).columns
categorical_features = math.select_dtypes(include=['object', 'category']).columns

# Dictionary to store correlation results
correlation_results = {}

# Handle numerical columns
for col in numerical_features:
    if col != target_column:
        correlation, p_value = pearsonr(math[col], math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Handle categorical columns
for col in categorical_features:
    if col != target_column:
        # Encode categorical values
        encoded_col = LabelEncoder().fit_transform(math[col])
        correlation, p_value = pointbiserialr(encoded_col, math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Sort features by correlation
sorted_features = sorted(correlation_results.items(), key=lambda x: x[1]['correlation'], reverse=True)

# Display top features with p-values
print("Feature Correlations and P-values with Target Variable:")
for feature, stats in sorted_features:
    print(f"{feature}: Correlation = {stats['correlation']:.2f}, P-value = {stats['p_value']:.3e}")


Feature Correlations and P-values with Target Variable:
G2: Correlation = 0.90, P-value = 7.626e-148
G1: Correlation = 0.80, P-value = 9.001e-90
failures: Correlation = 0.36, P-value = 1.466e-13
Medu: Correlation = 0.22, P-value = 1.336e-05
higher: Correlation = 0.18, P-value = 2.668e-04
age: Correlation = 0.16, P-value = 1.271e-03
Fedu: Correlation = 0.15, P-value = 2.380e-03
goout: Correlation = 0.13, P-value = 8.229e-03
romantic: Correlation = 0.13, P-value = 9.713e-03
reason: Correlation = 0.12, P-value = 1.527e-02
traveltime: Correlation = 0.12, P-value = 1.987e-02
address: Correlation = 0.11, P-value = 3.563e-02
sex: Correlation = 0.10, P-value = 3.987e-02
Mjob: Correlation = 0.10, P-value = 4.259e-02
paid: Correlation = 0.10, P-value = 4.277e-02
internet: Correlation = 0.10, P-value = 5.048e-02
studytime: Correlation = 0.10, P-value = 5.206e-02
schoolsup: Correlation = 0.08, P-value = 1.004e-01
famsize: Correlation = 0.08, P-value = 1.062e-01
guardian: Correlation = 0.07, P-valu

In [17]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode categorical columns (if any)
df_encoded = math.copy()
for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Define the target column and features
X = df_encoded.drop(columns=["G3"])
y = df_encoded[target_column]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize Support Vector Regression (SVR) with linear kernel
regressor = SVR(kernel='linear')

# Fit the model
regressor.fit(X_train, y_train)

# Get coefficients (this will only work if you're using a linear kernel)
importances = abs(regressor.coef_).flatten()  # Flattening in case of multiple dimensions

# Sort features by importance
sorted_idx = importances.argsort()

# Display feature importance
print("Feature Importance from Support Vector Regression (SVR):")
for idx in sorted_idx[::-1]:  # Sorting in descending order
    print(f"{X.columns[idx]}: {importances[idx]:.4f}")

# Predict using the test data
y_pred = regressor.predict(X_test)

# Display predictions vs actual values
np.set_printoptions(precision=2)  # Display only 2 decimals after the column for the numerical values
comparison = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), axis=1)  # Fixed the axis argument
print(comparison)


Feature Importance from Support Vector Regression (SVR):
G2: 0.9909
nursery: 0.0749
famrel: 0.0692
famsize: 0.0583
romantic: 0.0537
famsup: 0.0513
schoolsup: 0.0485
sex: 0.0429
internet: 0.0422
activities: 0.0404
studytime: 0.0381
Walc: 0.0381
Medu: 0.0350
Dalc: 0.0335
Fedu: 0.0258
goout: 0.0234
address: 0.0231
G1: 0.0198
paid: 0.0188
age: 0.0188
guardian: 0.0133
Pstatus: 0.0109
failures: 0.0104
Mjob: 0.0090
health: 0.0085
traveltime: 0.0070
freetime: 0.0068
higher: 0.0060
reason: 0.0052
school: 0.0050
absences: 0.0049
Fjob: 0.0014
[[ 1.41e+01  1.40e+01]
 [ 1.11e+01  1.00e+01]
 [ 9.92e+00  9.00e+00]
 [ 1.53e+01  1.50e+01]
 [ 1.61e+01  1.60e+01]
 [ 1.20e+01  1.20e+01]
 [ 1.28e+01  1.40e+01]
 [ 1.09e+01  1.10e+01]
 [ 8.89e+00  9.00e+00]
 [ 1.50e+01  1.50e+01]
 [ 7.98e+00  1.00e+01]
 [ 1.22e+01  1.10e+01]
 [ 1.11e+01  1.00e+01]
 [ 8.13e+00  8.00e+00]
 [ 1.51e+01  1.50e+01]
 [ 1.42e+01  1.40e+01]
 [ 1.30e+01  1.20e+01]
 [ 9.88e+00  0.00e+00]
 [ 9.10e+00  1.00e+01]
 [ 1.50e+01  1.60e+01]
 [

In [18]:
# Evaluating the model performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7806765535095334

In [19]:
# Select the top features based on importance (e.g., top 5 features)
top_features_idx = importances.argsort()[::-1][:5]  # Top 5 features based on importance

# Filter the dataset to use only the selected features
X_selected_train = X_train.iloc[:, top_features_idx]
X_selected_test = X_test.iloc[:, top_features_idx]

# Train the model with only the selected features
regressor.fit(X_selected_train, y_train)

# Predict using the test data with selected features
y_pred = regressor.predict(X_selected_test)

# Display predictions vs actual values
np.set_printoptions(precision=2)  # Display only 2 decimals after the column for the numerical values
comparison = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), axis=1)
print(comparison)

[[14.03 14.  ]
 [11.03 10.  ]
 [10.03  9.  ]
 [15.17 15.  ]
 [16.03 16.  ]
 [12.03 12.  ]
 [12.97 14.  ]
 [10.9  11.  ]
 [ 9.03  9.  ]
 [15.03 15.  ]
 [ 8.03 10.  ]
 [12.03 11.  ]
 [10.97 10.  ]
 [ 8.1   8.  ]
 [15.1  15.  ]
 [14.1  14.  ]
 [13.03 12.  ]
 [10.03  0.  ]
 [ 9.1  10.  ]
 [15.03 16.  ]
 [ 7.1   0.  ]
 [ 4.97  0.  ]
 [ 7.03  8.  ]
 [15.03 16.  ]
 [12.17 12.  ]
 [15.9  15.  ]
 [10.03 10.  ]
 [19.03 19.  ]
 [ 0.03  0.  ]
 [13.1  14.  ]
 [ 0.1   0.  ]
 [ 8.03 10.  ]
 [ 8.1   8.  ]
 [ 5.97  4.  ]
 [13.03 13.  ]
 [10.1  10.  ]
 [13.17 13.  ]
 [17.97 18.  ]
 [ 8.97  8.  ]
 [12.1  11.  ]
 [ 5.1   6.  ]
 [18.03 18.  ]
 [ 7.9   8.  ]
 [ 9.1   8.  ]
 [10.1  11.  ]
 [ 0.03  0.  ]
 [12.03 13.  ]
 [ 5.03  0.  ]
 [12.1  13.  ]
 [ 8.17  7.  ]
 [ 9.1  10.  ]
 [18.1  18.  ]
 [11.17 10.  ]
 [ 7.97  9.  ]
 [12.03 11.  ]
 [ 8.17  6.  ]
 [ 9.03  9.  ]
 [12.97 13.  ]
 [15.03 15.  ]
 [14.1  14.  ]
 [13.1  14.  ]
 [13.17 15.  ]
 [14.03 14.  ]
 [15.03 15.  ]
 [ 9.03  0.  ]
 [15.17 15.  ]
 [18.03 19

In [21]:
# Evaluating the model performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.77675217920871

In [27]:
# Encode categorical columns (if any)
df_encoded = math.copy()

numerical_features = math.select_dtypes(include=['number']).columns
categorical_features = math.select_dtypes(include=['object', 'category']).columns

for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Define the target column and features
X = df_encoded[['G3', 'Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']]
y = df_encoded[target_column]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize Support Vector Regression (SVR) with linear kernel
regressor = SVR(kernel='linear')

# Fit the model
regressor.fit(X_train, y_train)

# Get coefficients (this will only work if you're using a linear kernel)
importances = abs(regressor.coef_).flatten()  # Flattening in case of multiple dimensions

# Sort features by importance
sorted_idx = importances.argsort()

# Display feature importance
print("Feature Importance from Support Vector Regression (SVR):")
for idx in sorted_idx[::-1]:  # Sorting in descending order
    print(f"{X.columns[idx]}: {importances[idx]:.4f}")

# Predict using the test data
y_pred = regressor.predict(X_test)

# Display predictions vs actual values
np.set_printoptions(precision=2)  # Display only 2 decimals after the column for the numerical values
comparison = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), axis=1)  # Fixed the axis argument
print(comparison)




Feature Importance from Support Vector Regression (SVR):
G3: 0.9816
schoolsup: 0.0206
G2: 0.0189
activities: 0.0085
famsup: 0.0080
nursery: 0.0044
Dalc: 0.0037
failures: 0.0033
absences: 0.0029
Medu: 0.0024
sex: 0.0009
Walc: 0.0008
Mjob: 0.0007
G1: 0.0006
higher: 0.0000
[[ 1.40e+01  1.40e+01]
 [ 9.94e+00  1.00e+01]
 [ 8.97e+00  9.00e+00]
 [ 1.49e+01  1.50e+01]
 [ 1.59e+01  1.60e+01]
 [ 1.20e+01  1.20e+01]
 [ 1.39e+01  1.40e+01]
 [ 1.09e+01  1.10e+01]
 [ 8.95e+00  9.00e+00]
 [ 1.49e+01  1.50e+01]
 [ 9.91e+00  1.00e+01]
 [ 1.11e+01  1.10e+01]
 [ 9.98e+00  1.00e+01]
 [ 7.98e+00  8.00e+00]
 [ 1.50e+01  1.50e+01]
 [ 1.39e+01  1.40e+01]
 [ 1.20e+01  1.20e+01]
 [ 1.10e-01  0.00e+00]
 [ 9.93e+00  1.00e+01]
 [ 1.59e+01  1.60e+01]
 [ 6.62e-02  0.00e+00]
 [ 1.65e-02  0.00e+00]
 [ 7.91e+00  8.00e+00]
 [ 1.59e+01  1.60e+01]
 [ 1.19e+01  1.20e+01]
 [ 1.50e+01  1.50e+01]
 [ 9.94e+00  1.00e+01]
 [ 1.90e+01  1.90e+01]
 [-7.99e-02  0.00e+00]
 [ 1.39e+01  1.40e+01]
 [-8.28e-02  0.00e+00]
 [ 9.91e+00  1.0

In [28]:
# Evaluating the model performance

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9998576191201302