In [9]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [17]:
# Load the two student-performance tables (mathematics and Portuguese courses).
# Both files are ';'-separated and carry their column names in the first row.
# NOTE(review): the name `math` coincides with the stdlib module of the same
# name; it is kept unchanged here because other cells may refer to it.
csv_options = dict(sep=';', header=0)
math = pd.read_csv("./student-mat.csv", **csv_options)
por = pd.read_csv("./student-por.csv", **csv_options)

In [131]:
# Univariate association of every feature with the final grade G3
# (F-statistics; correlation method credited to David Boules in the original).
#
# * Categorical features: one-way ANOVA (scipy `f_oneway`) on G3 grouped by the
#   feature's levels. ANOVA assumes approximately normal data within groups.
# * Numerical features: scikit-learn `f_classif`.
#   NOTE(review): `f_classif` treats each distinct G3 value as a class label;
#   for a numeric target `f_regression` may be the more natural choice --
#   confirm before relying on these p-values.
from scipy.stats import f_oneway
from sklearn.feature_selection import f_classif

# Features with a p-value below this threshold are kept.
SIGNIFICANCE_LEVEL = 0.05

# Target feature is the final grade G3.
target = "G3"
X = por.drop(columns=[target])
y = por[target]

# Separate numerical and categorical columns (target already excluded).
numerical_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# --- Categorical features vs. G3: one-way ANOVA across the feature's levels ---
# The p-values are collected so the selection below uses the statistics that
# actually belong to the categorical features.
categorical_p_values = []
for col in categorical_features:
    groups = [y[X[col] == value] for value in X[col].unique()]
    f_stat, p_value = f_oneway(*groups)
    categorical_p_values.append(p_value)
    print(f"Feature: {col}, F_statistic:{f_stat}, P-value: {p_value}")

# --- Numerical features vs. G3: F-test ---
f_scores, p_values = f_classif(X[numerical_features], y)

# Display the results; each row reports its own F score (not the leftover
# `f_stat` from the categorical loop above).
for col, f_score, p_value in zip(numerical_features, f_scores, p_values):
    print(f"Feature: {col}, F_statistic:{f_score}, P-value: {p_value}")

# Keep the features that are significant at SIGNIFICANCE_LEVEL, pairing each
# feature list with its own p-values.
selected_features_num = [
    col
    for col, p_value in zip(numerical_features, p_values)
    if p_value < SIGNIFICANCE_LEVEL
]
selected_features_cat = [
    col
    for col, p_value in zip(categorical_features, categorical_p_values)
    if p_value < SIGNIFICANCE_LEVEL
]

print("Selected Features:", selected_features_num, selected_features_cat)

Feature: school, F_statistic:56.89067686337133, P-value: 1.5661990923002604e-13
Feature: sex, F_statistic:10.962308407124874, P-value: 0.0009815287061373317
Feature: address, F_statistic:18.707910527412754, P-value: 1.7641534609222437e-05
Feature: famsize, F_statistic:1.3137906447644496, P-value: 0.2521332216658279
Feature: Pstatus, F_statistic:0.0003677569375753126, P-value: 0.984705825951084
Feature: Mjob, F_statistic:7.370224291121831, P-value: 8.30514988494739e-06
Feature: Fjob, F_statistic:3.2726805958419667, P-value: 0.011376280623605892
Feature: reason, F_statistic:10.248465509132915, P-value: 1.3416422874904278e-06
Feature: guardian, F_statistic:2.63816697870449, P-value: 0.07226239503367116
Feature: schoolsup, F_statistic:2.865641360496058, P-value: 0.09097103846579366
Feature: famsup, F_statistic:2.2759054076786045, P-value: 0.1318865120420208
Feature: paid, F_statistic:1.955810225639281, P-value: 0.16244124863569093
Feature: activities, F_statistic:2.321335165021925, P-value

In [133]:
# Build the model matrix: keep the selected features, standardise the numeric
# columns and integer-encode the categorical ones.

from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.metrics import r2_score  # used by the evaluation cells further down

# Define the target column and features
target_column = "G3"
selected_columns = ['Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2',
                    'sex', 'Mjob', 'schoolsup', 'famsup', 'activities',
                    'nursery', 'higher']
# Explicit .copy(): the column assignments below must write to an independent
# frame, not to a selection of `por` (avoids SettingWithCopyWarning and keeps
# `por` untouched so the cell can be re-run safely).
X = por[selected_columns].copy()
y = por[target_column]

# Apply the scaler to the numeric columns.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split performed in a later cell, so test-set statistics leak into
# the transformation. Consider fitting on the training rows only.
numerical_cols = X.select_dtypes(include=['number']).columns
sc_X = StandardScaler()
X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])

# Scaling y is not necessary for this model, so the target is left as-is.

# Encode categorical columns as integer codes.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; for any feature
# with more than two levels a linear model will read those codes as an
# ordering -- one-hot encoding may be more appropriate. Confirm.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])


In [135]:
# Hold out 20 % of the rows as a test set; the fixed seed keeps the partition
# identical from run to run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [85]:
# Fit an ordinary least-squares baseline on the training split.

from sklearn.linear_model import LinearRegression

# `fit` returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell's output.
regressor

In [129]:
# Predict on the held-out rows and compare against the true grades.
# Both vectors are shaped (n_samples, 1) so they can be shown side by side.
y_pred = regressor.predict(X_test).reshape(-1, 1)

# `y_test` comes out of train_test_split as a pandas Series, which has no
# `.reshape()` method; going through np.asarray works for a Series and for an
# array alike, so re-running this cell is safe.
# The name is rebound (as in the original cell) because later cells use
# `y_test` as a NumPy array.
y_test = np.asarray(y_test).reshape(-1, 1)

# Print predictions (left column) next to actual values (right column).
np.set_printoptions(precision=2, suppress=True)  # Suppress scientific notation
print(np.concatenate((y_pred, y_test), axis=1))

# Evaluating the model performance on the test set
r2_test_score = r2_score(y_test, y_pred)
print(f"R² score on the test set: {r2_test_score}")

[[ 7.31  8.  ]
 [14.42 15.  ]
 [16.31 16.  ]
 [10.37 10.  ]
 [ 8.88 10.  ]
 [12.72 12.  ]
 [13.03 13.  ]
 [18.63 17.  ]
 [11.85 12.  ]
 [11.32 12.  ]
 [10.82 11.  ]
 [10.35 10.  ]
 [13.51 13.  ]
 [ 8.37  8.  ]
 [18.45 18.  ]
 [12.12 12.  ]
 [12.9  13.  ]
 [12.24 13.  ]
 [10.54 10.  ]
 [ 9.96 10.  ]
 [12.16 12.  ]
 [10.28 10.  ]
 [17.47 17.  ]
 [13.21 15.  ]
 [12.41 14.  ]
 [ 0.57  0.  ]
 [12.09 12.  ]
 [13.28 14.  ]
 [11.28 12.  ]
 [12.58  9.  ]
 [13.56 13.  ]
 [16.89 16.  ]
 [13.3  13.  ]
 [16.36 16.  ]
 [12.98 12.  ]
 [ 8.71 10.  ]
 [ 9.31 10.  ]
 [11.15 11.  ]
 [13.   13.  ]
 [11.28 10.  ]
 [15.63 15.  ]
 [17.64 18.  ]
 [11.71 11.  ]
 [13.36 13.  ]
 [12.34 13.  ]
 [ 9.21 10.  ]
 [12.63 14.  ]
 [ 9.46  9.  ]
 [11.58 11.  ]
 [ 9.13 10.  ]
 [ 5.48  8.  ]
 [14.39 17.  ]
 [ 8.93  9.  ]
 [12.11 13.  ]
 [ 7.05  8.  ]
 [10.99 11.  ]
 [11.63 12.  ]
 [11.37 12.  ]
 [14.42 15.  ]
 [14.43 15.  ]
 [13.4  13.  ]
 [ 7.33  7.  ]
 [11.53 12.  ]
 [ 8.8  10.  ]
 [13.45 12.  ]
 [12.7  12.  ]
 [11.31 11

In [109]:
## Applying k-Fold Cross Validation
# For a regressor, cross_val_score reports the coefficient of determination
# (R²) of each fold -- not a classification accuracy -- so the scoring is made
# explicit and the printed labels say what is actually measured.

from sklearn.model_selection import cross_val_score

cv_r2_scores = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10, scoring="r2"
)
# Backward-compatible alias for the name this cell originally defined.
accuracies = cv_r2_scores

print("Mean cross-validated R²: {:.4f}".format(cv_r2_scores.mean()))
print("Standard deviation of R² across folds: {:.4f}".format(cv_r2_scores.std()))

Accuracy: 82.88 %
Standard Deviation: 5.04 %


In [125]:
# Hyperparameter tuning
#
# A multiple regression model can suffer from multicollinearity (correlated
# features) or overfitting. Ridge regression addresses both by adding an L2
# penalty that shrinks the coefficients; the penalty strength `alpha` is
# chosen here by cross-validated grid search.

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import numpy as np

# Define hyperparameter grid: 10 log-spaced values of alpha from 1e-5 to 1e5.
param_grid = {
    'alpha': np.logspace(-5, 5, 10),  # Regularization strength (L2 penalty)
}

# Perform GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='r2',  # Optimize for R² score
    verbose=1  # Optional: display progress
)
grid_search.fit(X_train, y_train)

# Best hyperparameters and best (mean cross-validated) R² score
print(f"Best Hyperparameters from GridSearchCV: {grid_search.best_params_}")
print(f"Best R² score from GridSearchCV: {grid_search.best_score_}")

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the whole training set, so no extra .fit() call is needed.
best_regressor = grid_search.best_estimator_

# Predict using the test data. The target was never scaled, so the predictions
# are already in grade units and need no inverse transform.
y_pred = best_regressor.predict(X_test)

# `y_test` may be a pandas Series (straight from train_test_split) or a NumPy
# array of shape (n,) or (n, 1); flatten it to a 1-D array so the code below
# works in every case (a Series has no `.reshape()` method).
y_test_values = np.asarray(y_test).ravel()

# Display predictions (left column) vs actual values (right column)
np.set_printoptions(precision=2, suppress=True)  # Suppress scientific notation
comparison = np.column_stack((y_pred, y_test_values))
print("Predictions vs Actual:")
print(comparison)

# Get the regression coefficients and intercept
print("Coefficients:", best_regressor.coef_)
print("Intercept:", best_regressor.intercept_)

# Evaluate the model performance on the test set
r2_test_score = r2_score(y_test_values, y_pred)
print(f"R² score on the test set: {r2_test_score}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters from GridSearchCV: {'alpha': 3.593813663804626}
Best R² score from GridSearchCV: 0.83243878823417
Predictions vs Actual:
[[ 7.36  8.  ]
 [14.41 15.  ]
 [16.28 16.  ]
 [10.37 10.  ]
 [ 8.89 10.  ]
 [12.74 12.  ]
 [13.09 13.  ]
 [18.59 17.  ]
 [11.84 12.  ]
 [11.35 12.  ]
 [10.85 11.  ]
 [10.38 10.  ]
 [13.51 13.  ]
 [ 8.39  8.  ]
 [18.42 18.  ]
 [12.13 12.  ]
 [12.93 13.  ]
 [12.25 13.  ]
 [10.56 10.  ]
 [ 9.97 10.  ]
 [12.2  12.  ]
 [10.27 10.  ]
 [17.45 17.  ]
 [13.19 15.  ]
 [12.39 14.  ]
 [ 0.72  0.  ]
 [12.1  12.  ]
 [13.26 14.  ]
 [11.29 12.  ]
 [12.56  9.  ]
 [13.57 13.  ]
 [16.89 16.  ]
 [13.27 13.  ]
 [16.32 16.  ]
 [12.98 12.  ]
 [ 8.71 10.  ]
 [ 9.35 10.  ]
 [11.16 11.  ]
 [13.01 13.  ]
 [11.25 10.  ]
 [15.62 15.  ]
 [17.6  18.  ]
 [11.71 11.  ]
 [13.33 13.  ]
 [12.33 13.  ]
 [ 9.26 10.  ]
 [12.64 14.  ]
 [ 9.47  9.  ]
 [11.57 11.  ]
 [ 9.11 10.  ]
 [ 5.53  8.  ]
 [14.41 17.  ]
 [ 8.97  9.  ]
 [