In [21]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [22]:
# Load both student-performance CSVs. The files are semicolon-delimited and
# their first row holds the column names.
# Note: the name `math` is also the name of a standard-library module.
math, por = (
    pd.read_csv(csv_path, sep=";", header=0)
    for csv_path in ("./student-mat.csv", "./student-por.csv")
)

In [23]:
# Correlation of the independent variables with the dependent variable
# (method from David Boules): F-statistics of every predictor against the
# final grade G3, followed by a p-value based feature selection.

from scipy.stats import f_oneway
from sklearn.feature_selection import f_classif

# Target feature is the final grade G3
target = "G3"
# Significance threshold used to keep a feature
ALPHA = 0.05

X = por.drop(columns=[target])
y = por[target]

# Separate numerical and categorical predictor columns (target excluded)
numerical_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Categorical predictors: one-way ANOVA comparing the mean of G3 between the
# groups formed by each category level.
# Assumption: the data is normally distributed.
# The p-values are stored per feature so the selection below uses the values
# that actually belong to the categorical columns.
cat_f_scores = []
cat_p_values = []
for col in categorical_features:
    groups = [y[X[col] == value] for value in X[col].unique()]
    f_stat, p_value = f_oneway(*groups)
    cat_f_scores.append(f_stat)
    cat_p_values.append(p_value)
    print(f"Feature: {col}, F_statistic:{f_stat}, P-value: {p_value}")

# Numerical predictors: F-test of each column against the target.
# NOTE(review): f_classif treats every distinct G3 value as a class label;
# f_regression may be the better fit for a numeric target - confirm.
f_scores, p_values = f_classif(X[numerical_features], y)

# Display the results (each row prints its own F-score)
for col, f_score, p_value in zip(numerical_features, f_scores, p_values):
    print(f"Feature: {col}, F_statistic:{f_score}, P-value: {p_value}")

# Keep the features whose own p-value is below the significance threshold
selected_features_num = [
    col for col, p_value in zip(numerical_features, p_values) if p_value < ALPHA
]
selected_features_cat = [
    col for col, p_value in zip(categorical_features, cat_p_values) if p_value < ALPHA
]

print("Selected Features:", selected_features_num, selected_features_cat)

Feature: school, F_statistic:56.89067686337133, P-value: 1.5661990923002604e-13
Feature: sex, F_statistic:10.962308407124874, P-value: 0.0009815287061373317
Feature: address, F_statistic:18.707910527412754, P-value: 1.7641534609222437e-05
Feature: famsize, F_statistic:1.3137906447644496, P-value: 0.2521332216658279
Feature: Pstatus, F_statistic:0.0003677569375753126, P-value: 0.984705825951084
Feature: Mjob, F_statistic:7.370224291121831, P-value: 8.30514988494739e-06
Feature: Fjob, F_statistic:3.2726805958419667, P-value: 0.011376280623605892
Feature: reason, F_statistic:10.248465509132915, P-value: 1.3416422874904278e-06
Feature: guardian, F_statistic:2.63816697870449, P-value: 0.07226239503367116
Feature: schoolsup, F_statistic:2.865641360496058, P-value: 0.09097103846579366
Feature: famsup, F_statistic:2.2759054076786045, P-value: 0.1318865120420208
Feature: paid, F_statistic:1.955810225639281, P-value: 0.16244124863569093
Feature: activities, F_statistic:2.321335165021925, P-value

In [24]:
# Encode the categorical variables and scale the numerical variables

from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.metrics import r2_score

# Define the target column and the predictor columns used by the models
target_column = "G3"
selected_columns = [
    'Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2',
    'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher',
]

# Explicit copy: the column assignments below must write into an independent
# frame, not into a selection derived from `por` (avoids pandas'
# SettingWithCopyWarning and leaves `por` untouched).
X = por[selected_columns].copy()
y = por[target_column]

# Standardise the numerical columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; consider fitting it on the
# training rows only.
numerical_cols = X.select_dtypes(include=['number']).columns
sc_X = StandardScaler()
X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])

# Scaling y is not necessary.

# Encode categorical columns as integer codes, one fresh encoder per column.
# NOTE(review): the fitted encoders are discarded, so the code -> label
# mapping cannot be recovered later; keep them if that mapping is needed.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])


In [25]:
# Split the dataset into a training set and a test set: 20% of the rows are
# held out for testing, and the fixed random_state makes the split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [26]:
# Fit a decision-tree regressor on the training split (seeded for repeatability)
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
# Last expression of the cell: displays the fitted estimator
regressor

In [27]:
import numpy as np
from sklearn.metrics import r2_score

# Evaluate the decision tree on the held-out test set.
# Relies on `regressor`, `X_test` and `y_test` from the earlier cells.

# Make predictions using the trained model
y_pred = regressor.predict(X_test)

# Build a two-column (prediction, actual) table for display.
# `y_test` is deliberately NOT rebound here: converting it in place made this
# cell fail on a second run, because the converted array has no `.to_numpy()`.
# `np.asarray` accepts a pandas Series as well as an ndarray.
np.set_printoptions(precision=2, suppress=True)  # Suppress scientific notation
comparison = np.column_stack((y_pred, np.asarray(y_test)))
print("Predictions vs Actual:")
print(comparison)

# Evaluating the model performance on the test set
r2_test_score = r2_score(y_test, y_pred)
print(f"R² score on the test set: {r2_test_score}")


Predictions vs Actual:
[[ 9.   8. ]
 [15.  15. ]
 [16.  16. ]
 [11.  10. ]
 [ 9.  10. ]
 [13.  12. ]
 [12.  13. ]
 [17.  17. ]
 [14.  12. ]
 [14.  12. ]
 [10.  11. ]
 [10.  10. ]
 [14.  13. ]
 [10.   8. ]
 [18.  18. ]
 [12.  12. ]
 [13.  13. ]
 [12.  13. ]
 [11.  10. ]
 [ 9.  10. ]
 [12.  12. ]
 [11.  10. ]
 [18.  17. ]
 [13.  15. ]
 [12.  14. ]
 [ 0.   0. ]
 [13.  12. ]
 [13.  14. ]
 [12.  12. ]
 [12.   9. ]
 [14.  13. ]
 [16.  16. ]
 [13.  13. ]
 [16.  16. ]
 [14.  12. ]
 [ 8.  10. ]
 [10.  10. ]
 [12.  11. ]
 [14.  13. ]
 [12.  10. ]
 [14.  15. ]
 [18.  18. ]
 [12.  11. ]
 [12.  13. ]
 [13.  13. ]
 [10.  10. ]
 [13.  14. ]
 [11.   9. ]
 [12.  11. ]
 [11.  10. ]
 [ 8.   8. ]
 [14.  17. ]
 [11.   9. ]
 [13.  13. ]
 [ 7.   8. ]
 [10.  11. ]
 [11.  12. ]
 [10.  12. ]
 [14.  15. ]
 [16.  15. ]
 [13.  13. ]
 [ 9.   7. ]
 [11.  12. ]
 [10.  10. ]
 [14.  12. ]
 [13.  12. ]
 [14.  11. ]
 [14.  13. ]
 [14.  14. ]
 [ 0.   8. ]
 [ 7.   9. ]
 [12.  11. ]
 [13.  13. ]
 [10.  11. ]
 [13.  14. ]
 [

In [28]:
## Applying k-Fold Cross Validation

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the regressor on the training split.
# The score of a regressor is R², not a classification accuracy, so the
# metric is requested explicitly (as in the grid search) and labelled as R².
cv_r2_scores = cross_val_score(
    estimator=regressor,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="r2",
)
# Original variable name kept as an alias in case other cells refer to it
accuracies = cv_r2_scores

print("Mean R² (10-fold CV): {:.4f}".format(cv_r2_scores.mean()))
print("Standard Deviation: {:.4f}".format(cv_r2_scores.std()))

Accuracy: 66.91 %
Standard Deviation: 15.36 %


In [29]:
# Hyperparameter tuning: grid search over a random-forest regressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid (4 * 4 * 3 * 3 = 144 candidates)
param_grid = {
    'n_estimators': [10, 50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Max depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),  # The model we're tuning
    param_grid=param_grid,  # The hyperparameter grid
    cv=3,  # 3-fold cross-validation
    scoring='r2',  # R² score for evaluation
    n_jobs=-1,  # Use all CPU cores for faster processing
    verbose=1,  # Print progress during fitting
)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best cross-validated R² score
print(f"Best Parameters from GridSearchCV: {grid_search.best_params_}")
print(f"Best R² Score from GridSearchCV: {grid_search.best_score_:.4f}")

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the whole training set, so `best_estimator_` is used
# directly; fitting it again and predicting twice was redundant work.
best_regressor = grid_search.best_estimator_

# Predict using the test data
y_pred = best_regressor.predict(X_test)

# Display predictions vs actual values.
# If y_test is a pandas Series it is converted to a NumPy array; an array is
# left unchanged, so this line is safe to re-run.
y_test = y_test.to_numpy() if isinstance(y_test, pd.Series) else y_test

# Stack predictions and actual values side by side and print them
comparison = np.column_stack((y_pred, y_test))
print("Predictions vs Actual:")
print(comparison)

# Evaluating the model performance on the test set
r2_test_score = r2_score(y_test, y_pred)
print(f"R² score on the test set: {r2_test_score}")



Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters from GridSearchCV: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best R² Score from GridSearchCV: 0.8316
Predictions vs Actual:
[[ 7.7   8.  ]
 [14.66 15.  ]
 [16.08 16.  ]
 [10.34 10.  ]
 [ 9.6  10.  ]
 [12.93 12.  ]
 [12.67 13.  ]
 [17.77 17.  ]
 [12.23 12.  ]
 [11.59 12.  ]
 [10.73 11.  ]
 [10.88 10.  ]
 [13.37 13.  ]
 [ 8.33  8.  ]
 [17.78 18.  ]
 [12.9  12.  ]
 [12.68 13.  ]
 [12.55 13.  ]
 [10.78 10.  ]
 [ 9.94 10.  ]
 [12.43 12.  ]
 [10.69 10.  ]
 [17.36 17.  ]
 [13.2  15.  ]
 [12.65 14.  ]
 [ 0.86  0.  ]
 [12.53 12.  ]
 [13.05 14.  ]
 [11.53 12.  ]
 [12.57  9.  ]
 [13.39 13.  ]
 [16.1  16.  ]
 [13.09 13.  ]
 [15.99 16.  ]
 [13.44 12.  ]
 [ 8.82 10.  ]
 [ 9.36 10.  ]
 [11.34 11.  ]
 [13.48 13.  ]
 [11.21 10.  ]
 [15.48 15.  ]
 [17.41 18.  ]
 [11.3  11.  ]
 [13.39 13.  ]
 [12.75 13.  ]
 [10.   10.  ]
 [12.74 14.  ]
 [ 9.95  9.  ]
 [11.51 11.  ]
 [ 9.39 10.  ]
 [