In [3]:
import os

DATA_FOLDER = os.path.join(os.getcwd(), 'data')
DATA_FILE = os.path.join(DATA_FOLDER, 'Student_Performance.csv')

try:
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER) # if not exist create the dir
    print(f"Data folder found: {DATA_FOLDER}\nContains dataset: {os.path.basename(DATA_FILE)}")
except Exception as e:
    # Handle the exception
    print(f"Error: {e}")

Data folder found: C:\Users\ffore\Desktop\Francesco - QNAP\ML\Multivariate Linear Regression\data
Contains dataset: Student_Performance.csv


In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

df = pd.read_csv("/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv'

In [None]:
# Transform categorical variable into numerical
df['Extracurricular Activities'] = df['Extracurricular Activities'].map({'Yes': 1, 'No': 0})

In [None]:
# Check data type and non-null elements
df.info()

In [None]:
# Prediction target, and input features
y = df['Performance Index']
x = df.drop(columns = "Performance Index")

In [None]:
# Training set and validation set
x_train, x_validation, y_train, y_validation = train_test_split(x, y, random_state=0)

In [None]:
# Model: Multivariate Linear Regression
linear_regression_model = LinearRegression()
linear_regression_model.fit(x_train, y_train)
y_hat = linear_regression_model.predict(x_validation)

In [None]:
# Model Error: MAE
print(f"Model Error: \u00B1 {mean_absolute_error(y_validation, y_hat)} in the Performance Index")

In [None]:
# Model Evaluation 1: Predictions vs. Actual Values
plt.scatter(y_validation, y_hat, color='blue')  # Predictions vs actual values
plt.plot([y_validation.min(), y_validation.max()], [y_validation.min(), y_validation.max()], 'k--', lw=3)  # Ideal line where predictions match actual values
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

In [None]:
# Model Evaluation 2: Residuals plot
residuals = y_validation - y_hat

plt.scatter(y_hat, residuals)
plt.hlines(y=0, xmin=y_hat.min(), xmax=y_hat.max(), colors='red', linestyles='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

In [None]:
# Model Evaluation 3: Coefficient Plot
coefficients = pd.DataFrame(linear_regression_model.coef_, x.columns, columns=['Coefficient'])

coefficients.sort_values(by='Coefficient', ascending=True).plot(kind='barh')
plt.title('Coefficients in the Multivariate Linear Model')
plt.xlabel('Coefficient Value')
plt.ylabel('Features')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, validation_scores = learning_curve(linear_regression_model, x, y, train_sizes=np.linspace(0.1, 1.0, 10))

plt.plot(train_sizes, train_scores.mean(axis=1), label='Training score')
plt.plot(train_sizes, validation_scores.mean(axis=1), label='Validation score')
plt.xlabel('Training set size')
plt.ylabel('Score')
plt.title('Learning Curve')
plt.legend()
plt.show()