In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,root_mean_squared_error
from sklearn.linear_model import LinearRegression,ridge_regression,Ridge,Lasso
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.preprocessing import OneHotEncoder,StandardScaler,Normalizer
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
df=pd.read_csv('StudentsPerformance.csv')

In [4]:
# Preview the first two rows to check that the columns loaded as expected.
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [5]:
# Separate the regression target ('math score') from the predictor columns.
y = df['math score']
X = df.drop(columns=['math score'])

In [6]:
from sklearn.compose import ColumnTransformer

# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
cat_feat = X.select_dtypes(include="object").columns
num_feat = X.select_dtypes(exclude="object").columns

oh_transf = OneHotEncoder()
num_transf = StandardScaler()

# One-hot encode the categorical columns and standardise the numeric ones.
# The categorical transformer is listed first, then the numeric one.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehotencoder", oh_transf, cat_feat),
        ("StandardScaler", num_transf, num_feat),
    ]
)

In [7]:
# Fit the preprocessor and replace X with the encoded/scaled feature matrix.
# NOTE(review): the fit happens on the full dataset, before any train/test
# split, so the fitted scaler/encoder has seen rows that later land in a
# test set — consider fitting on the training split only.
# NOTE(review): X is rebound here, so this cell (and the select_dtypes cell
# that expects the original DataFrame) is presumably not safe to re-run
# without first re-running the cell that defines X from df.
X=preprocessor.fit_transform(X)

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# Show the shapes of the training features and training target.
(X_train.shape, y_train.shape)

((700, 19), (700,))

In [10]:
# Display the transformed feature matrix.
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [13]:
def model_evaluate(true,predicted):
    """
    Compute regression error metrics for a set of predictions.

    Parameters:
        true: Ground-truth target values.
        predicted: Predicted values, aligned element-wise with ``true``.

    Returns:
        tuple: ``(mae, mse, r2_square)`` -- mean absolute error, mean squared
        error and R² score, in that order.
    """
    # MAE must come from mean_absolute_error; using mean_squared_error here
    # would make the first returned value a duplicate of the MSE.
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    r2_square=r2_score(true,predicted)
    # The return shape is kept as a 3-tuple; RMSE is not part of it, so it is
    # not computed here.
    return mae, mse, r2_square

In [16]:
def evaluate_regression_models(X, y, test_size=0.2, random_state=42):
    """
    Trains multiple regression models on one train/test split and compares them.

    Parameters:
        X (pd.DataFrame or np.array): Feature matrix
        y (pd.Series or np.array): Target variable
        test_size (float): Proportion of data to use for testing
        random_state (int): Seed for reproducibility. Used for the train/test
            split and passed to the estimators that have their own source of
            randomness (decision tree, random forest, XGBoost).

    Returns:
        pd.DataFrame: One row per model with the columns "Model", "R2 Score"
        and "MAE", sorted by "R2 Score" in descending order.
    """

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Candidate regressors, keyed by the display name used in the results.
    # The tree-based models are seeded so repeated calls with the same
    # random_state give the same scores; without a seed they can differ from
    # run to run even on an identical split.
    models = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "XGBoost": XGBRegressor(random_state=random_state),
        "SVR": SVR(),
        "KNN Regressor": KNeighborsRegressor()
    }

    # One record per model; turned into a DataFrame once at the end.
    records = []

    for name, model in models.items():
        # Train on the training split, then score on the held-out split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        records.append({
            "Model": name,
            "R2 Score": r2_score(y_test, y_pred),
            "MAE": mean_absolute_error(y_test, y_pred),
        })

    # Explicit columns keep the column order fixed.
    results = pd.DataFrame(records, columns=["Model", "R2 Score", "MAE"])
    return results.sort_values(by="R2 Score", ascending=False)

In [17]:
# Compare all candidate regressors on X as currently bound, using the
# function's default split settings (test_size=0.2, random_state=42).
results = evaluate_regression_models(X, y)
print(results)

               Model  R2 Score       MAE
1   Ridge Regression  0.880593  4.211101
0  Linear Regression  0.880433  4.214763
4      Random Forest  0.853450  4.623535
5            XGBoost  0.827797  5.057731
2   Lasso Regression  0.825320  5.157882
7      KNN Regressor  0.783813  5.621000
3      Decision Tree  0.743855  6.200000
6                SVR  0.728600  5.401539
