In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('data/stud.csv')

In [3]:
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
df['parental_level_of_education'] = df['parental_level_of_education'].str.replace("'", "")
df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("/", "_")

In [5]:
for col in ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']:
    df[col] = df[col].str.lower()

In [6]:
numerical_cols = [col for col in df.columns if df[col].dtype != 'O']
categorical_cols = [col for col in df.columns if df[col].dtype == 'O']

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['math_score', 'reading_score', 'writing_score']
Categorical Columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [7]:
# Add total score
df['total_score'] = df['math_score'] + df['reading_score'] + df['writing_score']

# Add average score
df['average_score'] = df[['math_score','reading_score','writing_score']].mean(axis=1)

In [8]:
math_full_count = (df['math_score'] == 100).sum()
reading_full_count = (df['reading_score'] == 100).sum()
writing_full_count = (df['writing_score'] == 100).sum()

In [9]:
print("Students with full marks in math:", math_full_count)
print("Students with full marks in reading:", reading_full_count)
print("Students with full marks in writing:", writing_full_count)

Students with full marks in math: 7
Students with full marks in reading: 17
Students with full marks in writing: 14


In [10]:
math_low_count = (df['math_score'] < 20).sum()
reading_low_count = (df['reading_score'] < 20).sum()
writing_low_count = (df['writing_score'] < 20).sum()

print("Students with less than 20 marks in math:", math_low_count)
print("Students with less than 20 marks in reading:", reading_low_count)
print("Students with less than 20 marks in writing:", writing_low_count)

Students with less than 20 marks in math: 4
Students with less than 20 marks in reading: 1
Students with less than 20 marks in writing: 3


In [11]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,female,group b,bachelors degree,standard,none,72,72,74,218,72.666667
1,female,group c,some college,standard,completed,69,90,88,247,82.333333
2,female,group b,masters degree,standard,none,90,95,93,278,92.666667
3,male,group a,associates degree,free/reduced,none,47,57,44,148,49.333333
4,male,group c,some college,standard,none,76,78,75,229,76.333333


## MODEL TRAINING

In [12]:
X = df.drop(['average_score', 'total_score'], axis=1)
y = df['average_score']                  # target

In [13]:
categorical_cols = [col for col in X.columns if X[col].dtype == 'O']
numerical_cols   = [col for col in X.columns if X[col].dtype != 'O']

In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [15]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75


In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

# Optional: convert back to DataFrame (recommended)
import pandas as pd
X = pd.DataFrame(X_scaled, columns=X.columns)

In [17]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,-0.964625,-1.015044,-0.81264,0.741881,0.746748,0.390024,0.193999,0.391492
1,-0.964625,-0.150441,0.827953,0.741881,-1.33914,0.192076,1.427476,1.313269
2,-0.964625,-1.015044,0.281088,0.741881,0.746748,1.577711,1.770109,1.642475
3,1.036672,-1.879647,-1.359505,-1.347925,0.746748,-1.259543,-0.833899,-1.583744
4,1.036672,-0.150441,0.827953,0.741881,0.746748,0.653954,0.605158,0.457333


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (800, 8)
X_test shape: (200, 8)
y_train shape: (800,)
y_test shape: (200,)


In [23]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

In [24]:
models = {
    "Linear Regression": LinearRegression(),
    "AdaBoost": AdaBoostRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Random Forest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    "Lasso": Lasso(),
    "Ridge": Ridge()
}

def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)
    
    results.append([
        name, 
        round(train_mae, 4), round(test_mae, 4),
        round(train_rmse, 4), round(test_rmse, 4),
        round(train_r2, 4), round(test_r2, 4)
    ])

# Convert results into DataFrame
results_df = pd.DataFrame(results, columns=[
    "Model", 
    "Train MAE", "Test MAE",
    "Train RMSE", "Test RMSE",
    "Train R2", "Test R2"
])

print(results_df)

               Model  Train MAE  Test MAE  Train RMSE  Test RMSE  Train R2  \
0  Linear Regression     0.0000    0.0000      0.0000     0.0000    1.0000   
1           AdaBoost     1.3354    1.5448      1.7187     2.2135    0.9852   
2  Gradient Boosting     0.3368    0.5496      0.4263     0.9570    0.9991   
3      Random Forest     0.1783    0.5262      0.2775     1.0935    0.9996   
4                SVR     1.3748    1.9913      3.3665     5.8189    0.9432   
5            XGBoost     0.0394    0.5827      0.0553     1.1093    1.0000   
6              Lasso     0.8506    0.8769      1.0645     1.1142    0.9943   
7              Ridge     0.0063    0.0066      0.0078     0.0086    1.0000   

   Test R2  
0   1.0000  
1   0.9771  
2   0.9957  
3   0.9944  
4   0.8420  
5   0.9943  
6   0.9942  
7   1.0000  


## Ridge is comparatively better 

In [25]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Train model
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Accuracy (R2 Score)
score = r2_score(y_test, y_pred) * 100

print(f"Accuracy of the Ridge model is: {score:.2f}%")

Accuracy of the Ridge model is: 100.00%
