In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("../dataset/Student_Performance.csv")

df.head()



Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [37]:
df.shape


(10000, 6)

In [38]:
df.columns


Index(['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index'],
      dtype='object')

In [39]:
df.dtypes


Hours Studied                         int64
Previous Scores                       int64
Extracurricular Activities           object
Sleep Hours                           int64
Sample Question Papers Practiced      int64
Performance Index                   float64
dtype: object

In [40]:
df.describe()


Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [41]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [42]:
df.nunique()


Hours Studied                        9
Previous Scores                     60
Extracurricular Activities           2
Sleep Hours                          6
Sample Question Papers Practiced    10
Performance Index                   91
dtype: int64

In [43]:
df.isnull().sum()


Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [44]:
df = df.fillna(df.mean(numeric_only=True))


In [45]:
df.duplicated().sum()


np.int64(127)

In [46]:
df = df.drop_duplicates()


In [47]:
numeric_cols = df.select_dtypes(include=[np.number]).columns

Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

outliers = ((df[numeric_cols] < (Q1 - 1.5 * IQR)) | 
            (df[numeric_cols] > (Q3 + 1.5 * IQR))).sum()

outliers


Hours Studied                       0
Previous Scores                     0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [48]:
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns

num_cols, cat_cols


(Index(['Hours Studied', 'Previous Scores', 'Sleep Hours',
        'Sample Question Papers Practiced', 'Performance Index'],
       dtype='object'),
 Index(['Extracurricular Activities'], dtype='object'))

In [49]:
df_no_outliers = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | 
                       (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
df = df_no_outliers


In [50]:
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns

num_cols, cat_cols


(Index(['Hours Studied', 'Previous Scores', 'Sleep Hours',
        'Sample Question Papers Practiced', 'Performance Index'],
       dtype='object'),
 Index(['Extracurricular Activities'], dtype='object'))

In [51]:
df['Extracurricular Activities'] = df['Extracurricular Activities'].map({'Yes':1, 'No':0})
df.head()


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [52]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_df = df.copy()
scaled_df[num_cols] = scaler.fit_transform(df[num_cols])

scaled_df.head()


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,0.775566,1.706168,1,1.454025,-1.249715,1.862979
1,-0.383205,0.724912,0,-1.491315,-0.900925,0.509348
2,1.161822,-1.064438,1,0.275889,-0.900925,-0.531907
3,0.003052,-1.006717,1,-0.902247,-0.900925,-1.000471
4,0.775566,0.320865,0,0.864957,0.145444,0.561411


In [53]:
from sklearn.model_selection import train_test_split

X = scaled_df.drop("Performance Index", axis=1)
y = scaled_df["Performance Index"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [54]:
from sklearn.model_selection import train_test_split

X = scaled_df.drop("Performance Index", axis=1)
y = scaled_df["Performance Index"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [55]:
# quick kernel & variable checks — run this cell first
import sys
print("Python:", sys.version.splitlines()[0])

# Is kernel alive?
try:
    _ = 1/1
    print("Kernel: OK")
except Exception as e:
    print("Kernel error:", e)

# Are required variables defined?
print("scaled_df in globals():", 'scaled_df' in globals())
if 'scaled_df' in globals():
    try:
        print("scaled_df.shape:", scaled_df.shape)
        print("columns:", list(scaled_df.columns))
    except Exception as e:
        print("Error inspecting scaled_df:", e)

# Are sklearn and train_test_split available?
try:
    import sklearn
    from sklearn.model_selection import train_test_split
    print("sklearn version:", sklearn.__version__)
except Exception as e:
    print("sklearn import error:", e)


Python: 3.13.9 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 19:09:58) [MSC v.1929 64 bit (AMD64)]
Kernel: OK
scaled_df in globals(): True
scaled_df.shape: (9873, 6)
columns: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']
sklearn version: 1.7.2


In [56]:
# CLEAN, single-run block: split, train Linear Regression, evaluate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Defensive: strip whitespace from column names (just in case)
scaled_df.columns = scaled_df.columns.str.strip()

# Prepare X and y (use exact column name shown in your notebook)
X = scaled_df.drop("Performance Index", axis=1)
y = scaled_df["Performance Index"]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shapes -> X_train:", X_train.shape, "X_test:", X_test.shape)
print("         y_train:", y_train.shape, "y_test:", y_test.shape)

# Train Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict & evaluate
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nLinear Regression results")
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 score:", r2)


Shapes -> X_train: (7898, 5) X_test: (1975, 5)
         y_train: (7898,) y_test: (1975,)

Linear Regression results
MSE: 0.011671265615520262
RMSE: 0.1080336318723029
R2 score: 0.9884301209927054


In [57]:
from sklearn.linear_model import Ridge, Lasso

ridge = Ridge(alpha=1.0).fit(X_train, y_train)
lasso = Lasso(alpha=0.01).fit(X_train, y_train)

for name, model in [("Ridge", ridge), ("Lasso", lasso)]:
    pred = model.predict(X_test)
    print(f"\n{name} R2:", r2_score(y_test, pred), "RMSE:", np.sqrt(mean_squared_error(y_test, pred)))



Ridge R2: 0.9884299406717718 RMSE: 0.1080344737414981

Lasso R2: 0.9874700926716604 RMSE: 0.11242644993373414


In [58]:
# Block 5 — encode categorical features
# For this dataset the column is 'Extracurricular Activities' with Yes/No
if 'Extracurricular Activities' in df.columns:
    df['Extracurricular Activities'] = df['Extracurricular Activities'].map({'Yes':1, 'No':0})
    # fallback if mapping created NaNs
    if df['Extracurricular Activities'].isnull().any():
        df['Extracurricular Activities'] = df['Extracurricular Activities'].fillna(0).astype(int)

# If there are other categorical columns, one-hot encode them:
other_cat = [c for c in cat_cols if c != 'Extracurricular Activities']
if other_cat:
    df = pd.get_dummies(df, columns=other_cat, drop_first=True)

print("After encoding, columns:", df.columns.tolist())


After encoding, columns: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']


In [59]:
# Block 6 — IQR outlier detection and capping (winsorizing)
numeric_features = [c for c in df.select_dtypes(include=[np.number]).columns if c != 'Performance Index']

Q1 = df[numeric_features].quantile(0.25)
Q3 = df[numeric_features].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

print("Outlier counts:")
print(((df[numeric_features] < lower) | (df[numeric_features] > upper)).sum())

# Cap values to limits
for c in numeric_features:
    df[c] = np.where(df[c] < lower[c], lower[c], df[c])
    df[c] = np.where(df[c] > upper[c], upper[c], df[c])

print("Outliers capped (winsorized).")


Outlier counts:
Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
dtype: int64
Outliers capped (winsorized).


In [60]:
# Block 11 — Linear Regression baseline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("LinearRegression -> RMSE: {:.4f}, R2: {:.4f}".format(rmse_lr, r2_lr))


LinearRegression -> RMSE: 0.1080, R2: 0.9884


In [61]:
# Block 11 — Linear Regression baseline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("LinearRegression -> RMSE: {:.4f}, R2: {:.4f}".format(rmse_lr, r2_lr))


LinearRegression -> RMSE: 0.1080, R2: 0.9884


In [62]:
# BLOCK C — evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = lr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Evaluation on test set:")
print(f"  MSE : {mse:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE : {mae:.4f}")
print(f"  R2  : {r2:.4f}")


Evaluation on test set:
  MSE : 0.0117
  RMSE: 0.1080
  MAE : 0.0857
  R2  : 0.9884


In [63]:
# BLOCK D — cross-validation (5-fold) for RMSE and R2
from sklearn.model_selection import cross_val_score
import numpy as np

# cross_val_score with neg_mean_squared_error returns negative MSE
neg_mse = cross_val_score(lr, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
rmse_cv = np.sqrt(-neg_mse)
r2_cv = cross_val_score(lr, X_train, y_train, scoring="r2", cv=5)

print("Cross-val (5-fold) RMSE: mean={:.4f}, std={:.4f}".format(rmse_cv.mean(), rmse_cv.std()))
print("Cross-val (5-fold) R2  : mean={:.4f}, std={:.4f}".format(r2_cv.mean(), r2_cv.std()))


Cross-val (5-fold) RMSE: mean=0.1061, std=0.0012
Cross-val (5-fold) R2  : mean=0.9887, std=0.0003


In [64]:
# --- MODEL TRAINING BLOCK (Linear Regression) ---

from sklearn.linear_model import LinearRegression

# Create model
lr = LinearRegression()

# Train (fit) model
lr.fit(X_train, y_train)

print("Linear Regression model trained successfully!")


Linear Regression model trained successfully!


In [65]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X_train, y_train)

print("Lasso Regression model trained!")


Lasso Regression model trained!


In [66]:
import joblib

# Save the trained Linear Regression model
joblib.dump(lr, "linear_regression_model.joblib")

print("Model saved successfully as: linear_regression_model.joblib")


Model saved successfully as: linear_regression_model.joblib
