In [71]:
import pandas as pd
import numpy as np

In [72]:
class My_MLR():

    def __init__(self):
        self.weights = None
        self.offset = None
    def fit(self,X_train,y_train):
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)
        X_train = np.insert(X_train,0,1,axis=1)
        coefficients = np.linalg.inv(np.dot(X_train.T,X_train)).dot(X_train.T).dot(y_train)
        self.offset = coefficients[0]
        self.weights = coefficients[1:]
    def predict(self,X_test):
        y_pred = np.dot(X_test,self.weights) + self.offset
        return y_pred

In [73]:
df = pd.read_csv('Student_Performance.csv')

In [74]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [75]:
df.tail()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0
9999,7,74,No,8,1,64.0


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [77]:
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [78]:
df_encoded = pd.get_dummies(df, columns=['Extracurricular Activities'], drop_first=True)

In [79]:
print(df_encoded.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Sleep Hours                       10000 non-null  int64  
 3   Sample Question Papers Practiced  10000 non-null  int64  
 4   Performance Index                 10000 non-null  float64
 5   Extracurricular Activities_Yes    10000 non-null  bool   
dtypes: bool(1), float64(1), int64(4)
memory usage: 400.5 KB
None


In [80]:
y = df_encoded['Performance Index']
X = df_encoded.drop('Performance Index', axis=1)

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 9254 to 7270
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours Studied                     8000 non-null   int64
 1   Previous Scores                   8000 non-null   int64
 2   Sleep Hours                       8000 non-null   int64
 3   Sample Question Papers Practiced  8000 non-null   int64
 4   Extracurricular Activities_Yes    8000 non-null   bool 
dtypes: bool(1), int64(4)
memory usage: 320.3 KB


In [84]:
model = My_MLR()
model.fit(X_train, y_train)
print("Weights:", model.weights)
print("Offset (Bias):", model.offset)

Weights: [2.85248393 1.0169882  0.47694148 0.19183144 0.60861668]
Offset (Bias): -33.92194621555667


In [85]:
y_pred = model.predict(X_test)
print("Predicted Price (lakhs):", y_pred)

Predicted Price (lakhs): [54.71185391748972 22.615512943389206 47.90314471125858 ...
 16.793419553617163 63.34327368258094 45.942623009994435]


In [86]:
from sklearn.metrics import r2_score

In [87]:
y_test_pred = model.predict(X_test)
print("R² Score (test):", r2_score(y_test, y_test_pred))

R² Score (test): 0.9889832909573146


In [88]:
from sklearn.linear_model import LinearRegression


In [89]:
model = LinearRegression()
model.fit(X_train, y_train)

In [90]:
y_pred = model.predict(X_test)

In [91]:
from sklearn.metrics import r2_score, mean_squared_error

In [92]:
r2 = r2_score(y_test, y_pred)
n = X_train.shape[0]
p = X_train.shape[1]

In [93]:
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

R² Score: 0.9889832909573145
MSE: 4.082628398521856


In [94]:
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adj_r2)

0.9889764003462045


In [95]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [2.85248393 1.0169882  0.47694148 0.19183144 0.60861668]
Intercept: -33.921946215556325
