In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [54]:
df=pd.read_csv('Student_Performance.csv')

In [55]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [56]:
df.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [57]:
df.dtypes

Hours Studied                         int64
Previous Scores                       int64
Extracurricular Activities           object
Sleep Hours                           int64
Sample Question Papers Practiced      int64
Performance Index                   float64
dtype: object

In [58]:
df_new=df.drop(columns=['Extracurricular Activities'])

In [59]:
df_new.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,9,1,91.0
1,4,82,4,2,65.0
2,8,51,7,2,45.0
3,5,52,5,2,36.0
4,7,75,8,5,66.0


## MULTICOLLINEARITY

In [60]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data=pd.DataFrame()
vif_data['features']=df_new.columns
vif_data['VIF']= [variance_inflation_factor(df_new.values, i) for i in range(len(df_new.columns))]


In [61]:
vif_data

Unnamed: 0,features,VIF
0,Hours Studied,8.63983
1,Previous Scores,108.557223
2,Sleep Hours,12.843993
3,Sample Question Papers Practiced,3.405578
4,Performance Index,99.559191


In [83]:
# Finding pairs of highly correlated features
corr_matrix=df_new.corr()
high_corr_threshold = 0.75  
corr_pairs = corr_matrix.unstack()
strong_pairs = corr_pairs[(abs(corr_pairs) > high_corr_threshold) & (abs(corr_pairs) < 1)]

# Sorting the pairs to identify the most correlated ones
sorted_pairs = strong_pairs.sort_values(kind="quicksort", ascending=False)

sorted_pairs

Previous Scores    Performance Index    0.915189
Performance Index  Previous Scores      0.915189
dtype: float64

##### No 2 input column has a correlation with each other hence all are important for prediction

In [84]:
df_new.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,9,1,91.0
1,4,82,4,2,65.0
2,8,51,7,2,45.0
3,5,52,5,2,36.0
4,7,75,8,5,66.0


In [85]:
x=df_new.iloc[:,0:4]

In [86]:
y=df_new.iloc[:,-1]

In [87]:
x.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced
0,7,99,9,1
1,4,82,4,2
2,8,51,7,2
3,5,52,5,2
4,7,75,8,5


In [88]:
from sklearn.model_selection import train_test_split

In [89]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [90]:
from sklearn.linear_model import LinearRegression

In [91]:
lr=LinearRegression()

In [92]:
lr.fit(x_train,y_train)

In [93]:
y_pred=lr.predict(x_test)

### Measuring Model Performance

In [94]:
from sklearn.metrics import mean_absolute_error,r2_score

In [95]:
print('r2_Score:',r2_score(y_test,y_pred))

r2_Score: 0.9881684717869549


In [96]:
print('MAE:',mean_absolute_error(y_test,y_pred))

MAE: 1.6194828377896093


In [97]:
y_pred

array([61.85979441, 42.04801792, 39.6470306 , ..., 74.06418577,
       91.59427019, 53.15746123])

In [98]:
x_test

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced
4440,7,71,5,5
5779,3,64,4,1
216,9,42,8,6
5873,1,49,4,2
3410,7,73,4,0
...,...,...,...,...
1269,3,78,5,3
7914,4,66,6,9
9435,5,88,5,8
8400,9,94,7,3


In [99]:
y_test

4440    60.0
5779    43.0
216     41.0
5873    21.0
3410    61.0
        ... 
1269    57.0
7914    48.0
9435    76.0
8400    93.0
4307    55.0
Name: Performance Index, Length: 2000, dtype: float64

### From our results we can say that our model is able to explain 98% variance 