In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
df=pd.read_csv("StudentPerformanceFactors.csv")

In [3]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


#### Checking for Null Values

In [4]:
df.isnull().sum()

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [6]:
df["Teacher_Quality"].value_counts()

Teacher_Quality
Medium    3925
High      1947
Low        657
Name: count, dtype: int64

In [7]:
df["Distance_from_Home"].value_counts()

Distance_from_Home
Near        3884
Moderate    1998
Far          658
Name: count, dtype: int64

In [8]:
df["Parental_Education_Level"].value_counts()

Parental_Education_Level
High School     3223
College         1989
Postgraduate    1305
Name: count, dtype: int64

In [9]:
df["Teacher_Quality"].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
6602    False
6603    False
6604    False
6605    False
6606    False
Name: Teacher_Quality, Length: 6607, dtype: bool

#### Null Value Handling

In [10]:
imputer=SimpleImputer(missing_values=np.NaN,strategy="most_frequent")
imputer.fit(df.iloc[:,11:12].values)
df.iloc[:,11:12]=imputer.transform(df.iloc[:,11:12].values)
print(df)

      Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0                23          84                  Low                High   
1                19          64                  Low              Medium   
2                24          98               Medium              Medium   
3                29          89                  Low              Medium   
4                19          92               Medium              Medium   
...             ...         ...                  ...                 ...   
6602             25          69                 High              Medium   
6603             23          76                 High              Medium   
6604             20          90               Medium                 Low   
6605             10          86                 High                High   
6606             15          67               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours  Previous_Scores  \
0                     

In [11]:
df.iloc[:,11:12]

Unnamed: 0,Teacher_Quality
0,Medium
1,Medium
2,Medium
3,Medium
4,High
...,...
6602,Medium
6603,High
6604,Medium
6605,Medium


In [12]:
df["Teacher_Quality"].isnull().sum()

0

In [13]:
imputer=SimpleImputer(missing_values=np.NaN,strategy="most_frequent")
imputer.fit(df.iloc[:,16:18].values)
df.iloc[:,16:18]=imputer.transform(df.iloc[:,16:18].values)
print(df)

      Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0                23          84                  Low                High   
1                19          64                  Low              Medium   
2                24          98               Medium              Medium   
3                29          89                  Low              Medium   
4                19          92               Medium              Medium   
...             ...         ...                  ...                 ...   
6602             25          69                 High              Medium   
6603             23          76                 High              Medium   
6604             20          90               Medium                 Low   
6605             10          86                 High                High   
6606             15          67               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours  Previous_Scores  \
0                     

In [14]:
df["Parental_Education_Level"].isnull().sum()

0

In [15]:
df["Distance_from_Home"].isnull().sum()

0

#### Encoding Categorical Labels

Parental_Involvement, Access_to_Resources, Extracurricular_Activities, Motivation_Level, Internet_Access, Family_Income, Teacher_Quality, School_Type, Peer_Influence, Learning_Disabilities, Parental_Education, Distance_from_Home, Gender

In [16]:
le=LabelEncoder()
df["Parental_Involvement"]=le.fit_transform(df["Parental_Involvement"])

In [17]:
df["Access_to_Resources"]=le.fit_transform(df["Access_to_Resources"])
df["Extracurricular_Activities"]=le.fit_transform(df["Extracurricular_Activities"])
df["Motivation_Level"]=le.fit_transform(df["Motivation_Level"])
df["Internet_Access"]=le.fit_transform(df["Internet_Access"])
df["Family_Income"]=le.fit_transform(df["Family_Income"])
df["Teacher_Quality"]=le.fit_transform(df["Teacher_Quality"])
df["School_Type"]=le.fit_transform(df["School_Type"])
df["Peer_Influence"]=le.fit_transform(df["Peer_Influence"])
df["Learning_Disabilities"]=le.fit_transform(df["Learning_Disabilities"])
df["Parental_Education_Level"]=le.fit_transform(df["Parental_Education_Level"])
df["Distance_from_Home"]=le.fit_transform(df["Distance_from_Home"])
df["Gender"]=le.fit_transform(df["Gender"])

In [18]:
df["Parental_Involvement"]

0       1
1       1
2       2
3       1
4       2
       ..
6602    0
6603    0
6604    2
6605    0
6606    2
Name: Parental_Involvement, Length: 6607, dtype: int64

In [19]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,1,0,0,7,73,1,1,0,1,2,1,2,3,0,1,2,1,67
1,19,64,1,2,0,8,59,1,1,2,2,2,1,0,4,0,0,1,0,61
2,24,98,2,2,1,7,91,2,1,2,2,2,1,1,4,0,2,2,1,74
3,29,89,1,2,1,8,98,2,1,1,2,2,1,0,4,0,1,1,1,71
4,19,92,2,2,1,6,65,2,1,3,2,0,1,1,4,0,0,2,0,70


In [20]:

X=df.drop("Exam_Score",axis=1)
y=df["Exam_Score"]
print(y)

0       67
1       61
2       74
3       71
4       70
        ..
6602    68
6603    69
6604    68
6605    68
6606    64
Name: Exam_Score, Length: 6607, dtype: int64


In [21]:
# X=df.drop("Exam_Score",axis=1)
# y=df["Exam_Score"]
# print(y)
#train,test=train_test_split(X,y,test_size=0.2, random_state=32,shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.