# STUDENT PERFORMANCE PREDICTION SYSTEM

Steps to be taken:
- import libraries
- import data
- data cleaning
- perform exploratory data analysis (EDA)
- encode the categorical data
- train the model
- test the model
- save the model as a file

### IMPORTING THE LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
# Default figure size (width, height) in inches for the plots in this notebook.
matplotlib.rcParams["figure.figsize"]=(20,10)

### IMPORTING THE DATA

In [3]:
# Load the student performance dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv("data/Students Performance.csv")

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Student_ID,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade
0,STUDENT1,19-22,Male,Other,50%,Yes,No,Private,0,Always,Yes,Yes,No,No,AA
1,STUDENT2,19-22,Male,Other,50%,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA
2,STUDENT3,19-22,Male,State,50%,No,No,Private,2,Never,No,No,No,Yes,AA
3,STUDENT4,18,Female,Private,50%,Yes,No,Bus,2,Always,No,Yes,No,No,AA
4,STUDENT5,19-22,Male,Private,50%,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA


In [5]:
# (number of rows, number of columns) of the loaded data.
df.shape

(145, 15)

In [6]:
## number of students in each Student_Age bracket
df.groupby('Student_Age')['Student_Age'].count()

Student_Age
18       65
19-22    70
23-27    10
Name: Student_Age, dtype: int64

In [8]:
## number of students of each Sex
df.groupby('Sex')['Sex'].count()

Sex
Female    58
Male      87
Name: Sex, dtype: int64

In [11]:
## checking null values: number of missing entries per column
df.isna().sum()

Student_ID            0
Student_Age           0
Sex                   0
High_School_Type      0
Scholarship           0
Additional_Work       0
Sports_activity       0
Transportation        0
Weekly_Study_Hours    0
Attendance            0
Reading               0
Notes                 0
Listening_in_Class    0
Project_work          0
Grade                 0
dtype: int64

In [17]:
## checking duplicates: one boolean per row, True where the row repeats an earlier row
# NOTE(review): the displayed Series is truncated, so it cannot show whether any
# duplicate exists; df.duplicated().sum() would report the count directly.
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
140    False
141    False
142    False
143    False
144    False
Length: 145, dtype: bool

In [18]:
## removing duplicates
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result has to be assigned back; otherwise df keeps its duplicate rows.
df = df.drop_duplicates()
df

Unnamed: 0,Student_ID,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade
0,STUDENT1,19-22,Male,Other,50%,Yes,No,Private,0,Always,Yes,Yes,No,No,AA
1,STUDENT2,19-22,Male,Other,50%,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA
2,STUDENT3,19-22,Male,State,50%,No,No,Private,2,Never,No,No,No,Yes,AA
3,STUDENT4,18,Female,Private,50%,Yes,No,Bus,2,Always,No,Yes,No,No,AA
4,STUDENT5,19-22,Male,Private,50%,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,STUDENT141,19-22,Female,State,50%,Yes,Yes,Private,0,Always,No,Yes,No,Yes,CC
141,STUDENT142,18,Female,State,75%,No,No,Private,0,Never,No,Yes,Yes,No,CC
142,STUDENT143,18,Female,Private,75%,No,No,Private,0,Always,Yes,No,No,No,AA
143,STUDENT144,19-22,Female,State,75%,Yes,Yes,Bus,12,Sometimes,No,Yes,No,Yes,CB


In [19]:
# Pairwise correlation between the numeric columns. numeric_only=True excludes
# the text columns explicitly: relying on them being dropped implicitly is
# deprecated and raises an error on pandas >= 2.0.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,Weekly_Study_Hours
Weekly_Study_Hours,1.0


In [29]:
## age encoding, setting 18 to '18 and below'

# def encode_age(Student_Age):
#     if Student_Age <= 18:
#         return '18 and below'
#     else:
#         return str(Student_Age)

In [31]:
# df['Age'] = df['Student_Age'].apply(encode_age)

In [32]:
## using ordinal encoding to encode the age

def age_encoding(Student_Age):
    """Translate an age-bracket label into its ordinal rank.

    Parameters
    ----------
    Student_Age : str
        Age bracket label. The recognised labels are '18', '19-22'
        and '23-27'.

    Returns
    -------
    int or None
        1, 2 or 3 for the recognised labels (in the order listed above);
        None for any other value.
    """
    ordinal_ranks = (('18', 1), ('19-22', 2), ('23-27', 3))
    for bracket, rank in ordinal_ranks:
        if Student_Age == bracket:
            return rank
    # Unrecognised label: signal "no encoding" to the caller.
    return None

In [33]:
# Derive the ordinal Age column by encoding every Student_Age label.
df['Age'] = df['Student_Age'].map(age_encoding)

In [47]:
# Preview the first rows to check the newly added Age column.
df.head()

Unnamed: 0,Student_ID,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade,Age
0,STUDENT1,19-22,Male,Other,50,Yes,No,Private,0,Always,Yes,Yes,No,No,AA,2
1,STUDENT2,19-22,Male,Other,50,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA,2
2,STUDENT3,19-22,Male,State,50,No,No,Private,2,Never,No,No,No,Yes,AA,2
3,STUDENT4,18,Female,Private,50,Yes,No,Bus,2,Always,No,Yes,No,No,AA,1
4,STUDENT5,19-22,Male,Private,50,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA,2


In [48]:
# Preview of the first rows (same inspection as the previous cell).
df.head()

Unnamed: 0,Student_ID,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade,Age
0,STUDENT1,19-22,Male,Other,50,Yes,No,Private,0,Always,Yes,Yes,No,No,AA,2
1,STUDENT2,19-22,Male,Other,50,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA,2
2,STUDENT3,19-22,Male,State,50,No,No,Private,2,Never,No,No,No,Yes,AA,2
3,STUDENT4,18,Female,Private,50,Yes,No,Bus,2,Always,No,Yes,No,No,AA,1
4,STUDENT5,19-22,Male,Private,50,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA,2


In [72]:
## remove the trailing % on scholarship, e.g. '50%' -> '50' (values stay strings)
# NOTE(review): .str methods yield NaN for non-string entries, so this cell
# presumably has to run before any cell that writes the integer 0 into the
# column -- confirm the intended execution order.
df['Scholarship'] = df['Scholarship'].str.rstrip('%')

In [53]:
# Inspect the dtype of every column.
df.dtypes

Student_ID            object
Student_Age           object
Sex                   object
High_School_Type      object
Scholarship           object
Additional_Work       object
Sports_activity       object
Transportation        object
Weekly_Study_Hours     int64
Attendance            object
Reading               object
Notes                 object
Listening_in_Class    object
Project_work          object
Grade                 object
Age                    int64
dtype: object

In [51]:
# dtype of the Scholarship column on its own.
df['Scholarship'].dtype

dtype('O')

In [55]:
# df['Scholarship'] = df['Scholarship'].fillna('0')
# df['Scholarship'] = df['Scholarship'].astype(int)

In [74]:
# Number of missing (NaN) entries in the Scholarship column.
df["Scholarship"].isna().sum()

1

In [64]:
# Two kinds of "missing" are tracked separately: genuine NaN entries and the
# literal string 'None'.
nan_values = df["Scholarship"].isna().sum()
none_values = df["Scholarship"].eq('None').sum()

In [65]:
# Report how many Scholarship entries are missing, broken down by kind.
# The "no missing values" message belongs to the outer condition: it must be
# printed only when both counters are zero, not whenever none_values is zero.
total_missing = nan_values + none_values
if nan_values > 0 or none_values > 0:
    print(f"There are {total_missing} missing values in the 'Scholarship' column.")
    if nan_values > 0:
        print(f" - {nan_values} NaN values")
    if none_values > 0:
        print(f" - {none_values} 'None' values")
else:
    print("No missing values found in the 'Scholarship' column.")

There are 1 missing values in the 'Scholarship' column.
 - 1 'None' values


In [66]:
# Treat the literal string 'None' as "no scholarship" (0). The result is
# assigned back to the column: replace(..., inplace=True) on df["Scholarship"]
# is chained assignment, which is not guaranteed to update df and never does
# under pandas Copy-on-Write.
df["Scholarship"] = df["Scholarship"].replace('None', 0)

In [75]:
# Replace the literal string 'NaN' with 0. This matches text only; genuine
# missing values (float NaN) are not affected and are handled by fillna.
# Assigned back instead of inplace=True, because an in-place call on a column
# selection is chained assignment and may not update df.
df["Scholarship"] = df["Scholarship"].replace('NaN', 0)

In [70]:
# Re-count the literal 'None' strings left in the column.
none_values = df["Scholarship"].eq('None').sum()

In [71]:
# Show the recomputed count of 'None' strings.
none_values

0

In [78]:
# Re-count the genuine NaN entries in Scholarship; string replacements do not
# touch them.
df["Scholarship"].isna().sum()

1

In [79]:
# Fill the remaining NaN entries with 0. Assigned back to the column rather
# than using inplace=True: an in-place call on df["Scholarship"] is chained
# assignment and is not guaranteed to modify df (it never does under pandas
# Copy-on-Write).
df["Scholarship"] = df["Scholarship"].fillna(0)

In [80]:
## now we change the data type for scholarship to integer
# The column holds digit strings (plus the 0 fill value) at this point, so the
# cast parses each entry as a whole number.
# NOTE(review): the resulting integer width looks platform-dependent (int32 in
# the recorded output) -- confirm if a fixed width matters.
df['Scholarship'] = df['Scholarship'].astype(int)

In [85]:
# Inspect the column dtypes again to verify the Scholarship cast.
df.dtypes

Student_ID            object
Student_Age           object
Sex                   object
High_School_Type      object
Scholarship            int32
Additional_Work       object
Sports_activity       object
Transportation        object
Weekly_Study_Hours     int64
Attendance            object
Reading               object
Notes                 object
Listening_in_Class    object
Project_work          object
Grade                 object
Age                    int64
dtype: object

In [86]:
# Correlation between the numeric columns, now including Scholarship and Age.
# numeric_only=True excludes the text columns explicitly; leaving them to be
# dropped implicitly is deprecated and raises an error on pandas >= 2.0.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,Scholarship,Weekly_Study_Hours,Age
Scholarship,1.0,-0.096293,-0.316506
Weekly_Study_Hours,-0.096293,1.0,0.139149
Age,-0.316506,0.139149,1.0


In [87]:
# Number of students per letter grade.
df.groupby('Grade')['Grade'].count()

Grade
AA      35
BA      24
BB      21
CB      10
CC      17
DC      13
DD      17
Fail     8
Name: Grade, dtype: int64

In [88]:
# Ordinal score for each letter grade, counting down from 'AA' = 8 to
# 'Fail' = 1 in the order the grades are listed.
grade_mapping = dict(
    zip(['AA', 'BA', 'BB', 'CB', 'CC', 'DC', 'DD', 'Fail'], range(8, 0, -1))
)

# Look up every Grade in the mapping; a grade absent from the mapping
# becomes NaN.
df['grade_encoded'] = df['Grade'].map(grade_mapping)

In [89]:
# Display the frame to check the new grade_encoded column.
df

Unnamed: 0,Student_ID,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade,Age,grade_encoded
0,STUDENT1,19-22,Male,Other,50,Yes,No,Private,0,Always,Yes,Yes,No,No,AA,2,8
1,STUDENT2,19-22,Male,Other,50,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA,2,8
2,STUDENT3,19-22,Male,State,50,No,No,Private,2,Never,No,No,No,Yes,AA,2,8
3,STUDENT4,18,Female,Private,50,Yes,No,Bus,2,Always,No,Yes,No,No,AA,1,8
4,STUDENT5,19-22,Male,Private,50,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,STUDENT141,19-22,Female,State,50,Yes,Yes,Private,0,Always,No,Yes,No,Yes,CC,2,4
141,STUDENT142,18,Female,State,75,No,No,Private,0,Never,No,Yes,Yes,No,CC,1,4
142,STUDENT143,18,Female,Private,75,No,No,Private,0,Always,Yes,No,No,No,AA,1,8
143,STUDENT144,19-22,Female,State,75,Yes,Yes,Bus,12,Sometimes,No,Yes,No,Yes,CB,2,5


In [90]:
# Correlation between the numeric columns, now including grade_encoded.
# numeric_only=True excludes the text columns explicitly; leaving them to be
# dropped implicitly is deprecated and raises an error on pandas >= 2.0.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,Scholarship,Weekly_Study_Hours,Age,grade_encoded
Scholarship,1.0,-0.096293,-0.316506,-0.035407
Weekly_Study_Hours,-0.096293,1.0,0.139149,0.068097
Age,-0.316506,0.139149,1.0,0.207246
grade_encoded,-0.035407,0.068097,0.207246,1.0


In [91]:
# Number of students per Attendance label (also exposes unexpected labels).
df.groupby('Attendance')['Attendance'].count()

Attendance
3             1
Always       98
Never        21
Sometimes    25
Name: Attendance, dtype: int64

In [96]:
# Remove the rows whose Attendance holds the stray value '3': select those
# rows, take their index labels, and drop the labels from df in place.
df.drop(index=df.loc[df['Attendance'] == '3'].index, inplace=True)

In [97]:
# Distinct Attendance labels still present in the data.
df['Attendance'].unique()

array(['Always', 'Never', 'Sometimes'], dtype=object)