In [8]:
# 1. Import all the required Python Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Dataset Description and Source
"""
Dataset: Student Performance Dataset
Source: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams
This dataset contains scores of students in math, reading, and writing exams along with demographic info.
"""

# 3. Load the Dataset into pandas dataframe
df = pd.read_csv('StudentsPerformance.csv')  # Adjust filename if needed
df.head()

Unnamed: 0,STUDENT ID,Student Age,Sex,Graduated high-school type,Scholarship type,Additional work,Regular artistic or sports activity,Do you have a partner,Total salary if available,Transportation to the university,...,Preparation to midterm exams 1,Preparation to midterm exams 2,Taking notes in classes,Listening in classes,Discussion improves my interest and success in the course,Flip-classroom,Cumulative grade point average in the last semester (/4.00),Expected Cumulative grade point average in the graduation (/4.00),COURSE ID,GRADE
0,STUDENT1,2,2,3,3,1,2,2,1,1,...,1,1,3,2,1,2,1,1,1,1
1,STUDENT2,2,2,3,3,1,2,2,1,1,...,1,1,3,2,3,2,2,3,1,1
2,STUDENT3,2,2,2,3,2,2,2,2,4,...,1,1,2,2,1,1,2,2,1,1
3,STUDENT4,1,1,1,3,1,2,1,2,1,...,1,2,3,2,2,1,3,2,1,1
4,STUDENT5,2,2,1,3,2,2,1,3,1,...,2,1,2,2,2,1,2,2,1,1


In [11]:
df.isnull().sum()

STUDENT ID                                                           0
Student Age                                                          0
Sex                                                                  0
Graduated high-school type                                           0
Scholarship type                                                     0
Additional work                                                      0
Regular artistic or sports activity                                  0
Do you have a partner                                                0
Total salary if available                                            0
Transportation to the university                                     0
Accommodation type in Cyprus                                         0
Mother’s education                                                   0
Father’s education                                                   0
Number of sisters/brothers                                           0
Parent

In [12]:
df.describe(include='all')

Unnamed: 0,STUDENT ID,Student Age,Sex,Graduated high-school type,Scholarship type,Additional work,Regular artistic or sports activity,Do you have a partner,Total salary if available,Transportation to the university,...,Preparation to midterm exams 1,Preparation to midterm exams 2,Taking notes in classes,Listening in classes,Discussion improves my interest and success in the course,Flip-classroom,Cumulative grade point average in the last semester (/4.00),Expected Cumulative grade point average in the graduation (/4.00),COURSE ID,GRADE
count,145,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,...,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0
unique,145,,,,,,,,,,...,,,,,,,,,,
top,STUDENT1,,,,,,,,,,...,,,,,,,,,,
freq,1,,,,,,,,,,...,,,,,,,,,,
mean,,1.62069,1.6,1.944828,3.572414,1.662069,1.6,1.57931,1.627586,1.62069,...,1.337931,1.165517,2.544828,2.055172,2.393103,1.806897,3.124138,2.724138,4.131034,3.227586
std,,0.613154,0.491596,0.537216,0.80575,0.474644,0.491596,0.495381,1.020245,1.061112,...,0.61487,0.408483,0.56494,0.674736,0.604343,0.810492,1.301083,0.916536,3.260145,2.197678
min,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0
50%,,2.0,2.0,2.0,3.0,2.0,2.0,2.0,1.0,1.0,...,1.0,1.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0
75%,,2.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,...,2.0,1.0,3.0,3.0,3.0,2.0,4.0,3.0,7.0,5.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 33 columns):
 #   Column                                                             Non-Null Count  Dtype 
---  ------                                                             --------------  ----- 
 0   STUDENT ID                                                         145 non-null    object
 1   Student Age                                                        145 non-null    int64 
 2   Sex                                                                145 non-null    int64 
 3   Graduated high-school type                                         145 non-null    int64 
 4   Scholarship type                                                   145 non-null    int64 
 5   Additional work                                                    145 non-null    int64 
 6   Regular artistic or sports activity                                145 non-null    int64 
 7   Do you have a partner              

In [14]:
df.shape

(145, 33)

In [17]:
# 5. Turn categorical variables into quantitative variables

# Display unique values in object (categorical) columns
print("\nUnique values in categorical columns:")
for col in df.select_dtypes(include='object').columns:
    print(f"{col}: {df[col].unique()}")

# Use Label Encoding or One-Hot Encoding
# We'll use one-hot encoding for better representation
df_encoded = pd.get_dummies(df, drop_first=True)  # Avoid dummy variable trap


Unique values in categorical columns:
STUDENT ID: ['STUDENT1' 'STUDENT2' 'STUDENT3' 'STUDENT4' 'STUDENT5' 'STUDENT6'
 'STUDENT7' 'STUDENT8' 'STUDENT9' 'STUDENT10' 'STUDENT11' 'STUDENT12'
 'STUDENT13' 'STUDENT14' 'STUDENT15' 'STUDENT16' 'STUDENT17' 'STUDENT18'
 'STUDENT19' 'STUDENT20' 'STUDENT21' 'STUDENT22' 'STUDENT23' 'STUDENT24'
 'STUDENT25' 'STUDENT26' 'STUDENT27' 'STUDENT28' 'STUDENT29' 'STUDENT30'
 'STUDENT31' 'STUDENT32' 'STUDENT33' 'STUDENT34' 'STUDENT35' 'STUDENT36'
 'STUDENT37' 'STUDENT38' 'STUDENT39' 'STUDENT40' 'STUDENT41' 'STUDENT42'
 'STUDENT43' 'STUDENT44' 'STUDENT45' 'STUDENT46' 'STUDENT47' 'STUDENT48'
 'STUDENT49' 'STUDENT50' 'STUDENT51' 'STUDENT52' 'STUDENT53' 'STUDENT54'
 'STUDENT55' 'STUDENT56' 'STUDENT57' 'STUDENT58' 'STUDENT59' 'STUDENT60'
 'STUDENT61' 'STUDENT62' 'STUDENT63' 'STUDENT64' 'STUDENT65' 'STUDENT66'
 'STUDENT67' 'STUDENT68' 'STUDENT69' 'STUDENT70' 'STUDENT71' 'STUDENT72'
 'STUDENT73' 'STUDENT74' 'STUDENT75' 'STUDENT76' 'STUDENT77' 'STUDENT78'
 'STUDENT

In [18]:
df_encoded.head()

Unnamed: 0,Student Age,Sex,Graduated high-school type,Scholarship type,Additional work,Regular artistic or sports activity,Do you have a partner,Total salary if available,Transportation to the university,Accommodation type in Cyprus,...,STUDENT ID_STUDENT90,STUDENT ID_STUDENT91,STUDENT ID_STUDENT92,STUDENT ID_STUDENT93,STUDENT ID_STUDENT94,STUDENT ID_STUDENT95,STUDENT ID_STUDENT96,STUDENT ID_STUDENT97,STUDENT ID_STUDENT98,STUDENT ID_STUDENT99
0,2,2,3,3,1,2,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,2,3,3,1,2,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,3,2,2,2,2,4,2,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,3,1,2,1,2,1,2,...,0,0,0,0,0,0,0,0,0,0
4,2,2,1,3,2,2,1,3,1,4,...,0,0,0,0,0,0,0,0,0,0
