Dataset source (Kaggle): https://www.kaggle.com/datasets/mdismielhossenabir/psychosocial-dimensions-of-student-life

In [16]:
# the usual imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [17]:
# Load the survey data into the working DataFrame `df`.
# The relative path means the CSV must sit in the notebook's working directory
# (presumably the file downloaded from the Kaggle dataset linked at the top -- confirm).
df = pd.read_csv("med_student.csv")

In [18]:
# Quick look at the first rows to check the column names and how the answers are written
df.head()

Unnamed: 0,Age,Gender,AcademicPerformance,TakingNoteInClass,DepressionStatus,FaceChallangesToCompleteAcademicTask,LikePresentation,SleepPerDayHours,NumberOfFriend,LikeNewThings
0,23,Male,Average,No,Sometimes,Yes,Yes,12,,Yes
1,23,Male,Excellent,Sometimes,Yes,No,Yes,8,80.0,Yes
2,24,Male,Average,No,Sometimes,Sometimes,No,8,10.0,Yes
3,20,Female,Good,Yes,Sometimes,Yes,No,5,15.0,Yes
4,24,Female,Average,Yes,Yes,Yes,Yes,5,2.0,Yes


In [19]:
# Summary statistics for the numeric columns.
# The dataset is quite small (99 rows in the recorded run), which is good for seeing a clear classification.
df.describe()

Unnamed: 0,Age,SleepPerDayHours,NumberOfFriend
count,99.0,99.0,95.0
mean,22.515152,6.717172,16.189474
std,1.560767,1.738169,25.397811
min,20.0,4.0,0.0
25%,21.0,5.0,3.0
50%,23.0,7.0,6.0
75%,24.0,8.0,15.0
max,25.0,12.0,100.0


In [20]:
# Count the rows that are exact copies of an earlier row.
# This is bad: in the recorded run 67 of the 99 rows are duplicates (about two thirds of the dataset).
df.duplicated().sum()

67

In [21]:
# Missing values per column: in the recorded run the only NaNs are 4 entries in NumberOfFriend
df.isna().sum()

Age                                     0
Gender                                  0
AcademicPerformance                     0
TakingNoteInClass                       0
DepressionStatus                        0
FaceChallangesToCompleteAcademicTask    0
LikePresentation                        0
SleepPerDayHours                        0
NumberOfFriend                          4
LikeNewThings                           0
dtype: int64

In [22]:
# Boolean mask: True for each row that exactly repeats an earlier row
duplicates = df.duplicated()

# Derive the count from the same mask instead of scanning the frame a second time
num_duplicates = duplicates.sum()

# Show the repeated rows, then how many there are
print(df.loc[duplicates])
print("Number of duplicate rows:", num_duplicates)

# The duplicates are probably there because the answers are quite general
# and can legitimately repeat for many students.
# Rows with NaN values will be dropped, but the duplicates are kept.

    Age   Gender AcademicPerformance TakingNoteInClass DepressionStatus  \
32    23    Male             Average                No        Sometimes   
33    23    Male           Excellent         Sometimes              Yes   
34    24    Male             Average                No        Sometimes   
35    20  Female                Good               Yes        Sometimes   
36    24  Female             Average               Yes              Yes   
..   ...     ...                 ...               ...              ...   
94    21    Male                Good               Yes        Sometimes   
95    21  Female                Good               Yes              Yes   
96    25    Male                Good               Yes        Sometimes   
97    21    Male                Good               Yes               No   
98    22  Female             Average               Yes               No   

   FaceChallangesToCompleteAcademicTask LikePresentation  SleepPerDayHours  \
32                   

In [23]:
# Drop every row that contains a NaN (in the recorded run only NumberOfFriend has any).
# .copy() turns the filtered result into an independent DataFrame, so the
# `df[column] = ...` assignments in the encoding cells that follow do not
# trigger pandas' SettingWithCopyWarning.
df = df.dropna().copy()

In [25]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Learn the categories of Gender and overwrite the text column with their integer codes.
# NOTE: not idempotent for display purposes -- re-running this cell on the already
# encoded column prints an identity mapping instead of the original labels.
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
Female: 0
Male: 1


In [26]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Replace the AcademicPerformance labels with integer codes.
# NOTE(review): the codes follow the sorted label order (recorded run: Average=0,
# Below average=1, Excellent=2, Good=3), so they do NOT rank performance from low to high.
df['AcademicPerformance'] = label_encoder.fit_transform(df['AcademicPerformance'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
Average: 0
Below average: 1
Excellent: 2
Good: 3


In [27]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Replace the TakingNoteInClass answers with integer codes.
# NOTE: re-running this cell on the already encoded column prints an identity
# mapping instead of the original labels.
df['TakingNoteInClass'] = label_encoder.fit_transform(df['TakingNoteInClass'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
No: 0
Sometimes: 1
Yes: 2


In [28]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Replace the DepressionStatus answers with integer codes.
# NOTE: re-running this cell on the already encoded column prints an identity
# mapping instead of the original labels.
df['DepressionStatus'] = label_encoder.fit_transform(df['DepressionStatus'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
No: 0
Sometimes: 1
Yes: 2


In [29]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Replace the FaceChallangesToCompleteAcademicTask answers with integer codes.
# NOTE: re-running this cell on the already encoded column prints an identity
# mapping instead of the original labels.
df['FaceChallangesToCompleteAcademicTask'] = label_encoder.fit_transform(df['FaceChallangesToCompleteAcademicTask'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
No: 0
Sometimes: 1
Yes: 2


In [30]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Replace the LikePresentation answers with integer codes.
# NOTE: re-running this cell on the already encoded column prints an identity
# mapping instead of the original labels.
df['LikePresentation'] = label_encoder.fit_transform(df['LikePresentation'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
No: 0
Yes: 1


In [32]:
# Fresh encoder for this column (the name `label_encoder` is rebound in every encoding cell)
label_encoder = LabelEncoder()

# Replace the LikeNewThings answers with integer codes.
# NOTE(review): the recorded output of this cell is the identity mapping "0: 0 / 1: 1",
# which means it was last executed on an already encoded column. Restart the kernel and
# run all cells to see the real label -> code mapping (presumably No: 0, Yes: 1 -- confirm).
df['LikeNewThings'] = label_encoder.fit_transform(df['LikeNewThings'])

# Print label -> code; these numbers are needed for the tester row and for the GUI.
# A class's code is its position in `classes_`, so enumerate() yields the same pairs
# as transforming `classes_` a second time.
print("Encoded values:")
for encoded_value, category in enumerate(label_encoder.classes_):
    print(f"{category}: {encoded_value}")

Encoded values:
0: 0
1: 1
