In [50]:
from scipy import stats
import pandas as pd

###### Start of code #####

# Load the raw student habits / performance table; the path is resolved
# relative to the notebook's working directory.
student_df = pd.read_csv(filepath_or_buffer='student_habits_performance.csv')


In [51]:
# Dataset makeup: first rows, then the column schema / non-null counts.
print("Dataset")
print(student_df.head())

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() appends a stray "None".
print("\nDataset information")
student_df.info()

# Descriptive statistics (describe() summarises the numeric columns by default)
print("\nDescriptive Statistics")
print(student_df.describe())

# Duplicates? Lists every row that repeats an earlier row across all columns;
# an empty frame means there are no exact duplicates.
print("\nDuplicate rows:\n", student_df[student_df.duplicated()])

Dataset
  student_id  age  gender  study_hours_per_day  social_media_hours  \
0      S1000   23  Female                  0.0                 1.2   
1      S1001   20  Female                  6.9                 2.8   
2      S1002   21    Male                  1.4                 3.1   
3      S1003   23  Female                  1.0                 3.9   
4      S1004   19  Female                  5.0                 4.4   

   netflix_hours part_time_job  attendance_percentage  sleep_hours  \
0            1.1            No                   85.0          8.0   
1            2.3            No                   97.3          4.6   
2            1.3            No                   94.8          8.0   
3            1.0            No                   71.0          9.2   
4            0.5            No                   90.9          4.9   

  diet_quality  exercise_frequency parental_education_level internet_quality  \
0         Fair                   6                   Master          A

In [52]:
# Drop student_id (presumably a per-row identifier with no analytic value — confirm).
# Reassigning the result with errors='ignore' (instead of inplace=True) keeps
# this cell idempotent: re-running it once the column is already gone no
# longer raises KeyError.
student_df = student_df.drop(columns='student_id', errors='ignore')

In [53]:
# List the distinct labels found in every categorical column so the
# encodings chosen later can be checked against the actual data.
cat_columns = [
    'gender',
    'part_time_job',
    'diet_quality',
    'parental_education_level',
    'internet_quality',
    'extracurricular_participation',
]

# Column name -> list of its distinct values (NaN included when present)
unique_values = {}
for col in cat_columns:
    unique_values[col] = student_df[col].unique().tolist()

# Report one line per column, in the order given by cat_columns
for col, values in unique_values.items():
    print(f"Unique values in '{col}': {values}")

Unique values in 'gender': ['Female', 'Male', 'Other']
Unique values in 'part_time_job': ['No', 'Yes']
Unique values in 'diet_quality': ['Fair', 'Good', 'Poor']
Unique values in 'parental_education_level': ['Master', 'High School', 'Bachelor', nan]
Unique values in 'internet_quality': ['Average', 'Poor', 'Good']
Unique values in 'extracurricular_participation': ['Yes', 'No']


In [54]:
# Replace NaN with "No_Degree" so every row carries an explicit education label.
# NOTE(review): this treats a missing value as "no degree" — confirm that is
# what NaN means in the source file.
student_df['parental_education_level'] = student_df['parental_education_level'].fillna("No_Degree")

# Include dummy variables for columns containing binary values
# (drop_first=True keeps a single 0/1 indicator column per source column).
dummy_df1 = pd.get_dummies(student_df['part_time_job'], prefix='part_time', drop_first=True, dtype=int)
dummy_df2 = pd.get_dummies(student_df['extracurricular_participation'], prefix='extra_part', drop_first=True, dtype=int)

# Remove indicator columns left behind by an earlier run of this cell before
# concatenating; otherwise every re-run would append duplicate columns.
dummy_columns = [*dummy_df1.columns, *dummy_df2.columns]
student_df = pd.concat(
    [student_df.drop(columns=dummy_columns, errors='ignore'), dummy_df1, dummy_df2],
    axis=1,
)

# Create integer-coded companions for the multi-level categorical columns.
# Each entry is: source column -> (new column, label -> code).
# diet_quality, internet_quality and parental_education_level are coded in
# ascending order of the listed labels.
# NOTE(review): gender is nominal, not ordinal — its 0/1/2 codes are arbitrary
# and should not be read as a ranking; consider one-hot encoding for modelling.
label_encodings = {
    'gender': ('gender_num', {'Female': 0, 'Male': 1, 'Other': 2}),
    'diet_quality': ('diet_num', {'Poor': 0, 'Fair': 1, 'Good': 2}),
    'internet_quality': ('internet_num', {'Poor': 0, 'Average': 1, 'Good': 2}),
    'parental_education_level': (
        'parent_ed_num',
        {'No_Degree': 0, 'High School': 1, 'Bachelor': 2, 'Master': 3},
    ),
}

for source_col, (target_col, mapping) in label_encodings.items():
    # Series.map() silently yields NaN for labels absent from the mapping,
    # so fail loudly if the data contains a label that is not covered.
    unmapped = set(student_df[source_col].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Unmapped labels in '{source_col}': {sorted(map(str, unmapped))}"
        )
    student_df[target_col] = student_df[source_col].map(mapping)

In [55]:
# Check results: preview the frame after encoding and list its columns/dtypes.
print("Dataset")
print(student_df.head())

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(student_df.info()) would add a stray "None" line.
print("\nDataset information")
student_df.info()

Dataset
   age  gender  study_hours_per_day  social_media_hours  netflix_hours  \
0   23  Female                  0.0                 1.2            1.1   
1   20  Female                  6.9                 2.8            2.3   
2   21    Male                  1.4                 3.1            1.3   
3   23  Female                  1.0                 3.9            1.0   
4   19  Female                  5.0                 4.4            0.5   

  part_time_job  attendance_percentage  sleep_hours diet_quality  \
0            No                   85.0          8.0         Fair   
1            No                   97.3          4.6         Good   
2            No                   94.8          8.0         Poor   
3            No                   71.0          9.2         Poor   
4            No                   90.9          4.9         Fair   

   exercise_frequency  ... internet_quality mental_health_rating  \
0                   6  ...          Average                    8   
1 