In [None]:
import pandas as pd
import numpy as np
import os

# Read the raw dataset; the file is semicolon-separated.
# NOTE: machine-specific location -- update the file path before running elsewhere.
file_path = "C:/Users/berka/Downloads/dirtydata.csv"
df = pd.read_csv(file_path, delimiter=";")

# Normalise the headers: trim surrounding whitespace, lower-case everything,
# then turn spaces and hyphens into underscores so that columns can be
# referenced by one predictable spelling.
trimmed_lower = df.columns.str.strip().str.lower()
df.columns = trimmed_lower.str.replace(" ", "_").str.replace("-", "_")
print("Cleaned column names:", df.columns)

# The four columns this cleaning step operates on.
work_1c = [
    "study_hours_per_week",
    "sleep_hours_per_night",
    "exam_stress_level",
    "screen_time_hours_per_day",
]

# Fail early, naming every required column that the dataset lacks.
missing_columns = [name for name in work_1c if name not in df.columns]
if missing_columns:
    raise KeyError(f"Missing columns in the dataset: {missing_columns}")

# Work on an independent copy so the cleaning below never touches `df`.
work_2c = df.loc[:, work_1c].copy()

def _clean_bounded_column(series, lower, upper):
    """Convert a raw column to floats, impute gaps with the mode, and clamp.

    Args:
        series: Raw column; may hold numbers, numeric strings, NaN, or the
            literal placeholders 'na' / 'NA'.
        lower: Smallest value allowed in the result.
        upper: Largest value allowed in the result.

    Returns:
        A ``(cleaned, fill_value)`` tuple. ``cleaned`` is a float Series with
        missing entries replaced by ``fill_value``, every value limited to
        ``[lower, upper]`` and rounded to the nearest whole number.
        ``fill_value`` is the most frequent numeric value of the column, or
        NaN when the column holds no numeric value at all (in which case the
        missing entries stay NaN).

    Raises:
        ValueError: If the column contains text other than 'na' / 'NA' that
            cannot be converted to a float.
    """
    # Treat the textual placeholders exactly like cells that are already NaN,
    # so both kinds of gap are imputed the same way below.
    numeric = series.mask(series.isin(["na", "NA"])).astype(float)

    # The mode is taken on the numeric values (NaN is excluded by default), so
    # a placeholder can never win and '7' / '7.0' count as the same value.
    modes = numeric.mode()
    fill_value = modes.iloc[0] if not modes.empty else np.nan

    # Fill BEFORE clamping: a NaN pushed through max()/min() comparisons would
    # silently collapse to the lower bound instead of being imputed.
    cleaned = numeric.fillna(fill_value).clip(lower=lower, upper=upper).round()
    return cleaned, fill_value


# 1. Clean "study_hours_per_week" (limit values between 0 and 80)
work_2c['study_hours_per_week'], hours_study_mode = _clean_bounded_column(
    work_2c['study_hours_per_week'], 0, 80
)

# 2. Clean "sleep_hours_per_night" (limit sleep hours between 0 and 12)
work_2c['sleep_hours_per_night'], sleep_mode = _clean_bounded_column(
    work_2c['sleep_hours_per_night'], 0, 12
)

# 3. Clean "exam_stress_level" (limit stress level between 1 and 10)
work_2c['exam_stress_level'], stress_mode = _clean_bounded_column(
    work_2c['exam_stress_level'], 1, 10
)

# 4. Clean "screen_time_hours_per_day" (limit screen time between 0 and 16 hours)
work_2c['screen_time_hours_per_day'], screen_time_mode = _clean_bounded_column(
    work_2c['screen_time_hours_per_day'], 0, 16
)

# Rename columns to the human-readable headers used in the exported file
work_2c.rename(columns={
    "study_hours_per_week": "Weekly Study Hours",
    "sleep_hours_per_night": "Daily Sleep Hours",
    "exam_stress_level": "Exam Stress Level",
    "screen_time_hours_per_day": "Daily Screen Time Hours"
}, inplace=True)

# Save cleaned data
save_path = r'C:\Users\berka\Downloads\Project_University_Students'
# exist_ok=True creates the folder when needed and is a no-op otherwise,
# avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(save_path, exist_ok=True)
new_file_path = os.path.join(save_path, 'cleaned_university_students_data.csv')
work_2c.to_csv(new_file_path, index=False)

# Check cleaned data
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
work_2c.info()
print(work_2c.head())
print(f"Cleaned data saved to: {new_file_path}")
