# 🧼 Data Cleaning for Student Spending Dataset

This Jupyter Notebook covers the data cleaning process for the student spending dataset. We inspect features with missing values, impute where necessary, and prepare the data for analysis or modeling.

In [None]:
# 📦 The Necessary Imports
import pandas as pd
import numpy as np
from IPython.display import display

In [None]:
# 📂 Loading the Dataset
# The path is relative to the notebook's working directory.
student_df = pd.read_csv("../data/student_spending.csv")

# Preview only the shape and the first rows; rendering the whole frame bloats the
# saved notebook without adding information.
print(f"Loaded {student_df.shape[0]} rows x {student_df.shape[1]} columns")
display(student_df.head())

In [None]:
# 🔍 A Deeper Dive into the Dataset
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the report.
student_df.info()

# Summary statistics for every column, numeric and non-numeric alike.
display(student_df.describe(include='all'))

### 📌 Observations and Conclusion

The dataset is mostly complete. We'll proceed by imputing the remaining missing values (the median for numeric features, the mode for categorical ones) and then converting the categorical features to numeric form.

In [None]:
# 🚑 Missing Value Imputations

# Split the frame by dtype. Iterating over either sub-frame yields its column names,
# which is how the cells below walk through the features.
categorical_features = student_df.select_dtypes(include=['object'])
numeric_features = student_df.select_dtypes(include=['number'])

# Names of the columns found to contain nulls; populated by the imputation cells.
null_numeric_features, null_categorical_features = [], []

In [None]:
# Imputing missing numeric values with the median.
# The skew direction is printed for context only; it does not change the strategy.
for column in numeric_features:
    series = student_df[column]
    if not series.isnull().any():
        continue

    # Guard against duplicates if this cell is executed more than once.
    if column not in null_numeric_features:
        null_numeric_features.append(column)

    median = series.median()
    if pd.isna(median):
        # No non-null value exists, so there is nothing to impute from.
        print(f"The feature {column} has no non-null values; skipping imputation")
        continue

    skewness = series.skew()
    if pd.isna(skewness):
        # A NaN skewness (e.g. too few non-null observations) fails both
        # comparisons below and would otherwise be reported as "symmetric".
        print(f"The feature {column} has undefined skewness")
    elif skewness > 0:
        print(f"The feature {column} is right skewed")
    elif skewness < 0:
        print(f"The feature {column} is left skewed")
    else:
        print(f"The feature {column} is symmetric")

    student_df[column] = series.fillna(median)

In [None]:
# Imputing missing categorical features with the most frequent value (mode)
for column in categorical_features:
    series = student_df[column]
    if not series.isnull().any():
        continue

    print(f"The feature {column} has null values!")
    # Guard against duplicates if this cell is executed more than once.
    if column not in null_categorical_features:
        null_categorical_features.append(column)

    modes = series.mode()
    if modes.empty:
        # mode() ignores nulls, so a column with no non-null values has no mode;
        # indexing [0] on the empty result would raise an opaque KeyError.
        print(f"The feature {column} has no non-null values; skipping imputation")
        continue

    # When several values tie, mode() returns all of them; the first one is used.
    student_df[column] = series.fillna(modes.iloc[0])

In [None]:
# Final null check
# Prints True when at least one missing value remains anywhere in the frame,
# and False once every cell has been filled.
print(student_df.isna().to_numpy().any())

### 🔁 Converting Categorical Features to Numerical

In [None]:
# Binary categorical features mapping
for column in categorical_features:
    if student_df[column].nunique() != 2:
        continue

    print(f"The feature {column} has two unique values!")

    # Sort the two labels before assigning codes so the 0/1 encoding does not depend
    # on which label happens to appear first in the data. Sorting by str keeps this
    # working if the cell is re-run on the already-encoded 0/1 values.
    first, second = sorted(student_df[column].dropna().unique(), key=str)
    mapping = {first: 0, second: 1}
    print(f"  mapping: {mapping}")

    student_df[column] = student_df[column].map(mapping).astype(int)

In [None]:
# Multi-class categorical features one-hot encoding
# Collect the columns to encode first. The membership check keeps the cell safe to
# re-run: columns that were already encoded are no longer present in student_df.
multi_class_columns = [
    column
    for column in categorical_features
    if column in student_df.columns and student_df[column].nunique() > 2
]

for column in multi_class_columns:
    print(f"The feature {column} has more than two unique values!")

if multi_class_columns:
    # A single get_dummies call drops the listed columns and appends their indicator
    # columns (prefixed with the source column name) after the remaining ones,
    # without in-place mutation or repeated concatenation.
    student_df = pd.get_dummies(student_df, columns=multi_class_columns, dtype=int)

In [None]:
# Final preview
# Show the first five rows of the frame after imputation and encoding.
display(student_df.head(n=5))