In [12]:
import pandas as pd
import numpy as np

# Read the semicolon-delimited student dataset into a DataFrame
DATA_FILE = "student_data_2.csv"
df = pd.read_csv(DATA_FILE, sep=";")

# Skewness diagnosis: for every numeric column, compare the skewness of the
# full sample with the skewness after trimming both tails, and label whether
# the asymmetry appears to come from the tails or from the bulk of the data.

# Tunable settings (values unchanged from the original hard-coded numbers)
MIN_UNIQUE_VALUES = 10        # columns with fewer distinct values are skipped
LOWER_QUANTILE = 0.05         # lower trimming cut-off
UPPER_QUANTILE = 0.95         # upper trimming cut-off
SYMMETRY_THRESHOLD = 0.5      # |skew| below this counts as symmetric
OUTLIER_REDUCTION_THRESHOLD = 0.75  # |skew| drop above this => outlier-driven

REPORT_COLUMNS = [
    "feature",
    "original_skew",
    "trimmed_skew",
    "skew_reduction",
    "diagnosis",
]

results = []

for col in df.select_dtypes(include=np.number).columns:
    series = df[col].dropna()

    # Skip low-cardinality features (fewer than MIN_UNIQUE_VALUES distinct
    # non-missing values)
    if series.nunique() < MIN_UNIQUE_VALUES:
        continue

    # Original skewness
    original_skew = series.skew()

    # Trim both ends; values equal to a cut-off are kept (inclusive bounds)
    lower = series.quantile(LOWER_QUANTILE)
    upper = series.quantile(UPPER_QUANTILE)
    trimmed = series[(series >= lower) & (series <= upper)]

    trimmed_skew = trimmed.skew()

    # Change in skewness magnitude (positive = less skewed after trimming).
    # NOTE(review): only magnitudes are compared, so a sign reversal after
    # trimming (e.g. 0.77 -> -0.63) shows up as a small reduction and falls
    # through to "Truly skewed" -- confirm this is the intended reading.
    skew_change = abs(original_skew) - abs(trimmed_skew)

    # Diagnosis logic
    if abs(original_skew) < SYMMETRY_THRESHOLD:
        diagnosis = "Approximately symmetric"
    elif skew_change > OUTLIER_REDUCTION_THRESHOLD:
        diagnosis = "Outlier-driven skew"
    else:
        diagnosis = "Truly skewed"

    results.append({
        "feature": col,
        "original_skew": round(original_skew, 3),
        "trimmed_skew": round(trimmed_skew, 3),
        "skew_reduction": round(skew_change, 3),
        "diagnosis": diagnosis
    })

# Passing explicit columns keeps the report well-formed even when no column
# passed the filter above; without them an empty `results` list yields a
# column-less frame and sort_values(by="original_skew") raises KeyError.
skew_report = pd.DataFrame(results, columns=REPORT_COLUMNS)

# Most strongly skewed features (by absolute value) first
if not skew_report.empty:
    skew_report = skew_report.sort_values(
        by="original_skew",
        key=abs,
        ascending=False
    )

print(skew_report)


                               feature  original_skew  trimmed_skew  \
2                    Age at enrollment          2.055         1.693   
3  Curricular units 1st sem (enrolled)          1.619         1.570   
5  Curricular units 2nd sem (enrolled)          0.788         0.985   
4  Curricular units 1st sem (approved)          0.766        -0.627   
6  Curricular units 2nd sem (approved)          0.306        -0.421   
1                               Course         -0.190        -0.186   
0                     Application mode          0.122         0.118   

   skew_reduction                diagnosis  
2           0.362             Truly skewed  
3           0.049             Truly skewed  
5          -0.196             Truly skewed  
4           0.140             Truly skewed  
6          -0.115  Approximately symmetric  
1           0.004  Approximately symmetric  
0           0.004  Approximately symmetric  
