<a href="https://colab.research.google.com/github/erickxllx/Mid-Term-due-in-stages/blob/main/MidtermProjectCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
Data Cleaning & Preparation
Author: Peter Amoye, Erick Banegas, Alhassane Samassekou
Date: October 2025

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Show every column when a DataFrame is displayed, and use seaborn's
# white-grid look for all plots that follow.
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

# Ask for the dataset through the Colab upload widget.
print(">> Please upload your CSV file (e.g., student_info.csv)")
uploaded = files.upload()

# Keep only names ending in ".csv" (case-insensitive); the first match is used.
csv_candidates = [name for name in uploaded if name.lower().endswith(".csv")]
if not csv_candidates:
    # Raise explicitly instead of asserting: assert statements are removed
    # when Python runs with -O, which would let the script continue and fail
    # later with a less helpful IndexError.
    raise ValueError("No CSV file uploaded!")
csv_path = csv_candidates[0]

# The uploaded name is opened as a path, so this relies on the upload being
# available under that name in the working directory.
df = pd.read_csv(csv_path)
print("✅ CSV loaded successfully!")
print("Shape:", df.shape)
display(df.head())

# Quick structural overview of the loaded dataset.
print("=== Dataset Info ===")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()

# Number of missing cells per column.
print("\n=== Missing Values ===")
print(df.isna().sum())

# Descriptive statistics for every column, numeric or not.
print("\n=== Summary Statistics ===")
display(df.describe(include='all'))

def _tidy_text(value):
    """Return *value* stripped and title-cased if it is a string, else unchanged."""
    return value.strip().title() if isinstance(value, str) else value


# Normalise text first, one object column at a time.
# - Series.map is used instead of the deprecated DataFrame.applymap.
# - The isinstance check leaves missing values and any non-string cells
#   untouched; the .str accessor would have turned non-string cells into NaN.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].map(_tidy_text)

# De-duplicate after normalisation so rows that differed only by surrounding
# whitespace or letter case are recognised as duplicates too.
before = df.shape[0]
df.drop_duplicates(inplace=True)
print(f"Removed {before - df.shape[0]} duplicate rows.")

print("✅ Basic cleaning done.")

# Split the columns into numeric and everything else; each group gets its own
# imputation rule below.
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

# Numeric gaps are filled with the column mean. The result is assigned back to
# the frame: calling fillna(inplace=True) on df[col] operates on a temporary
# selection, which pandas warns about and which does not update df when
# Copy-on-Write is enabled.
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

# Non-numeric gaps are filled with the most frequent value.
for col in cat_cols:
    modes = df[col].mode()
    if modes.empty:
        # Every value in this column is missing, so there is no most-frequent
        # value to fill with; leave the column as it is instead of crashing.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

print("✅ Missing values handled.")

# One box per numeric column, drawn on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df.select_dtypes(include=[np.number]))
box_ax.set_title("Boxplot of Numerical Features")
plt.show()

# Derived features are only built when all four input columns are present.
score_cols = ['math_score', 'reading_score', 'writing_score']
needed_cols = set(score_cols) | {'study_hours'}
if not needed_cols.issubset(df.columns):
    print("⚠️ Required columns missing for feature engineering.")
else:
    # Row-wise mean of the three subject scores.
    df['avg_score'] = df[score_cols].mean(axis=1)
    # Average score per study hour; the 0.1 offset on the divisor is
    # presumably there to avoid dividing by zero when study_hours is 0.
    df['study_efficiency'] = df['avg_score'].div(df['study_hours'].add(0.1))
    print("✅ Feature engineering completed.")

# Bar chart of how often each target value occurs; skipped when the target
# column is not in the dataset.
target_col = 'final_result'
if target_col in df.columns:
    plt.figure(figsize=(5, 4))
    count_ax = sns.countplot(data=df, x=target_col)
    count_ax.set_title("Distribution of Final Results")
    plt.show()

# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(8, 6))
heat_ax = sns.heatmap(
    df.select_dtypes(include=[np.number]).corr(),
    annot=True,
    cmap='coolwarm',
)
heat_ax.set_title("Correlation Heatmap of Numeric Features")
plt.show()

# Write the cleaned frame to CSV; index=False keeps the row index out of the
# output file.
output_path = 'student_info_cleaned.csv'
df.to_csv(path_or_buf=output_path, index=False)
print(f"✅ Cleaned dataset saved as {output_path}.")
