# In [None]:
# EDA_and_Modeling.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load data
# NOTE: the path is relative to the directory the notebook/script is run from.
df = pd.read_csv('../data/student_data.csv')
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell (and never in a plain script), so print the preview explicitly.
print(df.head())

# --- EDA ---
# Check missing values (count of nulls per column)
print(df.isnull().sum())

# Distribution of final grades
sns.histplot(df['final_grade'], bins=10, kde=True)
plt.show()

# Correlation heatmap
# Correlate numeric columns only: at this point the categorical columns have
# not been label-encoded yet (presumably still strings -- confirm against the
# CSV), and pandas >= 2.0 raises when DataFrame.corr() meets non-numeric data.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(8,6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.show()

# --- Encode categorical features ---
# One LabelEncoder per column, each kept under its own name so the fitted
# mapping remains available after encoding (e.g. for inverse_transform).
le_gender, le_family, le_extra = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Overwrite every categorical column in place with its integer codes.
for column, encoder in (
    ('gender', le_gender),
    ('family_support', le_family),
    ('extracurricular', le_extra),
):
    df[column] = encoder.fit_transform(df[column])

# Features and target
# Drop the identifier and the target; everything left is used as a feature.
X = df.drop(['student_id','final_grade'], axis=1)
y = df['final_grade']

# --- Train/Test Split ---
# Split BEFORE scaling so that the held-out rows never influence the
# preprocessing statistics (fitting the scaler on all rows leaks test-set
# information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
# Learn mean/std from the training rows only, then apply the same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any downstream use of the full scaled matrix; it is produced with
# the training-fitted scaler rather than one fitted on all rows.
X_scaled = scaler.transform(X)

# --- Train Models ---
# Three tree-ensemble regressors with the same number of estimators and a
# fixed seed, so the runs are comparable and reproducible.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Fit each model on the training split and report R2 / RMSE on the test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the square root of the MSE gives the same
    # RMSE on every version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print(f"{name} - R2: {r2:.4f}, RMSE: {rmse:.4f}")

# --- Feature Importance for Random Forest ---
# Reuse the forest that was fitted in the training loop and pair each feature
# column name with its importance score.
rf = models['Random Forest']
features, importances = X.columns, rf.feature_importances_

# One bar per feature; barplot draws on the current figure and returns its
# Axes, which is then used to set the title.
plt.figure(figsize=(8, 5))
ax = sns.barplot(x=features, y=importances)
ax.set_title("Feature Importance - Random Forest")
plt.show()