In [50]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("train.csv")

In [51]:
parental_education_order = {"High School": 0, "Bachelor’s": 1, "Master’s": 2}
df["parental_education"] = df["parental_education"].map(parental_education_order)

In [52]:
df["age_group"] = pd.cut(df["age"], bins=[14, 16, 18, 20], labels=["14-16", "17-18", "19-20"])
df = pd.get_dummies(df, columns=["age_group"], drop_first=True)

In [None]:
# One-hot encode nominal features
df = pd.get_dummies(df, columns=["gender", "school_type"], drop_first=True)

# Ordinal encode parental education
parental_education_order = {"High School": 0, "Bachelor’s": 1, "Master’s": 2}
df["parental_education"] = df["parental_education"].map(parental_education_order)

# Custom mean encoding for extracurricular
mean_extracurricular = df.groupby("extracurricular")["final_math_score"].mean()
df["extracurricular_encoded"] = df["extracurricular"].map(mean_extracurricular)

# Drop original categorical features if necessary
df.drop(columns=["extracurricular"], inplace=True)

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = df.drop(columns=["student_id", "final_math_score"])
y = df["final_math_score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)
    print(f"{name} RMSE: {rmse}")

Linear Regression RMSE: 27.914315507105975
Decision Tree RMSE: 41.9642625
Random Forest RMSE: 21.102173686250023


In [55]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost RMSE: {rmse_xgb}")

XGBoost RMSE: 22.37794471897152


In [56]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0)
    }
    model = XGBRegressor(**params, random_state=42)
    scores = cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=3)
    return -scores.mean()

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best Parameters: {best_params}")

[I 2025-04-06 22:11:53,107] A new study created in memory with name: no-name-317c32b9-8e3e-4709-94b7-13e9deda8b20
[I 2025-04-06 22:12:01,179] Trial 0 finished with value: 22.989058406893623 and parameters: {'n_estimators': 479, 'max_depth': 10, 'learning_rate': 0.0767375607737907, 'subsample': 0.8143966105919305}. Best is trial 0 with value: 22.989058406893623.
[I 2025-04-06 22:12:03,774] Trial 1 finished with value: 23.28143186648136 and parameters: {'n_estimators': 139, 'max_depth': 9, 'learning_rate': 0.03923627665850914, 'subsample': 0.9737315838453847}. Best is trial 0 with value: 22.989058406893623.
[I 2025-04-06 22:12:04,504] Trial 2 finished with value: 19.877924932485445 and parameters: {'n_estimators': 238, 'max_depth': 3, 'learning_rate': 0.06017304030021948, 'subsample': 0.8194992374915092}. Best is trial 2 with value: 19.877924932485445.
[I 2025-04-06 22:12:05,141] Trial 3 finished with value: 20.58692099267843 and parameters: {'n_estimators': 206, 'max_depth': 3, 'learnin

KeyboardInterrupt: 

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
    Dense(32, activation="relu"),
    Dense(1)
])

model.compile(optimizer="adam", loss="mse")
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
y_pred_nn = model.predict(X_test).flatten()
rmse_nn = mean_squared_error(y_test, y_pred_nn, squared=False)
print(f"Neural Network RMSE: {rmse_nn}")

ModuleNotFoundError: No module named 'tensorflow.keras'

In [42]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "learning_rate": [0.01, 0.1]
}

grid_search = GridSearchCV(XGBRegressor(random_state=42), param_grid, scoring="neg_mean_squared_error", cv=3)
grid_search.fit(X_train, y_train)
print(f"Best Params: {grid_search.best_params_}")

Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [44]:
from sklearn.ensemble import StackingRegressor

estimators = [
    ("rf", RandomForestRegressor(random_state=42)),
    ("xgb", XGBRegressor(random_state=42))
]

stacker = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
stacker.fit(X_train, y_train)
y_pred_stack = stacker.predict(X_test)
rmse_stack = mean_squared_error(y_test, y_pred_stack)
print(f"Stacking RMSE: {rmse_stack}")

Stacking RMSE: 20.99275602464448


In [None]:
residuals = y_test - y_pred_stack
plt.scatter(y_pred_stack, residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred_stack)
print(f"R² Score: {r2}")