In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Step 1: Preparing the dataset
# Location of the raw JAMB exam results CSV
url = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'

# Load the CSV and clean it in a single chain:
#   - column names become lowercase with underscores instead of spaces
#   - the 'student_id' column is removed
#   - every missing value is replaced by 0
df = (
    pd.read_csv(url)
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
    .fillna(0)
)

In [3]:
# Step 2: Train/Validation/Test Split (60%/20%/20%)
# Hold out 40% of the rows first, then cut that hold-out in half for validation and test.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)

# Target arrays ('jamb_score') for each of the three parts
y_train, y_val, y_test = (
    part['jamb_score'].values for part in (df_train, df_val, df_test)
)

# Feature frames: the same three parts with the target column removed
df_train, df_val, df_test = (
    part.drop(columns=['jamb_score']) for part in (df_train, df_val, df_test)
)

Question 1: Which feature is used for splitting the data?

In [5]:
# Step 3: Vectorize using DictVectorizer
# Each frame is turned into a list of per-row dicts; the vectorizer is fitted on train only.
train_records = df_train.to_dict(orient='records')
val_records = df_val.to_dict(orient='records')
test_records = df_test.to_dict(orient='records')

dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_records)
X_val = dv.transform(val_records)
X_test = dv.transform(test_records)

# Step 4: Train Decision Tree Regressor with max_depth=1
# fit() returns the estimator itself, so construction and fitting can be chained.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# Node 0 is the root of the fitted tree; look up the name of the feature it splits on.
root_feature_index = dt.tree_.feature[0]
split_feature = dv.feature_names_[root_feature_index]
split_feature

'study_hours_per_week'

Question 2: What's the RMSE of this model on the validation data?

In [7]:
# RandomForestRegressor, mean_squared_error and numpy (np) are imported in the
# notebook's first cell together with the other dependencies, so this cell holds
# only the modelling step.

# Train a random forest with 10 trees; random_state fixes the seed and
# n_jobs=-1 uses all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred_val = rf.predict(X_val)

# RMSE on the validation set: square root of the mean squared error
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
rmse_val

43.157758977963624

Question 3: After which value of n_estimators does RMSE stop improving?

In [9]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# It is an earlier draft of the n_estimators sweep performed in the next cell,
# with one extra step that maps the detected plateau to the nearest value in a
# fixed list of answer options.

# # range of n_estimators to test, using step of 5 to capture the provided options exactly
# n_estimators_values = range(10, 201, 5)
# rmse_values = []

# # RandomForestRegressor for each n_estimators value and calculate RMSE on validation set
# for n in n_estimators_values:
#     rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
#     rf.fit(X_train, y_train)
#     y_pred_val = rf.predict(X_val)
#     rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
#     rmse_values.append(round(rmse, 3))  # Round RMSE to 3 decimal places for consistency

# # find the closest value among the answer options where RMSE stops improving significantly
# options = [10, 25, 80, 200]

# # locate the stabilization point based on the options
# best_option = None
# for i in range(1, len(rmse_values)):
#     # Check if the RMSE has stopped improving significantly (difference below threshold)
#     if abs(rmse_values[i] - rmse_values[i - 1]) < 0.001:
#         # Find the closest n_estimators from the options list
#         closest_option = min(options, key=lambda x: abs(x - n_estimators_values[i]))
#         best_option = closest_option
#         break

# rmse_values, best_option

In [10]:
# Forest sizes to evaluate: 10, 15, ..., 200
n_estimators_values = range(10, 201, 5)

# Two consecutive forest sizes whose validation RMSE differs by less than this
# amount are treated as "no longer changing".
RMSE_TOLERANCE = 0.001

# Unrounded validation RMSE for every forest size
rmse_raw = []
for n in n_estimators_values:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred_val = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_raw.append(rmse)

# Rounded copy used for display only. The plateau check below deliberately works
# on the unrounded values: comparing numbers that were already rounded to three
# decimals against a 0.001 tolerance makes the outcome depend on floating-point
# representation error (a 0.001 step can evaluate to slightly less than 0.001).
rmse_values = [round(value, 3) for value in rmse_raw]

# First forest size whose RMSE is within the tolerance of the previous size's RMSE;
# stays None when no such pair of consecutive sizes exists.
stabilization_point = None
for i in range(1, len(rmse_raw)):
    if abs(rmse_raw[i] - rmse_raw[i - 1]) < RMSE_TOLERANCE:
        stabilization_point = n_estimators_values[i]
        break

rmse_values, stabilization_point


([43.158,
  42.168,
  41.79,
  41.721,
  41.556,
  41.33,
  41.076,
  40.927,
  40.957,
  40.892,
  40.774,
  40.665,
  40.588,
  40.555,
  40.503,
  40.475,
  40.435,
  40.422,
  40.365,
  40.351,
  40.348,
  40.329,
  40.302,
  40.275,
  40.286,
  40.278,
  40.263,
  40.242,
  40.254,
  40.216,
  40.2,
  40.196,
  40.187,
  40.147,
  40.136,
  40.136,
  40.152,
  40.166,
  40.138],
 185)

Answer: 200 is the closest option.

Question 4: What's the best max_depth, using the mean RMSE?

In [13]:
# Grid over tree depth and forest size: every max_depth is scored by the mean of
# its validation RMSEs across all tested n_estimators values.

max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)
mean_rmse_results = {}

for max_depth in max_depth_values:
    # validation RMSE for each forest size at the current depth
    rmse_list = []

    for n in n_estimators_values:
        # fit() returns the estimator, so the model can be built and trained in one statement
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)
        y_pred_val = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        rmse_list.append(rmse)

    mean_rmse_results[max_depth] = np.mean(rmse_list)

# depth whose mean RMSE is the smallest (the first one wins on ties)
best_max_depth = min(mean_rmse_results.items(), key=lambda item: item[1])[0]
mean_rmse_results, best_max_depth

({10: 40.13800364122155,
  15: 40.64396557212339,
  20: 40.60981968462706,
  25: 40.687581741749185},
 10)

Answer: the best max_depth, based on the mean RMSE, is 10.

In [15]:
# Random forest with 10 trees and depth capped at 20, fitted on the training matrix
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

feature_importances = rf.feature_importances_

# Map each vectorizer feature name to its importance for easier interpretation
feature_importance_dict = {
    name: score for name, score in zip(dv.feature_names_, feature_importances)
}

# Keep only the candidate features that are present in the mapping,
# then pick the one with the largest importance.
selected_features = ["study_hours_per_week", "attendance_rate", "distance_to_school", "teacher_quality"]
important_features = {}
for feature in selected_features:
    if feature in feature_importance_dict:
        important_features[feature] = feature_importance_dict[feature]
most_important_feature = max(important_features, key=lambda name: important_features[name])

important_features, most_important_feature

({'study_hours_per_week': 0.25407520246568593,
  'attendance_rate': 0.15213485818850264,
  'distance_to_school': 0.13576097373953083,
  'teacher_quality': 0.08173314573636019},
 'study_hours_per_week')

Answer: study_hours_per_week

In [22]:
!pip install xgboost
import xgboost as xgb



In [24]:
# Wrap the train and validation matrices (with their targets) as DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Evaluation sets handed to xgb.train
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Booster parameters with eta = 0.3
xgb_params_03 = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Identical parameters except eta = 0.1
xgb_params_01 = {**xgb_params_03, 'eta': 0.1}


def fit_and_score(params):
    """Train 100 boosting rounds with `params` on dtrain.

    Returns a tuple (booster, validation predictions, validation RMSE).
    """
    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=100,
        evals=watchlist,
        verbose_eval=False,
    )
    predictions = booster.predict(dval)
    return booster, predictions, np.sqrt(mean_squared_error(y_val, predictions))


# eta = 0.3 first, then eta = 0.1
model_03, y_pred_val_03, rmse_03 = fit_and_score(xgb_params_03)
model_01, y_pred_val_01, rmse_01 = fit_and_score(xgb_params_01)

# compare RMSE values
rmse_03, rmse_01


(43.342905809394544, 40.8318750593964)