In [2]:
# Download the data
!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

--2024-11-05 06:01:17--  https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv [following]
--2024-11-05 06:01:18--  https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 391501 (382K) [text/plain]
Saving to: ‘jamb_exam_results.csv’


2024-11-05 06:01:18 (76.1 MB/s) - ‘jamb_exam_results.csv’ saved [391501/391501]



In [3]:
# Import library and load the dataset.
# The CSV is read by its relative name so the notebook runs from whichever
# directory holds the file (wget saves it into the current working directory),
# instead of depending on one machine's absolute /workspaces/... layout.
import pandas as pd
df = pd.read_csv('jamb_exam_results.csv')
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
# Data preparation: snake_case the headers, discard the row identifier,
# and replace every missing value with 0.
normalised_names = df.columns.str.lower().str.replace(' ', '_')
df = (
    df.set_axis(normalised_names, axis=1)
      .drop(columns=['student_id'])
      .fillna(0)
)

In [5]:
# Splitting data
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

target_column = 'jamb_score'

# 60% train; the remaining 40% is halved into validation and test (20% each).
train_df, holdout_df = train_test_split(df, test_size=0.4, random_state=1)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=1)

# Targets as plain arrays.
y_train = train_df[target_column].to_numpy()
y_val = val_df[target_column].to_numpy()
y_test = test_df[target_column].to_numpy()


def _feature_records(frame):
    """Return one dict per row of `frame`, with the target column removed."""
    return frame.drop(columns=[target_column]).to_dict(orient='records')


# The vectorizer is fitted on the training rows only and then reused as-is
# for the validation and test rows.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(_feature_records(train_df))
X_val = dv.transform(_feature_records(val_df))
X_test = dv.transform(_feature_records(test_df))

#### Question-1

In [6]:
from sklearn.tree import DecisionTreeRegressor

# With max_depth=1 the tree has a single split, stored at the root (node 0).
dt_regressor = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# Map the root node's feature index back to the DictVectorizer column name.
split_feature_index = dt_regressor.tree_.feature[0]
split_feature_name = dv.feature_names_[split_feature_index]

split_feature_name

'study_hours_per_week'

#### Question-2

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline forest: 10 trees, fixed seed, all available cores.
rf_regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation RMSE is the square root of the validation MSE.
y_val_pred = rf_regressor.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
rmse_val = np.sqrt(val_mse)

rmse_val

np.float64(43.157758977963624)

#### Question-3

In [10]:
n_estimators_range = range(10, 201, 10)
rmse_values = []

# One forest per ensemble size; record (size, validation RMSE rounded to 3 dp).
for n in n_estimators_range:
    rf_regressor = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    ).fit(X_train, y_train)
    y_val_pred = rf_regressor.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_values.append((n, round(rmse, 3)))

# Tabulate the sweep for display.
rmse_values_df = pd.DataFrame.from_records(
    rmse_values,
    columns=['n_estimators', 'RMSE'],
)
rmse_values_df

Unnamed: 0,n_estimators,RMSE
0,10,43.158
1,20,41.79
2,30,41.556
3,40,41.076
4,50,40.957
5,60,40.774
6,70,40.588
7,80,40.503
8,90,40.435
9,100,40.365


#### Question-4

In [12]:
# Tuning max_depth: for each depth, sweep the ensemble size and average the
# validation RMSE over the whole sweep.
depth_values = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)
results = []

for max_depth in depth_values:
    rmse_scores = []
    for n in n_estimators_range:
        rf_regressor = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)
        y_val_pred = rf_regressor.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)
    results.append((max_depth, round(mean_rmse, 3)))

results_df = pd.DataFrame(results, columns=['max_depth', 'mean_RMSE'])

# Row of the summary table with the lowest mean RMSE.
best_row_label = results_df['mean_RMSE'].idxmin()
best_max_depth = results_df.loc[best_row_label]

results_df, best_max_depth

(   max_depth  mean_RMSE
 0         10     40.138
 1         15     40.644
 2         20     40.610
 3         25     40.688,
 max_depth    10.000
 mean_RMSE    40.138
 Name: 0, dtype: float64)

#### Question-5

In [None]:
# Model setup: 10 trees, depth capped at 20, fixed seed.
rf_regressor = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Pair each vectorizer column with its importance and rank from high to low.
feature_importances = rf_regressor.feature_importances_
importance_table = pd.DataFrame({
    'feature': dv.feature_names_,
    'importance': feature_importances
})
feature_importance_df = importance_table.sort_values(by='importance', ascending=False)

# Result: the four highest-ranked features.
top_features = feature_importance_df.head(4)
top_features

Unnamed: 0,feature,importance
27,study_hours_per_week,0.254075
4,attendance_rate,0.152135
5,distance_to_school,0.135761
28,teacher_quality,0.081733


#### Question-6

In [15]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/95/a4/16490d38b4854a1ce4995f4088bcb701b5057f711e34c95cd6e29792cdde/xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Obtaining dependency information for nvidia-nccl-cu12 from https://files.pythonhosted.org/packages/ed/1f/6482380ec8dcec4894e7503490fc536d846b0d59694acad9cf99f27d0e7d/nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━

In [16]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [17]:
# Wrap the train / validation matrices together with their labels.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'validation')]

# Define parameters: the two configurations differ only in the learning rate.
xgb_params_03 = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}
xgb_params_01 = {**xgb_params_03, 'eta': 0.1}

# Training options shared by both runs.
train_kwargs = dict(
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=False,
)

# Train model with eta = 0.3
model_03 = xgb.train(xgb_params_03, dtrain, **train_kwargs)
y_val_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_val_pred_03))

# Train model with eta = 0.1
model_01 = xgb.train(xgb_params_01, dtrain, **train_kwargs)
y_val_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_val_pred_01))

# Compare RMSE scores
for line in (
    f"RMSE with eta=0.3: {rmse_03}",
    f"RMSE with eta=0.1: {rmse_01}",
):
    print(line)

RMSE with eta=0.3: 41.15979263175636
RMSE with eta=0.1: 40.257357110659946
