### 1. study_hours_per_week
### 2. 42.13
### 3. 80
### 4. 10
### 5. distance_to_school
### 6. 0.1

In [2]:
import pandas as pd
df = pd.read_csv('jamb_exam_results.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

df = df.drop(columns=['student_id'])
df = df.fillna(0)

X = df.drop(columns=['jamb_score'])
y = df['jamb_score']

# Разделение данных на train, validation и test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Преобразование в матрицы
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val = dv.transform(X_val.to_dict(orient='records'))
X_test = dv.transform(X_test.to_dict(orient='records'))

model = DecisionTreeRegressor(max_depth=1, random_state=1)
model.fit(X_train, y_train)

feature_idx = model.tree_.feature[0]  # Индекс признака, используемого для разбиения
feature_name = dv.feature_names_[feature_idx]
print("Признак, используемый для разбиения данных:", feature_name)

Признак, используемый для разбиения данных: study_hours_per_week


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

y_val_pred = rf_model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE на валидационных данных:", round(rmse_val, 3))

RMSE на валидационных данных: 43.158


In [15]:
# Вопрос 3: После какого значения n_estimators RMSE перестает улучшаться?

rmse_values = []
n_estimators_range = range(10, 201, 10)

# Перебор различных значений n_estimators и расчет RMSE для каждого
for n in n_estimators_range:
    rf_model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_val_pred = rf_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_values.append((n, rmse))
    print(f"n_estimators={n}, RMSE={round(rmse, 3)}")

# Ищем когда RMSE перестает улучшаться
previous_rmse_rounded = round(rmse_values[0][1], 3)
threshold = 0.01  # Порог значимости изменений

for n, rmse in rmse_values[1:]:
    current_rmse_rounded = round(rmse, 3)
    if abs(current_rmse_rounded - previous_rmse_rounded) < threshold:
        print(f"RMSE перестает улучшаться при n_estimators={n}")
        break
    previous_rmse_rounded = current_rmse_rounded


# Вопрос 4: Выбор лучшего max_depth

max_depth_values = [10, 15, 20, 25]
best_rmse = float('inf')
best_max_depth = None

# Перебор max_depth и n_estimators для расчета среднего RMSE
for max_depth in max_depth_values:
    rmse_for_max_depth = []
    
    for n in n_estimators_range:
        rf_model = RandomForestRegressor(n_estimators=n, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf_model.fit(X_train, y_train)
        y_val_pred = rf_model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_for_max_depth.append(rmse)
    
    avg_rmse = np.mean(rmse_for_max_depth)
    print(f"max_depth={max_depth}, среднее RMSE={round(avg_rmse, 3)}")
    
    if avg_rmse < best_rmse:
        best_rmse = avg_rmse
        best_max_depth = max_depth

print(f"Лучшее значение max_depth: {best_max_depth}")

n_estimators=10, RMSE=43.158
n_estimators=20, RMSE=41.79
n_estimators=30, RMSE=41.556
n_estimators=40, RMSE=41.076
n_estimators=50, RMSE=40.957
n_estimators=60, RMSE=40.774
n_estimators=70, RMSE=40.588
n_estimators=80, RMSE=40.503
n_estimators=90, RMSE=40.435
n_estimators=100, RMSE=40.365
n_estimators=110, RMSE=40.348
n_estimators=120, RMSE=40.302
n_estimators=130, RMSE=40.286
n_estimators=140, RMSE=40.263
n_estimators=150, RMSE=40.254
n_estimators=160, RMSE=40.2
n_estimators=170, RMSE=40.187
n_estimators=180, RMSE=40.136
n_estimators=190, RMSE=40.152
n_estimators=200, RMSE=40.138
RMSE перестает улучшаться при n_estimators=150
max_depth=10, среднее RMSE=40.138
max_depth=15, среднее RMSE=40.644
max_depth=20, среднее RMSE=40.61
max_depth=25, среднее RMSE=40.688
Лучшее значение max_depth: 10


In [20]:
# 5 вопрос

rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Получение важности признаков
importances = rf_model.feature_importances_

# Создание словаря с признаками и их важностью
features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
feature_importance_dict = dict(zip(features, importances))

for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {round(importance, 3)}")

# Найдём самый важный признак
most_important_feature = max(feature_importance_dict, key=feature_importance_dict.get)
print(f"Самый важный признак: {most_important_feature}")

study_hours_per_week: 0.01
attendance_rate: 0.009
distance_to_school: 0.064
teacher_quality: 0.034
Самый важный признак: distance_to_school


In [21]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Создание watchlist для отслеживания прогресса модели
watchlist = [(dtrain, 'train'), (dval, 'eval')]

xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Обучение модели с eta=0.3
xgb_params['eta'] = 0.3
model_0_3 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)
# Прогнозирование на валидационном наборе данных для eta=0.3
y_val_pred_0_3 = model_0_3.predict(dval)
rmse_0_3 = np.sqrt(mean_squared_error(y_val, y_val_pred_0_3))
print(f"RMSE для eta=0.3: {rmse_0_3}")

# Обучение модели с eta=0.1
xgb_params['eta'] = 0.1
model_0_1 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)
# Прогнозирование на валидационном наборе данных для eta=0.1
y_val_pred_0_1 = model_0_1.predict(dval)
rmse_0_1 = np.sqrt(mean_squared_error(y_val, y_val_pred_0_1))
print(f"RMSE для eta=0.1: {rmse_0_1}")

# Сравнение RMSE
if rmse_0_3 < rmse_0_1:
    print("Лучшее значение RMSE с eta=0.3")
elif rmse_0_3 > rmse_0_1:
    print("Лучшее значение RMSE с eta=0.1")
else:
    print("Both give equal value")

[0]	train-rmse:42.84835	eval-rmse:44.52338
[1]	train-rmse:39.96423	eval-rmse:42.83406
[2]	train-rmse:37.91231	eval-rmse:41.62607
[3]	train-rmse:36.51126	eval-rmse:41.25491
[4]	train-rmse:35.52212	eval-rmse:40.84075
[5]	train-rmse:34.77126	eval-rmse:40.71677
[6]	train-rmse:34.03898	eval-rmse:40.72669
[7]	train-rmse:33.62820	eval-rmse:40.68822
[8]	train-rmse:32.94729	eval-rmse:40.81273
[9]	train-rmse:32.27703	eval-rmse:40.84939
[10]	train-rmse:31.73818	eval-rmse:40.83759
[11]	train-rmse:31.31360	eval-rmse:40.80575
[12]	train-rmse:30.72949	eval-rmse:40.84238
[13]	train-rmse:30.11486	eval-rmse:40.96020
[14]	train-rmse:29.43538	eval-rmse:40.98775
[15]	train-rmse:29.23018	eval-rmse:41.04798
[16]	train-rmse:28.64113	eval-rmse:41.08375
RMSE для eta=0.3: 41.15979263175636
[0]	train-rmse:45.64414	eval-rmse:46.63724
[1]	train-rmse:44.26862	eval-rmse:45.58724
[2]	train-rmse:43.08569	eval-rmse:44.76209
[3]	train-rmse:42.05227	eval-rmse:44.02498
[4]	train-rmse:41.10533	eval-rmse:43.40640
[5]	train-r