In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Data preparation: load the exam results, normalise the column names
# (lower-case, spaces -> underscores), drop the identifier column and
# replace every missing value with 0.
df = (
    pd.read_csv('jamb_exam_results.csv')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
    .fillna(0)
)

# The exam score is the target; every remaining column is a feature.
y = df['jamb_score']
X = df.drop(columns=['jamb_score'])

print(df.head())

# Split the data into training (60%), validation (20%) and test (20%) sets:
# first hold out 40% of the rows, then cut that hold-out in half.
train_set, temp_set, train_y, temp_y = train_test_split(
    X, y, test_size=0.4, random_state=42
)
val_set, test_set, val_y, test_y = train_test_split(
    temp_set, temp_y, test_size=0.5, random_state=42
)

# Turn every row into a feature dict and vectorize it. The vectorizer is
# fitted on the training rows only and then reused unchanged for the
# validation and test rows.
converter = DictVectorizer(sparse=True)
train_set_converted = converter.fit_transform(train_set.to_dict(orient='records'))
val_set_converted, test_set_converted = (
    converter.transform(part.to_dict(orient='records'))
    for part in (val_set, test_set)
)

# Train a depth-1 decision tree and report which feature its root node
# splits on.
decision_tree = DecisionTreeRegressor(max_depth=1, random_state=42)
decision_tree.fit(train_set_converted, train_y)

# Node 0 is the root; its entry in `tree_.feature` is the column index of
# the split feature, which the vectorizer maps back to a feature name.
root_feature_index = decision_tree.tree_.feature[0]
first_split_attribute = converter.feature_names_[root_feature_index]
print()
print(f"Для разбиения данных используется признак: {first_split_attribute}")

# Train a baseline random forest (10 trees) and measure its RMSE on the
# validation set.
forest = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
    n_jobs=-1,
).fit(train_set_converted, train_y)

val_predict = forest.predict(val_set_converted)
val_mse = mean_squared_error(val_y, val_predict)
val_rmse = np.sqrt(val_mse)
print()
print(f"Значение RMSE на валидационных данных: {val_rmse}")

# Search for the best n_estimators value: validation RMSE for 10, 20, ..., 200
# trees.
#
# A single forest is grown incrementally with warm_start=True instead of
# training a brand-new forest for every size: each fit() call only adds the
# 10 missing trees, so 200 trees are fitted in total rather than
# 10 + 20 + ... + 200 = 2100.
# NOTE(review): with a fixed integer random_state scikit-learn is expected to
# produce the same trees as a from-scratch fit of the same size, so the
# reported RMSE values should be unchanged — confirm on one or two sizes.
rmse_data = {}
forest = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
    n_jobs=-1,
    warm_start=True,
)
for num_trees in range(10, 201, 10):
    # Raise the target size; fit() keeps the already-trained trees.
    forest.set_params(n_estimators=num_trees)
    forest.fit(train_set_converted, train_y)
    val_predict = forest.predict(val_set_converted)
    rmse_data[num_trees] = np.sqrt(mean_squared_error(val_y, val_predict))

for num_trees, rmse_score in rmse_data.items():
    print(f"n_estimators: {num_trees}, RMSE: {rmse_score:.3f}")

# Search for the best max_depth: for every candidate depth, average the
# validation RMSE over forests of 10, 20, ..., 200 trees.
depth_options = [10, 15, 20, 25]
depth_results = {}

for max_depth in depth_options:
    # One warm-started forest per depth: each fit() call below only adds the
    # 10 missing trees instead of retraining the whole ensemble, so 200 trees
    # are fitted per depth rather than 2100.
    # NOTE(review): with a fixed integer random_state the grown forest is
    # expected to match a from-scratch fit of the same size — confirm.
    forest = RandomForestRegressor(
        n_estimators=10,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1,
        warm_start=True,
    )
    rmse_scores = []
    for num_trees in range(10, 201, 10):
        forest.set_params(n_estimators=num_trees)
        forest.fit(train_set_converted, train_y)
        val_predict = forest.predict(val_set_converted)
        rmse_scores.append(np.sqrt(mean_squared_error(val_y, val_predict)))
    depth_results[max_depth] = np.mean(rmse_scores)

# Best max_depth: the depth with the lowest mean validation RMSE.
optimal_depth = min(depth_results, key=depth_results.get)
print(f"Лучшее значение: {optimal_depth}")



   jamb_score  study_hours_per_week  attendance_rate  teacher_quality  \
0         192                    22               78                4   
1         207                    14               88                4   
2         182                    29               87                2   
3         210                    29               99                2   
4         199                    12               98                3   

   distance_to_school school_type school_location extra_tutorials  \
0                12.4      Public           Urban             Yes   
1                 2.7      Public           Rural              No   
2                 9.6      Public           Rural             Yes   
3                 2.6      Public           Urban              No   
4                 8.8      Public           Urban              No   

  access_to_learning_materials parent_involvement it_knowledge  age  gender  \
0                          Yes               High       Medium   17

In [2]:
# Items 5 and 6
# Forest with 10 trees and max_depth=20; its feature importances are read
# further down in this cell.
best_forest = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
).fit(train_set_converted, train_y)

# Wrap the vectorized training and validation matrices, together with their
# targets, in XGBoost DMatrix objects.
boost_train = xgb.DMatrix(train_set_converted, label=train_y)
boost_val = xgb.DMatrix(val_set_converted, label=val_y)

# Both sets are evaluated after every boosting round.
evaluation = [
    (boost_train, 'train'),
    (boost_val, 'eval'),
]

boost_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 42,
    'verbosity': 1
}

# Settings shared by both runs: at most 100 boosting rounds, with early
# stopping after 10 rounds.
training_options = {
    'num_boost_round': 100,
    'evals': evaluation,
    'early_stopping_rounds': 10,
}

# First run with eta=0.3, then the same parameter dict is switched to
# eta=0.1 for the second run.
boost_model_03 = xgb.train(boost_params, boost_train, **training_options)
boost_params['eta'] = 0.1
boost_model_01 = xgb.train(boost_params, boost_train, **training_options)

print(f"Лучшее значение RMSE с eta=0.3: {boost_model_03.best_score}")
print(f"Лучшее значение RMSE с eta=0.1: {boost_model_01.best_score}")
# Report the feature with the highest importance in the fitted forest.
attribute_importances = best_forest.feature_importances_
top_attribute_index = attribute_importances.argmax()
top_attribute = converter.feature_names_[top_attribute_index]
print(f"Самый важный признак: {top_attribute}")

[0]	train-rmse:42.71579	eval-rmse:45.17452
[1]	train-rmse:39.82920	eval-rmse:43.33307
[2]	train-rmse:37.76334	eval-rmse:42.21338
[3]	train-rmse:36.28364	eval-rmse:41.85801
[4]	train-rmse:35.07326	eval-rmse:41.33436
[5]	train-rmse:34.19555	eval-rmse:41.08693
[6]	train-rmse:33.44294	eval-rmse:40.91599
[7]	train-rmse:32.67600	eval-rmse:40.88081
[8]	train-rmse:32.01974	eval-rmse:40.97827
[9]	train-rmse:31.53993	eval-rmse:40.88101
[10]	train-rmse:31.06525	eval-rmse:40.93403
[11]	train-rmse:30.73302	eval-rmse:40.99632
[12]	train-rmse:30.37063	eval-rmse:40.99476
[13]	train-rmse:29.88920	eval-rmse:41.01079
[14]	train-rmse:29.51041	eval-rmse:40.97999
[15]	train-rmse:29.18156	eval-rmse:41.04403
[16]	train-rmse:29.07860	eval-rmse:41.05910
[0]	train-rmse:45.50472	eval-rmse:47.29604
[1]	train-rmse:44.14512	eval-rmse:46.35598
[2]	train-rmse:42.95150	eval-rmse:45.45687
[3]	train-rmse:41.89408	eval-rmse:44.71852
[4]	train-rmse:40.93352	eval-rmse:44.07986
[5]	train-rmse:40.08378	eval-rmse:43.54894
[6]	