In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Load the raw exam-results table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='jamb_exam_results.csv')

In [4]:
# Normalise column names: lower-case, spaces replaced with underscores.
df.columns = [column.lower().replace(' ', '_') for column in df.columns]

# Drop the identifier column when present, then fill every missing value with 0.
df = df.drop(columns='student_id', errors='ignore')
df = df.fillna(0)

# Target is `jamb_score`; every remaining column is a feature.
y = df['jamb_score']
X = df.drop(columns=['jamb_score'])

# Two-stage split: hold out 20% for test, then 25% of the remainder for
# validation (0.25 * 0.8 = 0.2), i.e. 60/20/20 train/val/test overall.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=1,
)

# Convert each split into a list of per-row dicts for the vectoriser.
train_dict = X_train.to_dict(orient='records')
val_dict = X_val.to_dict(orient='records')
test_dict = X_test.to_dict(orient='records')

# Fit the vectoriser on the training rows only, then reuse the fitted mapping
# for validation and test. From here on X_train / X_val / X_test hold the
# vectorised matrices instead of the DataFrames.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

# Column names of the vectorised matrices, in column order.
feature_names = dv.get_feature_names_out()

### 1 Задание

In [5]:
# Fit a depth-1 regression tree (at most one split) on the training matrix.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# tree_.feature stores, per node, the column index used for its split; node 0
# is the root. A node that does not split carries a negative sentinel there,
# and indexing feature_names with it would silently return an unrelated
# column, so fail loudly instead.
tree_feature_index = dt.tree_.feature[0]
assert tree_feature_index >= 0, "root node is a leaf: the tree did not split on any feature"

feature_used = feature_names[tree_feature_index]
print(f"Признак, используемый для разбиения: {feature_used}")

Признак, используемый для разбиения: study_hours_per_week


### 2 Задание

In [6]:
# Baseline forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation RMSE = square root of the mean squared error.
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE на валидационных данных: {rmse:.3f}")

RMSE на валидационных данных: 42.137


### 3 Задание

In [7]:
# Sweep the forest size from 10 to 200 trees (step 10) and record the
# validation RMSE for each size, in the same order as n_estimators_range.
n_estimators_range = range(10, 201, 10)
rmse_values = []

for n in n_estimators_range:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values.append(rmse)

# Best unrounded score and the forest size that produced it.
# np.argmin returns the first minimum, so ties resolve to the smallest n.
best_idx = int(np.argmin(rmse_values))
best_rmse = rmse_values[best_idx]
best_n = n_estimators_range[best_idx]

# RMSE "stops improving" at the smallest n that no later n beats. The scores
# are compared at 3 decimal places (the precision the 0.001 threshold implied).
# Looking at the whole curve, rather than the first pair of neighbours that
# fails to improve, keeps one noisy dip from ending the search early.
rmse_rounded = np.round(rmse_values, 3)
stopping_point = n_estimators_range[int(np.argmin(rmse_rounded))]

print(f"RMSE перестает улучшаться после n_estimators={stopping_point}")

RMSE перестает улучшаться после n_estimators=90


### 4 Задание

In [8]:
# For every candidate depth, train forests over the whole n_estimators_range
# and compare depths by their mean validation RMSE.
max_depth_values = [10, 15, 20, 25]
avg_rmse_by_depth = {}

for depth in max_depth_values:
    rmse_scores = []

    for n in n_estimators_range:
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1
        )
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    avg_rmse_by_depth[depth] = avg_rmse
    print(f"max_depth={depth}: средний RMSE={avg_rmse:.3f} (±{std_rmse:.3f})")

# min() returns the first minimum in insertion order, so ties go to the
# shallowest depth.
best_depth = min(avg_rmse_by_depth, key=avg_rmse_by_depth.get)
best_avg_rmse = avg_rmse_by_depth[best_depth]

print(f"Лучшее max_depth: {best_depth} со средним RMSE={best_avg_rmse:.3f}")

max_depth=10: средний RMSE=40.392 (±0.248)
max_depth=15: средний RMSE=40.735 (±0.380)
max_depth=20: средний RMSE=40.740 (±0.368)
max_depth=25: средний RMSE=40.788 (±0.388)
Лучшее max_depth: 10 со средним RMSE=40.392


### 5 Задание

In [10]:
# Train the forest used for the feature-importance question.
rf_final = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
)
rf_final.fit(X_train, y_train)

# Map every vectorised feature name to its importance in the fitted forest.
feature_importances = rf_final.feature_importances_
feature_importance_dict = {
    name: score for name, score in zip(feature_names, feature_importances)
}

# Overall ranking, highest importance first.
sorted_features = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
most_important_feature = sorted_features[0][0]

# The four features to compare against each other.
target_features = [
    'study_hours_per_week',
    'attendance_rate',
    'distance_to_school',
    'teacher_quality',
]

# Report any requested feature that is absent from the vectorised columns.
for feature in target_features:
    if feature not in feature_importance_dict:
        print(f"  {feature}: признак не найден")

# Pick the most important of the candidates that do exist; max() keeps the
# first one on ties. With no candidates, fall back to None / -1.
present_targets = [f for f in target_features if f in feature_importance_dict]
if present_targets:
    most_important_target = max(present_targets, key=feature_importance_dict.get)
    max_importance = feature_importance_dict[most_important_target]
else:
    most_important_target = None
    max_importance = -1

print(f"\nСамый важный признак из указанных четырех: {most_important_target}")


Самый важный признак из указанных четырех: study_hours_per_week


### Ответы
1. study_hours_per_week
2. 42.137
3. n_estimators=90
4. max_depth=10
5. study_hours_per_week