In [86]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
warnings.filterwarnings('ignore')

In [87]:
# Read the training table from 'diamonds_train.csv' (relative path) into `df`,
# the working frame that the following cells extend and filter.
df = pd.read_csv('diamonds_train.csv')

In [88]:
# Print the column names, non-null counts, dtypes and memory footprint of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43018 entries, 0 to 43017
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43018 non-null  float64
 1   cut      43018 non-null  object 
 2   color    43018 non-null  object 
 3   clarity  43018 non-null  object 
 4   depth    43018 non-null  float64
 5   table    43018 non-null  float64
 6   price    43018 non-null  int64  
 7   x        43018 non-null  float64
 8   y        43018 non-null  float64
 9   z        43018 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.3+ MB


In [89]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of flattening it to plain text.
df.head()

   carat        cut color clarity  depth  table  price     x     y     z
0   0.51       Good     D     SI2   63.9   55.0   1180  5.04  5.10  3.24
1   0.72      Ideal     E     VS2   60.8   57.0   3091  5.79  5.82  3.53
2   0.70  Very Good     D    VVS2   62.8   60.0   4022  5.65  5.69  3.56
3   0.36      Ideal     D     SI1   61.2   57.0    663  4.59  4.63  2.82
4   0.54  Very Good     D     SI1   60.0   59.8   1593  5.30  5.34  3.18


# Предварительная обработка данных

In [90]:
# Per-column count of null cells.
missing = df.isnull().sum()
print("Пропущенные значения:")
if missing.sum() > 0:
    # Only list the columns that actually contain nulls.
    print(missing[missing > 0])
else:
    print("Пропущенных значений нет")

# List the distinct labels of every categorical column that is present.
categorical_cols = ['cut', 'color', 'clarity']
for col in categorical_cols:
    if col not in df.columns:
        continue
    print(f"{col}: {sorted(df[col].unique())}")


Пропущенные значения:
Пропущенных значений нет
cut: ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good']
color: ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity: ['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']


## Кодирование признаков

In [91]:
# Quality ordering for each categorical column; a label's position in its list
# becomes its integer code (appears to run from lowest to highest grade).
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

# Label -> rank lookups (0 is the first entry of the corresponding list).
cut_mapping = dict(zip(cut_order, range(len(cut_order))))
color_mapping = dict(zip(color_order, range(len(color_order))))
clarity_mapping = dict(zip(clarity_order, range(len(clarity_order))))

# Add the encoded columns next to the original text columns.
# A label missing from its lookup comes out of .map() as NaN.
df['cut_encoded'] = df['cut'].map(cut_mapping)
df['color_encoded'] = df['color'].map(color_mapping)
df['clarity_encoded'] = df['clarity'].map(clarity_mapping)


## Новые признаки

In [92]:
# Engineered columns derived from carat, table, depth and the x / y / z dimensions.
# Product of the three dimensions.
df['volume'] = df['x'].mul(df['y']).mul(df['z'])
# The 1e-8 term keeps the denominator non-zero when volume is 0.
df['density'] = df['carat'].div(df['volume'].add(1e-8))
df['table_ratio'] = df['table'].div(df['depth'])
df['size_ratio'] = df['x'].div(df['y'])
# Arithmetic mean of the three dimensions.
df['avg_dimension'] = df['x'].add(df['y']).add(df['z']).div(3)
# Twice the sum of the three pairwise products (surface of an x-by-y-by-z box).
df['surface_area'] = (
    df['x'].mul(df['y'])
    .add(df['x'].mul(df['z']))
    .add(df['y'].mul(df['z']))
).mul(2)


## Обработка выбросов

In [93]:
# Row count before filtering, used for the removal report below.
initial_size = len(df)

# Keep only rows where all three dimensions are strictly positive.
positive_dims = df[['x', 'y', 'z']].gt(0).all(axis=1)
df = df[positive_dims]

# Trim the extremes of the volume distribution. Despite their names, Q1 and Q3
# hold the 1st and 99th percentiles (not quartiles); both bounds are inclusive.
Q1 = df['volume'].quantile(0.01)
Q3 = df['volume'].quantile(0.99)
df = df[df['volume'].between(Q1, Q3)]

final_size = len(df)
print(f"Удалено записей: {initial_size - final_size}")
print(f"Размер данных после обработки выбросов: {df.shape}")

Удалено записей: 862
Размер данных после обработки выбросов: (42156, 19)


## Подготовка признаков

In [94]:
# Candidate model inputs, grouped by origin.
raw_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
encoded_features = ['cut_encoded', 'color_encoded', 'clarity_encoded']
engineered_features = [
    'volume', 'density', 'table_ratio', 'size_ratio', 'avg_dimension', 'surface_area'
]

# Keep only the candidates that exist as columns of df, preserving their order.
# This checks column presence only; it does not inspect values for NaN.
feature_columns = [
    name
    for name in raw_features + encoded_features + engineered_features
    if name in df.columns
]

print(f"Признаки для обучения ({len(feature_columns)}): {feature_columns}")

# Feature matrix and regression target.
X = df.loc[:, feature_columns]
y = df['price']

print(f"Размеры: X {X.shape}, y {y.shape}")
price_mean = y.mean()
price_median = y.median()
print(f"Статистика цены - среднее: ${price_mean:,.2f}, медиана: ${price_median:,.2f}")

Признаки для обучения (15): ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_encoded', 'color_encoded', 'clarity_encoded', 'volume', 'density', 'table_ratio', 'size_ratio', 'avg_dimension', 'surface_area']
Размеры: X (42156, 15), y (42156,)
Статистика цены - среднее: $3,847.17, медиана: $2,401.00


## Разделение на train/test


In [95]:
# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Report the size of each part and its share of the full table.
n_total = len(X)
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train: {n_train:,} образцов ({n_train/n_total*100:.1f}%)")
print(f"Test: {n_test:,} образцов ({n_test/n_total*100:.1f}%)")

Train: 33,724 образцов (80.0%)
Test: 8,432 образцов (20.0%)


## Масштабирование

In [96]:
# Fit the scaler on the training part only, then apply that same fitted
# transform to both parts so the test rows do not influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Тест моделей

In [97]:
# Candidate regressors, keyed by the display name used in the reports below.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# name -> fitted estimator plus its train / test / cross-validation metrics.
results = {}

for name, model in models.items():
    print(f"\n--- Обучение {name} ---")
    model.fit(X_train_scaled, y_train)

    # Predictions on both parts of the split.
    y_train_pred, y_test_pred = (
        model.predict(part) for part in (X_train_scaled, X_test_scaled)
    )

    # R² and RMSE for each part.
    train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # 5-fold cross-validated R² computed on the training part only.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    results[name] = dict(
        model=model,
        train_r2=train_r2,
        test_r2=test_r2,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
        cv_mean=cv_mean,
        cv_std=cv_std,
    )

    print(f"R² - Train: {train_r2:.4f}, Test: {test_r2:.4f}")
    print(f"RMSE - Train: ${train_rmse:,.2f}, Test: ${test_rmse:,.2f}")
    # The reported spread is two standard deviations of the fold scores.
    print(f"Кросс-валидация R²: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")
    # Train-minus-test R² gap, printed as the overfitting indicator.
    print(f"Переобучение: {train_r2 - test_r2:.4f}")


--- Обучение Linear Regression ---
R² - Train: 0.9192, Test: 0.9185
RMSE - Train: $1,087.13, Test: $1,096.07
Кросс-валидация R²: 0.9187 (+/- 0.0052)
Переобучение: 0.0008

--- Обучение Random Forest ---
R² - Train: 0.9974, Test: 0.9825
RMSE - Train: $194.65, Test: $508.25
Кросс-валидация R²: 0.9809 (+/- 0.0015)
Переобучение: 0.0149

--- Обучение Gradient Boosting ---
R² - Train: 0.9786, Test: 0.9782
RMSE - Train: $559.24, Test: $566.63
Кросс-валидация R²: 0.9764 (+/- 0.0021)
Переобучение: 0.0004


## Сравнение моделей

In [98]:
# One row per candidate model with its hold-out and cross-validation metrics.
# Rows are ranked by the cross-validated R² (computed on the training part only)
# rather than by Test_R2: choosing the winner by its hold-out score would make
# the hold-out metrics reported afterwards optimistically biased.
comparison_df = pd.DataFrame(
    [
        {
            'Model': name,
            'Test_R2': metrics['test_r2'],
            'Test_RMSE': metrics['test_rmse'],
            'CV_Mean': metrics['cv_mean'],
            # Train-minus-test R² gap.
            'Overfitting': metrics['train_r2'] - metrics['test_r2'],
        }
        for name, metrics in results.items()
    ]
).sort_values('CV_Mean', ascending=False)

print(comparison_df)

# The top row after sorting is the selected model; fetch its fitted estimator.
best_model_name = comparison_df.iloc[0]['Model']
best_model = results[best_model_name]['model']
print(f"\nЛучшая: {best_model_name}")


               Model   Test_R2    Test_RMSE   CV_Mean  Overfitting
1      Random Forest  0.982470   508.251386  0.980860     0.014941
2  Gradient Boosting  0.978211   566.630178  0.976370     0.000416
0  Linear Regression  0.918473  1096.068404  0.918711     0.000764

Лучшая: Random Forest


## Настройка гиперпараметров


In [99]:
# Search spaces per model name; a model without an entry is not tuned.
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 4, 5],
    },
}
param_grid = param_grids.get(best_model_name, {})

if not param_grid:
    print("настройка не требуется")
else:
    # Exhaustive 3-fold grid search on the training part, scored by R².
    grid_search = GridSearchCV(
        estimator=best_model,
        param_grid=param_grid,
        cv=3,
        scoring='r2',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train_scaled, y_train)
    # Replace the selected model with the best estimator found by the search.
    best_model = grid_search.best_estimator_
    print(f"Лучшие параметры: {grid_search.best_params_}")
    print(f"Лучший score: {grid_search.best_score_:.4f}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Лучшие параметры: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Лучший score: 0.9812


## Финальная оценка


In [100]:
# Score the selected (possibly tuned) model on both parts of the split.
y_train_pred, y_test_pred = (
    best_model.predict(part) for part in (X_train_scaled, X_test_scaled)
)

final_train_r2, final_test_r2 = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)
final_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
final_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("ФИНАЛЬНЫЕ МЕТРИКИ:")
print(f"R² - Train: {final_train_r2:.4f}, Test: {final_test_r2:.4f}")
print(f"RMSE - Train: ${final_train_rmse:,.2f}, Test: ${final_test_rmse:,.2f}")
# Train-minus-test R² gap, printed as the overfitting indicator.
print(f"Переобучение: {final_train_r2 - final_test_r2:.4f}")

ФИНАЛЬНЫЕ МЕТРИКИ:
R² - Train: 0.9956, Test: 0.9825
RMSE - Train: $253.07, Test: $508.19
Переобучение: 0.0131


## Модель на всех данных

In [101]:
# Масштабируем все данные
X_scaled = scaler.fit_transform(X)

# Обучаем финальную модель на всех данных
final_model = best_model
final_model.fit(X_scaled, y)

# Оценка на всех данных
predictions = final_model.predict(X_scaled)
final_r2 = r2_score(y, predictions)
final_rmse = np.sqrt(mean_squared_error(y, predictions))

print(f"R² на всех данных: {final_r2:.4f}")
print(f"RMSE на всех данных: ${final_rmse:,.2f}")

R² на всех данных: 0.9957
RMSE на всех данных: $252.17


### Сохранение модели

In [102]:
# Persist the fitted model and scaler, the category encoders and the feature
# order, each to its own file.
joblib.dump(final_model, 'final_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# The three label -> rank lookups are bundled into a single file.
encoders = dict(
    cut_mapping=cut_mapping,
    color_mapping=color_mapping,
    clarity_mapping=clarity_mapping,
)
joblib.dump(encoders, 'encoders.pkl')

joblib.dump(feature_columns, 'feature_columns.pkl')

['feature_columns.pkl']

## Загрузка, преобразование, тест данных

In [103]:
# Read the submission table from 'diamonds_test.csv' (relative path).
df_test = pd.read_csv('diamonds_test.csv')
print(f"Размер тестовых данных: {df_test.shape}")

# Independent copy of the id column, kept for building the submission.
test_ids = df_test.loc[:, 'id'].copy()

Размер тестовых данных: (5379, 10)


In [104]:
# Применяем те же преобразования, что и к обучающим данным
df_test['cut_encoded'] = df_test['cut'].map(cut_mapping)
df_test['color_encoded'] = df_test['color'].map(color_mapping)
df_test['clarity_encoded'] = df_test['clarity'].map(clarity_mapping)

# Создаем те же признаки
df_test['volume'] = df_test['x'] * df_test['y'] * df_test['z']
df_test['density'] = df_test['carat'] / (df_test['volume'] + 1e-8)
df_test['table_ratio'] = df_test['table'] / df_test['depth']
df_test['size_ratio'] = df_test['x'] / df_test['y']
df_test['avg_dimension'] = (df_test['x'] + df_test['y'] + df_test['z']) / 3
df_test['surface_area'] = 2 * (df_test['x']*df_test['y'] + df_test['x']*df_test['z'] + df_test['y']*df_test['z'])

In [105]:
# Подготавливаем признаки для тестовых данных
X_test_final = df_test[feature_columns]

# Масштабируем
X_test_scaled = scaler.transform(X_test_final)

# Предсказываем
predictions = final_model.predict(X_test_scaled)

print(f"Статистика предсказаний:")
print(f"Min: ${predictions.min():,.2f}")
print(f"Max: ${predictions.max():,.2f}")
print(f"Mean: ${predictions.mean():,.2f}")
print(f"Median: ${np.median(predictions):,.2f}")

Статистика предсказаний:
Min: $354.91
Max: $18,128.92
Mean: $3,916.84
Median: $2,440.00


# Сохранение результатов

In [106]:
# Two-column submission table: the id from the test file and its predicted price.
df_result = pd.DataFrame({'id': df_test['id']}).assign(price=predictions)
# Write without the index so the file contains only the id and price columns.
df_result.to_csv('submission.csv', index=False)