In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import warnings

warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.2f}'.format  # Формат вывода
pd.options.display.max_columns = 500  # Кол-во отображаемых столбцов
%matplotlib inline

train = pd.read_csv('train.csv').rename(columns=lambda x: x.replace(' ', '_'))

In [5]:
target = 'Premium_Amount'
train = train.set_index('id')
train = train.drop('Policy_Start_Date', axis=1)

num_cols = train.drop(target, axis=1).select_dtypes(include=np.number).columns.tolist()
cat_cols = train.select_dtypes(exclude=np.number).columns.tolist()

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


X = train.drop(target, axis=1)
y = np.log1p(train[target])


# Пайплайн для числовых данных
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Пайплайн для категориальных данных
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Трансформер для обработки данных
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)])


X = preprocessor.fit_transform(X)

In [10]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate, train_test_split

# Функция для вычисления RMSLE
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Создание кастомного scorer для GridSearchCV
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [9]:
from sklearn.linear_model import LinearRegression

model_lr = LinearRegression()

cv_lr = cross_validate(model_lr, X, y, scoring=rmsle_scorer, cv=5, return_train_score=True)

# Преобразование MSE в RMSLE
rmsle_test = np.sqrt(-cv_lr['test_score'])
rmsle_train = np.sqrt(-cv_lr['train_score'])

print(f'Средний RMSLE на TrainFold: {np.mean(rmsle_train):.4f}')
print(f'Средний RMSLE на TestFold: {np.mean(rmsle_test):.4f}')

Средний RMSLE на TrainFold: 1.0430
Средний RMSLE на TestFold: 1.0430


In [11]:
from sklearn.tree import DecisionTreeRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

model_dt = DecisionTreeRegressor(random_state=33)
model_dt.fit(X_train, y_train)

# Предсказания на обучающей и тестовой выборках
y_train_pred = model_dt.predict(X_train)
y_test_pred = model_dt.predict(X_test)


rmsle_train = rmsle(y_train, y_train_pred)
rmsle_test = rmsle(y_test, y_test_pred)

# Вывод результатов
print(f'RMSLE train: {rmsle_train:.4f}')
print(f'RMSLE test: {rmsle_test:.4f}')

RMSLE train: 0.0000
RMSLE test: 1.5465
