In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso


In [2]:
train_data = pd.read_csv('/content/drive/MyDrive/Kaggle/Linear_regression/data/train.csv')
kaggle_test_data = pd.read_csv("/content/drive/MyDrive/Kaggle/Linear_regression/data/test.csv")

In [3]:
# Очистка данных
def clearing(data):
  drop_colomns = data.columns[data.isna().mean() >= 0.4]
  data_clean = data.drop(drop_colomns, axis=1).reset_index(drop=True)
  return data_clean


train_data = clearing(data=train_data)
kaggle_test_data_X = clearing(data=kaggle_test_data)

In [4]:
# Декодирование категориальных признаков
def encoding(data,  ohe, datatype=None):
  categorial_cols = data.columns[data.dtypes == "object"]
  if datatype == "train":
    encoded_array = ohe.fit_transform(data[categorial_cols])
  elif datatype == "test":
    encoded_array = ohe.transform(data[categorial_cols])
  encoded_columns = ohe.get_feature_names_out(categorial_cols)
  encoded_data = pd.DataFrame(encoded_array, columns=encoded_columns)
  data_encoded = pd.concat([data.drop(categorial_cols, axis=1), encoded_data], axis=1)
  data_encoded = data_encoded.fillna(data_encoded.mean())
  return data_encoded

ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown='ignore')

train_data = encoding(data=train_data, ohe=ohe, datatype="train")
kaggle_test_data_X = encoding(data=kaggle_test_data_X, ohe=ohe, datatype="test")



In [5]:
train_data_X = train_data.drop("SalePrice", axis=1)
train_data_y = train_data["SalePrice"]

In [6]:
# Масштабирование
categorical_cols = ohe.get_feature_names_out()
numeric_cols_to_scale = [col for col in train_data_X.columns if (col not in categorical_cols) and (col != "Id")]
scaler = ColumnTransformer([
    ("scale_numeric", StandardScaler(), numeric_cols_to_scale)
], remainder="passthrough") # Создание экземпляра класса

# Масштабирование трейна
scaled_train_data_X = scaler.fit_transform(train_data_X)
scaled_train_data_X = pd.DataFrame(scaled_train_data_X, columns=scaler.get_feature_names_out(), index=train_data_X.index)
train_data = pd.concat([scaled_train_data_X, train_data_y], axis=1)

# Масштабирование кагл-теста
scaled_kaggle_test_data_X = scaler.transform(kaggle_test_data_X)
kaggle_test_data_X = pd.DataFrame(scaled_kaggle_test_data_X, columns=scaler.get_feature_names_out(), index=kaggle_test_data_X.index)
kaggle_test_data_X.head(7)




Unnamed: 0,scale_numeric__MSSubClass,scale_numeric__LotFrontage,scale_numeric__LotArea,scale_numeric__OverallQual,scale_numeric__OverallCond,scale_numeric__YearBuilt,scale_numeric__YearRemodAdd,scale_numeric__MasVnrArea,scale_numeric__BsmtFinSF1,scale_numeric__BsmtFinSF2,...,remainder__SaleType_ConLI,remainder__SaleType_ConLw,remainder__SaleType_New,remainder__SaleType_Oth,remainder__SaleType_WD,remainder__SaleCondition_AdjLand,remainder__SaleCondition_Alloca,remainder__SaleCondition_Family,remainder__SaleCondition_Normal,remainder__SaleCondition_Partial
0,-0.872563,0.451936,0.110763,-0.795151,0.381743,-0.340077,-1.15638,-0.57441,0.053428,0.604293,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.497357,0.37585,-0.071836,0.381743,-0.43944,-1.30174,0.023903,1.051363,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.179413,0.332053,-0.795151,-0.5172,0.852269,0.6364,-0.57441,0.761852,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,0.361095,-0.054002,-0.071836,0.381743,0.88539,0.6364,-0.463612,0.347326,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.492282,-1.228623,-0.552407,1.374795,-0.5172,0.686666,0.345679,-0.57441,-0.39619,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,0.073375,0.224833,-0.051798,-0.071836,-0.5172,0.719786,0.442586,-0.57441,-0.973018,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,-0.872563,-0.06675,-0.254246,-0.071836,1.280685,0.686666,1.072482,-0.57441,1.077682,-0.288653,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [7]:
train_data_X = train_data.drop("SalePrice", axis=1)
train_data_y = train_data["SalePrice"]

In [8]:
# Избавление от мултиколлениарности
# Локальный скор стал лучше, но на чуть чуть, на kaggle стал хуже

def remove_high_corr(data, threshold = 0.9):
  corr_matrix = data.corr()
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
  to_drop = [column for column in upper.columns if any(abs(upper[column]) > threshold)]
  reduced_data = data.drop(columns=to_drop)
  return reduced_data

train_data_X = remove_high_corr(train_data_X, threshold=0.9)
features_to_keep = train_data_X.columns
kaggle_test_data_X = kaggle_test_data_X[features_to_keep]




In [9]:
# Разбиение выборки
X_train, X_test, y_train, y_test = train_test_split(train_data_X, train_data_y, test_size=0.3, random_state=42)


In [10]:
# Обучение регрессии без регуляризации
# Обучение модели на тренировочной выборке
baseline = LinearRegression()
baseline.fit(X_train, y_train)
y_pred = baseline.predict(X_test)

# Метрики оценки качества решения
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(rmse)

43858.098081229364


In [11]:
# Линейная регрессия + L2 рег
param_grid = {"alpha": np.logspace(-3, 2, 20)}
ridge = Ridge()
grid = GridSearchCV(ridge, param_grid, cv=10, scoring='neg_root_mean_squared_error')
grid.fit(X_train, y_train)
print("Лучшее alpha (GridSearchCV):", grid.best_params_["alpha"])
ridge_model = grid.best_estimator_

y_pred = ridge_model.predict(X_test)

# Метрики оценки качества решения
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
# r2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(rmse)

Лучшее alpha (GridSearchCV): 16.23776739188721
29638.543201723474


In [12]:
# Линейная регрессия + L1 рег
param_grid = {"alpha": np.logspace(1, 4, 20)} # Разный масштаб alpha в сравнении с Ridge
lasso = Lasso(max_iter=10000)
grid = GridSearchCV(lasso, param_grid, cv=10, scoring='neg_root_mean_squared_error')
grid.fit(X_train, y_train)
print("Лучшее alpha (GridSearchCV):", grid.best_params_["alpha"])
lasso_model = grid.best_estimator_

y_pred = lasso_model.predict(X_test)
# Метрики оценки качества решения
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
# r2_score = r2_score(y_true=y_test, y_pred=y_pred)
print(rmse)

Лучшее alpha (GridSearchCV): 263.6650898730358
30120.511780984096


In [13]:
# Предикты для kaggle
kaggle_test_data_y = lasso_model.predict(kaggle_test_data_X)
submission = pd.DataFrame({
    "Id": kaggle_test_data["Id"],
    "SalePrice": kaggle_test_data_y
})
submission.to_csv("submission.csv", index=False)
