In [1]:
import pandas as pd

# Load the dataset from clean.csv into the working DataFrame.
df = pd.read_csv('clean.csv')

In [2]:
# Show the frame's (rows, columns) dimensions.
df.shape

(1905, 40)

In [3]:
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from catboost import CatBoostClassifier

In [4]:
import plotly.graph_objects as go

In [5]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from tqdm import tqdm

In [6]:
# SalePrice min / quartiles / max, taken from a single describe() call.
# They serve as the class boundaries for the Quality label.
lvl0, lvl25, lvl50, lvl75, lvl100 = df.SalePrice.describe().loc[
    ["min", "25%", "50%", "75%", "max"]
]

In [7]:
# Introduce the classification target column (4 classes)

def Criteria(price):
    """Map a sale price to its quartile class (0 = cheapest ... 3 = most expensive).

    The boundaries are the module-level quartile levels ``lvl25``, ``lvl50``
    and ``lvl75``:

    * ``price <  lvl25``          -> 0
    * ``lvl25 <= price < lvl50``  -> 1
    * ``lvl50 <= price < lvl75``  -> 2
    * ``price >= lvl75``          -> 3

    Only upper bounds are tested, so a price below the observed minimum is
    assigned to class 0 rather than falling through to class 3.
    NaN compares false against every bound and therefore still yields 3.
    """
    if price < lvl25:
        return 0
    if price < lvl50:
        return 1
    if price < lvl75:
        return 2
    return 3

# Label every row with its price class.
df['Quality'] = df['SalePrice'].apply(Criteria)

In [8]:
# Targets: the class label (Quality) and the raw price (SalePrice) are kept together.
# NOTE: X_global is the full frame and still contains both target columns here;
# they are dropped from the feature matrices after the train/test split.
y_global = df[["Quality","SalePrice"]]
X_global = df

In [9]:
# Stratified 80/20 train/test split (stratified on the Quality class)
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
    X_global,
    y_global,
    test_size=0.2,
    random_state=0,
    stratify=y_global.Quality,
)

# Classification view: features without either target, labels without the price
X_train, X_test = (
    part.drop(columns=["SalePrice", "Quality"]) for part in (X_train_g, X_test_g)
)
y_train, y_test = (
    part.drop(columns=["SalePrice"]) for part in (y_train_g, y_test_g)
)


In [10]:
# Preview the first rows of the classification feature matrix.
X_train.head()

Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Lot Shape,Land Contour,Lot Config,Condition 1,Bldg Type,House Style,...,Bedroom AbvGr,Kitchen Qual,TotRms AbvGrd,Fireplaces,Garage Type,Garage Finish,Garage Area,Garage Qual,Wood Deck SF,Open Porch SF
856,50,RL,60.0,7200,Reg,Lvl,Inside,Norm,1Fam,1.5Fin,...,3,TA,6,0,Detchd,Unf,480.0,TA,0,0
770,60,RL,76.0,9291,IR1,Lvl,Corner,RRNe,1Fam,2Story,...,3,Gd,7,0,Attchd,RFn,506.0,TA,144,70
75,160,FV,24.0,2544,Reg,Lvl,Inside,Norm,Twnhs,2Story,...,2,Gd,4,0,Detchd,RFn,480.0,TA,0,172
1168,60,RL,89.0,10557,IR1,Lvl,Inside,Norm,1Fam,2Story,...,4,Gd,9,1,Attchd,Fin,806.0,TA,108,87
1616,60,RL,69.22459,12104,IR1,Lvl,FR3,Norm,1Fam,2Story,...,4,Gd,8,0,Attchd,RFn,617.0,TA,400,45


In [11]:
# Seed NumPy's global random generator.
np.random.seed(0)

In [12]:
# Flatten the classification target to a 1-D Series of class labels.
# It is read from y_train_g rather than from y_train itself, so re-running
# this cell cannot fail on an already-flattened y_train.
y_train = y_train_g['Quality']

In [13]:
# Persist the classification training features and labels.
for pickle_path, payload in (("x_train.pickle", X_train), ("y_train.pickle", y_train)):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

In [17]:
# Split the feature columns by dtype: numeric vs. categorical (object)
numeric_features = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_train.select_dtypes(include=["object"]).columns)

# Numeric branch: fill NaN with the column median, then scale with RobustScaler.
# (An optional PCA step was left disabled in this pipeline.)
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler()),
])

# Categorical branch: fill NaN with the literal 'Missing', then one-hot encode
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Metric the grid search uses to pick the best parameter combination
scoring = 'accuracy'

# Candidate classifiers: name -> (estimator, parameter grid for GridSearchCV).
# Grid keys carry the "classifier__" prefix of the pipeline step they target.
models_params = {
    "RandomForest": (
        RandomForestClassifier(random_state=0),
        {
            "classifier__n_estimators": [75, 80, 100, 200],
            "classifier__max_depth": [20, 21, 22, 23],
        },
    ),
    "CatBoost": (
        CatBoostClassifier(random_state=0),
        {
            "classifier__max_depth": [4],
            "classifier__verbose": [False],
        },
    ),
    "XGBoost": (
        XGBClassifier(random_state=0),
        {
            "classifier__num_class": [4],
            "classifier__eval_metric": ["mlogloss"],
            "classifier__enable_categorical": [True],
            "classifier__max_depth": [4],
            "classifier__learning_rate": [0.3],
            "classifier__n_estimators": [100],
        },
    ),
}

# Model comparison: grid-search every candidate, then record its
# cross-validated score and its accuracy on the held-out test split.
results = []

for name, (model, params) in tqdm(models_params.items()):
    pipe = Pipeline(steps=[("preprocessing", preprocessor), ("classifier", model)])

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=params,
        scoring=scoring,
        cv=3,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Predictions come from the best parameter combination found by the search
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    results.append(
        {
            "Model": name,
            "Best Params": grid.best_params_,
            f"Validation {scoring}": grid.best_score_,
            "Test Accuracy": acc,
            "Best Estimator": grid.best_estimator_,
        }
    )

classification_df = pd.DataFrame(results)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:26<00:00,  8.91s/it]


In [18]:
# Best parameters found for the model in row 1 of the comparison table.
classification_df.loc[1, "Best Params"]

{'classifier__max_depth': 4, 'classifier__verbose': False}

In [19]:
# Rank the models by their cross-validated score, best first.
# NOTE: relies on `scoring` still holding the classification metric; the name
# is rebound to 'r2' in the regression section further down.
classification_df.sort_values(by=[f"Validation {scoring}"], ascending=False)

Unnamed: 0,Model,Best Params,Validation accuracy,Test Accuracy,Best Estimator
1,CatBoost,"{'classifier__max_depth': 4, 'classifier__verb...",0.793963,0.792651,"(ColumnTransformer(transformers=[('num',\n ..."
0,RandomForest,"{'classifier__max_depth': 22, 'classifier__n_e...",0.791995,0.782152,"(ColumnTransformer(transformers=[('num',\n ..."
2,XGBoost,"{'classifier__enable_categorical': True, 'clas...",0.76378,0.776903,"(ColumnTransformer(transformers=[('num',\n ..."


In [20]:
# Rank the models by their accuracy on the held-out test split, best first.
classification_df.sort_values(by=["Test Accuracy"], ascending=False)

Unnamed: 0,Model,Best Params,Validation accuracy,Test Accuracy,Best Estimator
1,CatBoost,"{'classifier__max_depth': 4, 'classifier__verb...",0.793963,0.792651,"(ColumnTransformer(transformers=[('num',\n ..."
0,RandomForest,"{'classifier__max_depth': 22, 'classifier__n_e...",0.791995,0.782152,"(ColumnTransformer(transformers=[('num',\n ..."
2,XGBoost,"{'classifier__enable_categorical': True, 'clas...",0.76378,0.776903,"(ColumnTransformer(transformers=[('num',\n ..."


In [21]:
# Persist the comparison table; its "Best Estimator" column holds the
# best pipeline returned by each grid search.
with open('classification.pickle', 'wb') as f:
    pickle.dump(classification_df, f, pickle.HIGHEST_PROTOCOL)

# Регрессия

In [48]:
# Candidate regressors: name -> (estimator, parameter grid for GridSearchCV).
# Grid keys carry the "regressor__" prefix of the pipeline step they target.
models_params = {
    "KNN": (
        KNeighborsRegressor(),
        {
            "regressor__n_neighbors": [6, 7, 8],
            "regressor__weights": ["uniform", "distance"],
            "regressor__p": [1, 2],  # 1 = Manhattan, 2 = Euclidean
        },
    ),
    "ElasticNet": (
        ElasticNet(max_iter=1000),
        {
            "regressor__alpha": [0.07, 0.1, 0.12, 0.15],
            "regressor__l1_ratio": [0.75, 0.8, 0.85],  # 0 = Ridge, 1 = Lasso
        },
    ),
    "RandomForest": (
        RandomForestRegressor(random_state=0),
        {
            "regressor__n_estimators": [130, 140, 150],
            "regressor__max_depth": [21, 22, 23],
        },
    ),
}

In [49]:
def find_the_best_regression(X, y, scoring, baseline=False):
    """Grid-search every regressor listed in the global ``models_params`` on (X, y).

    A preprocessing ``ColumnTransformer`` is derived from the dtypes of ``X``:
    numeric columns get median imputation followed by ``RobustScaler``;
    object columns get constant ``'Missing'`` imputation followed by one-hot
    encoding. Each candidate is wrapped in a pipeline with that preprocessor
    and tuned with a 3-fold ``GridSearchCV``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : array-like
        Training target.
    scoring : str
        Scoring name handed to ``GridSearchCV``; also used in the name of the
        validation-score column of the result.
    baseline : bool, default False
        When True, each tuned model is additionally scored on the global
        ``X_test`` / ``y_test`` (R², MAE, RMSE). When False those three
        columns are filled with ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns "Model", "Best Params",
        "Validation <scoring>", "Test R²", "Test MAE", "Test RMSE" and
        "Best Estimator".

    Notes
    -----
    Reads the notebook globals ``models_params`` (always) and ``X_test`` /
    ``y_test`` (only when ``baseline`` is True); they must be defined before
    the call.
    """
    # Column groups, split by dtype
    num_cols = list(X.select_dtypes(include=[np.number]).columns)
    cat_cols = list(X.select_dtypes(include=["object"]).columns)

    # Numeric branch: median imputation, then robust scaling
    num_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ])

    # Categorical branch: NaN -> 'Missing', then one-hot encoding
    cat_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_branch, num_cols),
            ("cat", cat_branch, cat_cols),
        ]
    )

    rows = []

    for name, (model, params) in tqdm(models_params.items()):
        search = GridSearchCV(
            estimator=Pipeline(
                steps=[("preprocessing", preprocessor), ("regressor", model)]
            ),
            param_grid=params,
            scoring=scoring,
            cv=3,
            n_jobs=-1,
        )
        search.fit(X, y)

        # Test metrics are only computed for the baseline run
        test_r2 = mae = rmse = None
        if baseline:
            y_pred = search.predict(X_test)
            test_r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            rmse = root_mean_squared_error(y_test, y_pred)

        rows.append(
            {
                "Model": name,
                "Best Params": search.best_params_,
                f"Validation {scoring}": search.best_score_,
                "Test R²": test_r2,
                "Test MAE": mae,
                "Test RMSE": rmse,
                "Best Estimator": search.best_estimator_,
            }
        )

    return pd.DataFrame(rows)

In [50]:
# Train/test frames for regression.
# The Quality column must stay in the training frames because it is used to
# filter the rows that each per-class model is trained on.
X_train = X_train_g.drop(columns=["SalePrice"])
X_test = X_test_g.drop(columns=["SalePrice", "Quality"])
y_train = y_train_g 
y_test = y_test_g.drop(columns=["Quality"])

In [None]:
# Persist the regression test features and targets.
for pickle_path, payload in (("x_test.pickle", X_test), ("y_test.pickle", y_test)):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

In [51]:
# Metric for the regression grid searches, and the list that collects
# one results table per price class.
scoring = 'r2'
models = []

In [52]:
# Fit one set of grid-searched regressors per price class (Quality 0..3).
# The list is rebuilt here so re-running the cell cannot append duplicates.
# Rows are taken straight from the split frames (X_train_g / y_train_g)
# because X_train / y_train are rebound by other cells of the notebook.
models = []
for k in range(4):
    Xk = X_train_g.loc[X_train_g['Quality'] == k].drop(columns=['SalePrice', 'Quality'])
    yk = y_train_g.loc[y_train_g['Quality'] == k, 'SalePrice']
    model = find_the_best_regression(Xk, yk, scoring)
    models.append(model)
    # TODO: SHAP values for each regression type, to determine feature importance per class

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:12<00:00,  4.06s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.34s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.20s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.18s/it]


In [53]:
# Persist the list of per-class regression result tables.
with open('regression.pickle', 'wb') as f:
    pickle.dump(models, f, pickle.HIGHEST_PROTOCOL)

## Baseline

In [54]:
# Baseline: a single regression over all training rows, ignoring the price class.
# Inputs are derived from the split frames (X_train_g / y_train_g) so the cell
# can be re-run without failing on an already-flattened y_train.
y_train = y_train_g['SalePrice']
results_df = find_the_best_regression(
    X_train_g.drop(columns=["SalePrice", "Quality"]), y_train, 'r2', baseline=True
)

results_df

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:08<00:00, 22.87s/it]


Unnamed: 0,Model,Best Params,Validation r2,Test R²,Test MAE,Test RMSE,Best Estimator
0,KNN,"{'regressor__n_neighbors': 7, 'regressor__p': ...",0.858822,0.861995,19.347841,28.108901,"(ColumnTransformer(transformers=[('num',\n ..."
1,ElasticNet,"{'regressor__alpha': 0.07, 'regressor__l1_rati...",0.91107,0.91427,16.065511,22.154541,"(ColumnTransformer(transformers=[('num',\n ..."
2,RandomForest,"{'regressor__max_depth': 23, 'regressor__n_est...",0.907528,0.905332,16.31732,23.280833,"(ColumnTransformer(transformers=[('num',\n ..."


In [55]:
# Persist the baseline regression table.
# This dumps results_df, not `results`: at notebook level `results` is the
# list built by the classification comparison loop.
with open('baseline.pickle', 'wb') as f:
    pickle.dump(results_df, f, pickle.HIGHEST_PROTOCOL)