In [365]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os
import math

In [366]:
pd.set_option("display.max_rows", 1000)

# Preprocessing

In [367]:
df = pd.read_csv('train.csv')
val = pd.read_csv('test.csv')

In [368]:
TARGET_COL = ['SalePrice']

ID_COL = 'Id'

REAL_COLS = ['MSSubClass', 'OverallQual', 'OverallCond', 'LotFrontage', 'LotArea',
             'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
             '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
             'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
             'TotRmsAbvGrd','Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

CAT_COLS = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
            'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
            'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
            'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
            'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
            
DATA_COLS = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']


##for function delete_nan_value
BASEMENT_COLS = ['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
MASONRY_COLS = ['MasVnrArea']
GARAGE_COLS =['GarageFinish', 'GarageQual', 'GarageCond']
GARAGE_COL = ['GarageYrBlt']

MAX_NUMBER_NAN_VALUE_COLS = ['PoolQC', 'Fence', 'MiscFeature', 'Alley', 'BsmtQual', 'GarageType' , 'FireplaceQu']
NUMBER_OF_TEST = 15



In [369]:
def preprocess_target(df: pd.DataFrame) -> pd.DataFrame:
    df[TARGET_COL] = df[TARGET_COL].astype(np.int32)
    return df

def extract_target(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    df, target = df.drop(TARGET_COL, axis = 1), df[TARGET_COL]
    return df, target

def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    df[CAT_COLS] = df[CAT_COLS].astype('category')
    df[REAL_COLS] = df[REAL_COLS].astype(np.int32)
    df[DATA_COLS] = df[DATA_COLS].astype(np.int32)
    return df

def set_idx(df: pd.DataFrame, idx_col: str) -> pd.DataFrame:
    df = df.set_index(idx_col)
    return df


In [370]:
def delete_nan_value(df: pd.DataFrame) ->pd.DataFrame:
    ##basement 
    for col in BASEMENT_COLS:
        temp_array = list(df[col])
        count = 0
        for index, row in df.iterrows():
            if row['BsmtQual'] == 'NA':
                if pd.isna(row[col]):
                    temp_array[count] = 'NA'
            count += 1
        df = df.drop(columns = col)
        count = 0
        for index, row in df.iterrows():
            df.loc[index,col] = temp_array[count]
            count += 1 
        df = df[(pd.isna(df[col])) == False] 
    
    ##Masonry veneer
    for col in MASONRY_COLS:
        temp_array = list(df[col])
        count = 0
        for index, row in df.iterrows():
            if row['MasVnrType'] == 'None':
                if pd.isna(row[col]):
                    temp_array[count] = 0
            count += 1
        df = df.drop(columns = col)
        count = 0
        for index, row in df.iterrows():
            df.loc[index,col] = temp_array[count]
            count += 1 
        df = df[(pd.isna(df[col])) == False] 
    
    ##Electrical 
    df = df[(pd.isna(df['Electrical'])) == False] 

    ##Garage
    for col in GARAGE_COLS:
        temp_array = list(df[col])
        count = 0
        for index, row in df.iterrows():
            if row['GarageType'] == 'NA':
                if pd.isna(row[col]):
                    temp_array[count] = 'NA'
            count += 1
        df = df.drop(columns = col)
        count = 0
        for index, row in df.iterrows():
            df.loc[index,col] = temp_array[count]
            count += 1 
        df = df[(pd.isna(df[col])) == False] 
    for col in GARAGE_COL:
        temp_array = list(df[col])
        count = 0
        for index, row in df.iterrows():
            if row['GarageType'] == 'NA':
                if pd.isna(row[col]):
                    temp_array[count] = 0
            count += 1
        df = df.drop(columns = col)
        count = 0
        for index, row in df.iterrows():
            df.loc[index,col] = temp_array[count]
            count += 1 
        df = df[(pd.isna(df[col])) == False]            
    return df

In [371]:
def full_columns_with_max_number_of_nan(df: pd.DataFrame) -> pd.DataFrame:
    df[MAX_NUMBER_NAN_VALUE_COLS] = df[MAX_NUMBER_NAN_VALUE_COLS].replace(np.nan, 'NA')
    df['LotFrontage'] = df['LotFrontage'].replace(np.nan, 0) 
    df['MasVnrType'] = df['MasVnrType'].replace(np.nan, 'None')

    return df


In [372]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    df = set_idx(df, ID_COL)
    df = full_columns_with_max_number_of_nan(df)
    df = delete_nan_value(df)
    df = cast_types(df)
    return df

In [373]:
def preprocess_val(df: pd.DataFrame) -> pd.DataFrame:
    df = set_idx(df, ID_COL)
    df = full_columns_with_max_number_of_nan(df)
    df = delete_nan_value(df)
    df = df.apply(lambda x: x.fillna(x.value_counts().index[0]))
    df = cast_types(df)
    return df

In [374]:
import pickle
from typing import Union
from pandas import DataFrame
from pandas.core.indexes.base import Index as PandasIndex


def save_as_pickle(obj: Union[DataFrame, PandasIndex], path: str) -> None:
    if isinstance(obj, DataFrame):
        obj.to_pickle(path)
    elif isinstance(obj, PandasIndex):
        with open('path', 'wb') as f:
            pickle.dump(obj, f)

In [375]:
df = preprocess_df(df)
val = preprocess_val(val)

##только для train()
df, target = extract_target(df)
target = preprocess_target(target)
#save_as_pickle(target, '../data/processed/target_for_data_for_train.pkl')
#save_as_pickle(df, '../data/interim/data_for_train.pkl')

##только для val
#save_as_pickle(val, '../data/interim/val.pkl')

# Feature generation

In [376]:
# interim_data_path = '../data/interim/'
# train = pd.read_pickle(os.path.join(interim_data_path, 'data_for_train.pkl'))
# test = pd.read_pickle(os.path.join(interim_data_path, 'val.pkl'))

In [377]:
train = df
test = val 

In [378]:
##for feature generation  class of building

def LotFrontage_points(lotfrontage_) -> int:
    if lotfrontage_ >= 100:
        return 5
    elif 40 <= lotfrontage_ < 100:
        return 4
    elif 30 <= lotfrontage_ < 40:
        return 3
    elif 0 < lotfrontage_ < 30:
        return 2 
    else:
        return 0

def LotArea_points(lotarea_) -> int:
    if lotarea_ >= 16000:
        return 5
    elif 9000 <= lotarea_ < 16000:
        return 4
    elif 5000 <= lotarea_ < 9000:
        return 3
    elif 0 < lotarea_ < 5000:
        return 2
    else:
        return 0

def Utilities_points(utilities_) -> int:
    if utilities_ == 'AllPub':
        return 5
    elif utilities_ == 'NoSewr':
        return 4
    elif utilities_ == 'NoSeWa':
        return 3
    elif utilities_ == 'ELO':
        return 2 
    else:
        return 0

def OverallQual_points(overallqual_) -> int:
    excellent = [10, 9, 8]
    good = [7, 6]
    average = [5, 4]
    poor = [3, 2, 1]
    if overallqual_ in excellent:
        return 5
    elif overallqual_ in good:
        return 4
    elif overallqual_ in average:
        return 3
    elif overallqual_ in poor:
        return 2
    else:
        return 0

def OverallCond_points(overallcond_) -> int:
    excellent = [10, 9, 8]
    good = [7, 6]
    average = [5, 4]
    poor = [3, 2, 1]
    if overallcond_ in excellent:
        return 5
    elif overallcond_ in good:
        return 4
    elif overallcond_ in average:
        return 3
    elif overallcond_ in poor:
        return 2
    else:
        return 0

def YearBuilt_points(yearbuilt_) -> int:
    if yearbuilt_ >= 2000:
        return 5
    elif 1971 <= yearbuilt_ < 2000:
        return 4
    elif 1920 <= yearbuilt_ < 1971:
        return 3
    elif yearbuilt_ < 1920:
        return 2
    else:
        return 0

def BsmtFinType1_points(bsmtfintype1_) -> int:
    if bsmtfintype1_ == 'GLQ' or bsmtfintype1_ == 'ALQ':
        return 5
    elif bsmtfintype1_ == 'BLQ' or bsmtfintype1_ == 'Rec':
        return 4
    elif bsmtfintype1_ == 'LwQ':
        return 3
    elif bsmtfintype1_ == 'Unf':
        return 2 
    else:
        return 0

def BsmtFinType2_points(bsmtfintype2_) -> int:
    if bsmtfintype2_ == 'GLQ' or bsmtfintype2_ == 'ALQ':
        return 5
    elif bsmtfintype2_ == 'BLQ' or bsmtfintype2_ == 'Rec':
        return 4
    elif bsmtfintype2_ == 'LwQ':
        return 3
    elif bsmtfintype2_ == 'Unf':
        return 2 
    else:
        return 0

def TotalBsmtSF_points(totalbsmtsf_) -> int:
    if totalbsmtsf_ >= 3000:
        return 5
    elif 1000 <= totalbsmtsf_ < 3000:
        return 4
    elif 500 <= totalbsmtsf_ < 1000:
        return 3
    elif 0 < totalbsmtsf_ < 500:
        return 2
    else:
        return 0

def TotRmsAbvGrd_points(totrmsabvgrd_) -> int:
    if totrmsabvgrd_ >= 10:
        return 5
    elif 6 <= totrmsabvgrd_ < 10:
        return 4
    elif 3 <= totrmsabvgrd_ < 6:
        return 3
    elif totrmsabvgrd_ < 3:
        return 2
    else:
        return 0

def GarageType_points(garagetype_) -> int:
    if garagetype_ == '2Types' or garagetype_ == 'Attchd':
        return 5
    elif garagetype_ == 'Basment' or garagetype_ == 'BuiltIn':
        return 4
    elif garagetype_ == 'CarPort':
        return 3
    elif garagetype_ == 'Detchd':
        return 2 
    else:
        return 0

def GarageYrBlt_points(garageyrblt_) -> int:
    if garageyrblt_ >= 2000:
        return 5
    elif 1971 <= garageyrblt_ < 2000:
        return 4
    elif 1920 <= garageyrblt_ < 1971:
        return 3
    elif garageyrblt_ < 1920:
        return 2
    else:
        return 0

def GarageArea_points(garagearea_) -> int:
    if garagearea_ >= 900:
        return 5
    elif 400 <= garagearea_ < 900:
        return 4
    elif 200 <= garagearea_ < 400:
        return 3
    elif 0 < garagearea_ < 200:
        return 2
    else:
        return 0

def PoolArea_points(poolarea_) -> int:
    if poolarea_ >= 100:
        return 5
    elif 50 <= poolarea_ < 100:
        return 4
    elif 2 <= poolarea_ < 50:
        return 3
    elif 0 < poolarea_ < 2:
        return 2
    else:
        return 0

def MiscFeature_points(miscfeature_) -> int:
    if miscfeature_ != 'NA':
        return 5
    else:
        return 0


In [379]:
def class_of_building(df: pd.DataFrame) -> pd.DataFrame:
    building_class = []
    for index, row in df.iterrows():
        count = 0
        count += LotFrontage_points(row['LotFrontage'])
        count += LotArea_points(row['LotArea'])
        count += Utilities_points(row['Utilities'])
        count += OverallQual_points(row['OverallQual'])
        count += OverallCond_points(row['OverallCond'])
        count += YearBuilt_points(row['YearBuilt'])
        count += BsmtFinType1_points(row['BsmtFinType1'])
        count += BsmtFinType2_points(row['BsmtFinType2'])
        count += TotalBsmtSF_points(row['TotalBsmtSF'])
        count += TotRmsAbvGrd_points(row['TotRmsAbvGrd'])
        count += GarageType_points(row['GarageType'])
        count += GarageYrBlt_points(row['GarageYrBlt'])
        count += GarageArea_points(row['GarageArea'])
        count += PoolArea_points(row['PoolArea'])
        count += MiscFeature_points(row['MiscFeature']) 
        building_class.append(round( count/NUMBER_OF_TEST))
    i = 0 
    for index, row in df.iterrows():
            df.loc[index,'ClassBuilt'] = building_class[i]
            i += 1 
    df['ClassBuilt'] = df['ClassBuilt'].astype(np.int32)
    return df
    

In [380]:
## age at the sold moment 
def age_of_building_at_the_time_of_sale(df: pd.DataFrame) -> pd.DataFrame:
    building_age = []
    for index, row in df.iterrows():
        building_age.append(int(row['YrSold'] - row['YearBuilt']))
    i = 0 
    for index, row in df.iterrows():
            df.loc[index,'BuiltAge'] = building_age[i]
            i += 1 
    df['BuiltAge'] = df['BuiltAge'].astype(np.int32)
    return df

In [381]:
def feature_generation(df: pd.DataFrame) -> pd.DataFrame:
    df = class_of_building(df)
    df = age_of_building_at_the_time_of_sale(df)
    return df

In [382]:
train = feature_generation(train)
test = feature_generation(test)

# save_as_pickle(test, '../data/processed/test.pkl')
# save_as_pickle(train, '../data/processed/data_for_train.pkl')


# Modeling

In [383]:
import os
import pandas as pd
import numpy as np

from sklearn.svm import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.metrics import *
from sklearn.impute import *
from sklearn.multioutput import *
from sklearn.metrics import precision_score, f1_score, recall_score, roc_auc_score

#%pip install catboost
import catboost
from catboost import Pool,cv
from catboost import CatBoostClassifier, CatBoostRegressor

import skmultilearn
from skmultilearn.model_selection import IterativeStratification

#%pip install category_encoders
import category_encoders as ce
#%pip install lightgbm
import lightgbm as ltb

## Training

In [384]:
# processed_data_path = '../data/processed/'
# train = pd.read_pickle(os.path.join(processed_data_path, 'data_for_train.pkl'))
# target = pd.read_pickle(os.path.join(processed_data_path, 'target_for_data_for_train.pkl'))

In [391]:
train_ = train

In [392]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train_, target, test_size=0.3, random_state=77)

In [397]:
# from sklearn.model_selection import GridSearchCV

# model_ = CatBoostRegressor(cat_features = CAT_COLS)
# parameters = {
#     'iterations': [100,150,300,500,1000,1100], 
#     'loss_function':['RMSE', 'MAPE', 'MAE'],
#     'learning_rate' : [0.01, 0.1, 0.02, 0.3],
#     'ctr_leaf_count_limit': [5, 10, 20, 30]
#     }
# clf = GridSearchCV(model_, parameters)
# clf.fit(X_train, Y_train)

In [388]:
##CATBOOST
model = CatBoostRegressor(
    iterations = 1000,
    loss_function='MAPE',
    cat_features = CAT_COLS)


pipeline_castboost = Pipeline ([
    ('model_cast', model)])

pipeline_castboost.fit(X_train,Y_train)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0:	learn: 0.2827555	total: 64.3ms	remaining: 1m 4s
1:	learn: 0.2803795	total: 131ms	remaining: 1m 5s
2:	learn: 0.2781094	total: 224ms	remaining: 1m 14s
3:	learn: 0.2772690	total: 298ms	remaining: 1m 14s
4:	learn: 0.2756719	total: 364ms	remaining: 1m 12s
5:	learn: 0.2737766	total: 427ms	remaining: 1m 10s
6:	learn: 0.2727841	total: 493ms	remaining: 1m 9s
7:	learn: 0.2708255	total: 564ms	remaining: 1m 9s
8:	learn: 0.2702294	total: 638ms	remaining: 1m 10s
9:	learn: 0.2690027	total: 695ms	remaining: 1m 8s
10:	learn: 0.2676575	total: 763ms	remaining: 1m 8s
11:	learn: 0.2667437	total: 834ms	remaining: 1m 8s
12:	learn: 0.2666251	total: 864ms	remaining: 1m 5s
13:	learn: 0.2663087	total: 924ms	remaining: 1m 5s
14:	learn: 0.2645759	total: 988ms	remaining: 1m 4s
15:	learn: 0.2632008	total: 1.06s	remaining: 1m 5s
16:	learn: 0.2617398	total: 1.14s	remaining: 1m 5s
17:	learn: 0.2612499	total: 1.21s	remaining: 1m 5s
18:	learn: 0.2601923	total: 1.26s	remaining: 1m 4s
19:	learn: 0.2591212	total: 1.32s	r

In [389]:
from sklearn.metrics import mean_squared_error,mean_absolute_error, mean_absolute_percentage_error ,mean_squared_log_error
from sklearn.metrics import r2_score

y_predict = pipeline_castboost.predict(X_test)

print(r2_score(Y_test, y_predict))
print(mean_squared_log_error(Y_test, y_predict))
print(mean_absolute_percentage_error(Y_test, y_predict))

0.22323370258365827
0.09329096492247681
0.17645928714925696


In [399]:
from sklearn.ensemble import RandomForestRegressor

real_pipe = Pipeline([
    ('scaler', StandardScaler())])

cat_pipe = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore',sparse=False))
])

preprocess_pipe = ColumnTransformer(transformers=[
    ('real_cols', real_pipe, REAL_COLS),
    ('cat_cols', cat_pipe, CAT_COLS),
    ('cat_bost_cols', ce.CountEncoder(), CAT_COLS), ##for work with cat_features
]
)

model_ltb = RandomForestRegressor()

model_pipe = Pipeline([
    ('preprocess', preprocess_pipe),
    ('model', model_ltb)
]
)

pipline_ltb = model_pipe
pipline_ltb.fit(X_train, Y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [400]:
y_predict = pipline_ltb.predict(X_test)

print(r2_score(Y_test, y_predict))
print(mean_squared_log_error(Y_test, y_predict))
print(mean_absolute_percentage_error(Y_test, y_predict))

0.8512954222995871
0.023178064153545997
0.11025103242494004


In [None]:
# from joblib import dump, load
# path_to_model = '../models/'
# dump(pipeline_castboost, os.path.join(path_to_model, 'pipeline_castboost.joblib')) 
# dump(pipline_ltb, os.path.join(path_to_model, 'pipline_ltb.joblib')) 

## Inference

In [None]:
# #processed_data_path = '../data/processed/'
# #path_to_model = '../models/'
# test = pd.read_pickle(os.path.join(processed_data_path, 'test.pkl'))
# model_1  = load(os.path.join(path_to_model, 'pipeline_castboost.joblib')) 
# model_2 = load(os.path.join(path_to_model, 'pipline_ltb.joblib')) 

In [None]:
y_pred_1  = model_1.predict(test)
print(y_pred_1)
y_pred_2  = model_2.predict(test)
print(y_pred_2)