# Дипломный проект


## Прогнозирование стоимости квартир на побережье Черного моря

Описание:
Нам поставлена задача создать модель, которая будет предсказывать стоимость квартир на Черноморском побережье.
Если наша модель работает хорошо, то мы сможем быстро выявлять выгодные предложения (когда желаемая цена продавца ниже предсказанной рыночной цены).

Датасет был получен при помощи парсинга сайта cian.ru и обработан на этапе EDA.

В данном ноутбуке мы сделаем следующее:
* Обработаем и отнормируем признаки
* Построим "наивную"/baseline модель, предсказывающую цену по общей площади и городу (с ней будем сравнивать другие модели)
* Построим модель на основе логистической регрессии
* Обучим модель на основе случайного леса
* Сделаем модель на основе градиентного бустинга с помощью CatBoost
* Применим кросс-валидацию для градиентного бустинга
* На основе предыдущих шагов выберем оптимальную модель

# Загрузка библиотек

In [1]:
#!pip install catboost

In [2]:
import random
import numpy as np 
import pandas as pd 
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

# plt
import matplotlib.pyplot as plt
#увеличим дефолтный размер графиков
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
#графики в svg выглядят более четкими
%config InlineBackend.figure_format = 'svg' 
%matplotlib inline

In [3]:
print('Python       :', sys.version.split('\n')[0])
print('Numpy        :', np.__version__)

Python       : 3.8.3 (default, Jul  2 2020, 16:21:59) 
Numpy        : 1.18.5


In [4]:
!pip freeze > requirements.txt

# Настройки

In [5]:
# Fixed seed so that experiments are reproducible
RANDOM_SEED = 42

# Fraction of rows passed as test_size to train_test_split
TEST_SIZE = 0.2

# Вспомогательные функции

In [6]:
# Mean absolute percentage error, returned as a fraction (0.15 means 15 %)
def mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Both inputs are converted to flat float arrays first, so the comparison is
    always positional: plain lists work, two pandas Series are not aligned by
    index, and an (n, 1) prediction column is not broadcast against an (n,)
    target.

    Args:
        y_true: Actual values (array-like). Must not contain zeros, because
            each error is divided by the corresponding actual value.
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        The mean of ``|y_pred - y_true| / |y_true|`` as a fraction, not a
        percentage.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_pred - y_true) / y_true))

# Загружаем данные

In [7]:
# Load the listings table from cian_eda.csv in the working directory
df = pd.read_csv('cian_eda.csv')

In [8]:
df.head()

Unnamed: 0,newBuilding,flatType,floorNumber,fromDeveloper,isApartments,isAuction,kitchenArea,livingArea,roomsCount,totalArea,...,cargoLiftsCount,materialType,hasBalcony,isBasement,isFirstFloor,isLastFloor,cityPopulation,cityArea,populationDensity,areaPerRoom
0,0,rooms,2,False,False,True,10.411423,26.0,1,54.4,...,0,monolithBrick,1,0,0,0,79056,18.2,4343.736264,54.4
1,1,rooms,7,True,False,True,8.091819,24.330994,1,42.28,...,0,monolith,0,0,0,0,79056,18.2,4343.736264,42.28
2,1,rooms,6,True,True,True,10.124343,30.442517,1,52.9,...,0,monolith,0,0,0,0,79056,18.2,4343.736264,52.9
3,0,rooms,2,False,True,True,10.0,13.81135,1,24.0,...,0,monolith,1,0,0,0,79056,18.2,4343.736264,24.0
4,0,rooms,3,False,False,True,8.61239,25.896281,1,45.0,...,1,monolith,1,0,0,0,79056,18.2,4343.736264,45.0


# Предобработка данных

In [9]:
# Columns that preproc_data replaces with integer category codes.
# NOTE(review): passengerLiftsCount / cargoLiftsCount look like counts rather
# than yes/no flags — confirm they belong in this list.
bin_features = [
    'newBuilding',
    'fromDeveloper',
    'isApartments',
    'isAuction',
    'isComplete',
    'hasBalcony',
    'isBasement',
    'isFirstFloor',
    'isLastFloor',
    'passengerLiftsCount',
    'cargoLiftsCount',
]

# Categorical columns that preproc_data one-hot encodes
cat_features = [
    'flatType',
    'region',
    'city',
    'materialType',
]
 
# Numeric columns that preproc_data min-max scales
num_features = [
    'floorNumber',
    'kitchenArea',
    'livingArea',
    'roomsCount',
    'totalArea',
    'floorsCount',
    'cityPopulation',
    'cityArea',
    'populationDensity',
    'areaPerRoom',
]

In [10]:
def preproc_data(df_input):
    """Return a preprocessed copy of ``df_input`` ready for model training.

    The transformations are applied to a copy, so the caller's frame is not
    modified:

    1. every column in ``num_features`` is min-max scaled on its own;
    2. every column in ``bin_features`` is replaced by its integer category code;
    3. every column in ``cat_features`` is one-hot encoded.

    Columns not named in those lists (for example ``price``) are passed through
    unchanged.

    Args:
        df_input: DataFrame containing every column listed in ``num_features``,
            ``bin_features`` and ``cat_features``.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # Copy the argument itself; copying the module-level ``df`` here would make
    # the function ignore whatever frame the caller passes in.
    df_output = df_input.copy()

    # Min-max scaling of the numeric features, one column at a time.
    # NOTE(review): the scaler is fitted on the whole frame passed in; when that
    # frame is later split into train and test, test rows have influenced the
    # scaling — consider fitting on the training part only.
    scaler = MinMaxScaler()
    for column in num_features:
        df_output[column] = scaler.fit_transform(df_output[[column]])[:, 0]
        # Log-transforming the features made the score worse, so it stays disabled.

    # Label encoding: replace each flag-like column with its category codes
    for column in bin_features:
        df_output[column] = df_output[column].astype('category').cat.codes

    # One-hot encoding of the multi-valued categorical columns
    df_output = pd.get_dummies(df_output, columns=cat_features, dummy_na=False)

    return df_output

In [11]:
# Run the preprocessing and inspect a random sample of the result
df_preproc = preproc_data(df)
df_preproc.sample(10)

Unnamed: 0,newBuilding,floorNumber,fromDeveloper,isApartments,isAuction,kitchenArea,livingArea,roomsCount,totalArea,price,...,materialType_foamConcreteBlock,materialType_gasSilicateBlock,materialType_monolith,materialType_monolithBrick,materialType_old,materialType_panel,materialType_stalin,materialType_unknown,materialType_wireframe,materialType_wood
15013,0,0.0,0,0,0,0.13825,0.230493,0.0,0.327731,5100000,...,0,0,1,0,0,0,0,0,0,0
4842,0,0.0,0,0,0,0.082508,0.144068,0.0,0.184874,2550000,...,0,0,0,0,0,0,0,0,0,0
27437,0,0.333333,0,0,0,0.115512,0.364407,0.4,0.470588,7100000,...,0,0,0,0,0,1,0,0,0,0
2350,0,0.0,0,0,0,0.069307,0.170339,0.0,0.195798,3000000,...,0,0,0,0,0,0,0,0,0,0
2892,0,0.125,0,1,0,0.187518,0.306572,0.2,0.458824,5200000,...,0,0,0,0,0,0,0,1,0,0
2576,1,0.125,0,0,0,0.148515,0.135593,0.0,0.273109,2928750,...,0,0,1,0,0,0,0,0,0,0
24241,0,0.0,0,0,0,0.219472,0.225424,0.0,0.420168,4300000,...,0,0,0,0,0,0,0,1,0,0
17799,0,0.0,0,0,0,0.049505,0.135593,0.0,0.117647,3850000,...,0,0,0,0,0,0,0,1,0,0
24829,0,0.125,0,0,0,0.082508,0.219492,0.0,0.27395,4100000,...,0,0,0,0,0,0,0,0,0,0
23817,0,0.0,0,0,0,0.115512,0.144068,0.0,0.229412,3600000,...,0,0,1,0,0,0,0,0,0,0


In [12]:
df_preproc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31315 entries, 0 to 31314
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   newBuilding                     31315 non-null  int8   
 1   floorNumber                     31315 non-null  float64
 2   fromDeveloper                   31315 non-null  int8   
 3   isApartments                    31315 non-null  int8   
 4   isAuction                       31315 non-null  int8   
 5   kitchenArea                     31315 non-null  float64
 6   livingArea                      31315 non-null  float64
 7   roomsCount                      31315 non-null  float64
 8   totalArea                       31315 non-null  float64
 9   price                           31315 non-null  int64  
 10  floorsCount                     31315 non-null  float64
 11  isComplete                      31315 non-null  int8   
 12  passengerLiftsCount             

## Split Data

In [13]:
# Target vector: the listing price as a NumPy array
y = df_preproc['price'].values
# Feature matrix: every preprocessed column except the target
X = df_preproc.drop(columns=['price'])

In [14]:
# Shuffled hold-out split; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    shuffle=True,
)

# Обучение модели

## Model 1: Создадим "наивную" модель 
Эта модель будет предсказывать медианную стоимость квартир с такой же общей площадью в том же городе.
С ней будем сравнивать другие модели.



In [15]:
# Split for the naive model: same size and seed as the main split, but on the raw frame
data_train, data_test = train_test_split(df, test_size=TEST_SIZE, shuffle=True, random_state=RANDOM_SEED)

# Naive model: median training price of flats with exactly the same total area
# in the same city. A single groupby plus a left join replaces one
# DataFrame.query per test row, and avoids building query strings from data.
key_columns = ['totalArea', 'city']
median_by_key = (
    data_train.groupby(key_columns)['price']
    .median()
    .rename('predict')
    .reset_index()
)
# The left join keeps the test rows in their original order; (area, city)
# pairs never seen in training come out as NaN.
predicts = data_test[key_columns].merge(median_by_key, on=key_columns, how='left')[['predict']]

# Fill the pairs that had no match with the median of the predictions that were found
predicts = predicts.fillna(predicts.median())

# Round down to whole thousands
predicts = (predicts // 1000) * 1000

# Evaluate the baseline
print(f"Точность наивной модели по метрике MAPE: {(mape(data_test['price'], predicts.values[:, 0]))*100:0.2f}%")

Точность наивной модели по метрике MAPE: 22.29%


## Model 2: LogisticRegression

In [16]:
# model = LogisticRegression(solver='saga', max_iter=200, n_jobs=-1)
# # Fit the model on the training set
# model.fit(X_train, y_train)

In [17]:
# test_predict_LR = model.predict(X_test)
# print(f"TEST mape: {(mape(y_test, test_predict_LR))*100:0.2f}%")

TEST mape: 23.36

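Логистическая регрессия работает долго на нашем датасете и показывает результаты хуже, чем у наивной модели. Это ожидаемо: LogisticRegression — классификатор, который воспринимает каждое уникальное значение цены как отдельный класс, поэтому для задачи регрессии он не подходит.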

## Model 3: RandomForestRegressor

In [18]:
# Random forest with 100 trees, seeded for reproducibility; n_jobs=-1 and
# verbose=1 are passed through to scikit-learn unchanged
model = RandomForestRegressor(
    n_estimators=100,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbose=1,
)

In [19]:
# Fit the model on the training set
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.1s finished


RandomForestRegressor(n_jobs=-1, random_state=42, verbose=1)

In [20]:
# Predict prices for the hold-out rows and report MAPE as a percentage
test_predict_RFR = model.predict(X_test)
print(f"TEST mape: {(mape(y_test, test_predict_RFR))*100:0.2f}%")

TEST mape: 15.88%


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


Случайный лес работает очень быстро и показывает приемлемые результаты.

## Model 4: CatBoostRegressor

Обучим модель на основе catboost, предварительно настроим параметры для получения наилучшего результата.

In [21]:
# CatBoost regressor: up to 7000 boosting iterations, MAPE as the evaluation
# metric, RMSE and MAE tracked as extra metrics. Depth, learning rate and
# task type are left at their defaults (the commented-out lines).
model = CatBoostRegressor(iterations = 7000,
                          #depth=12,
                          #learning_rate = 0.02,
                          random_seed = RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['RMSE', 'MAE'],
                          od_wait=500,
                          #task_type='GPU',
                         )
# NOTE(review): the hold-out set is passed as eval_set together with
# use_best_model=True, so the number of trees is presumably chosen on the same
# rows that are scored afterwards — the reported test MAPE is likely
# optimistic. Consider a separate validation split.
model.fit(X_train, y_train,
         eval_set=(X_test, y_test),
         verbose_eval=100,
         use_best_model=True,
         #plot=True
         )

Learning rate set to 0.025198
0:	learn: 0.4471982	test: 0.4488975	best: 0.4488975 (0)	total: 54.1ms	remaining: 6m 18s
100:	learn: 0.2245133	test: 0.2226927	best: 0.2226927 (100)	total: 619ms	remaining: 42.3s
200:	learn: 0.1973308	test: 0.1952692	best: 0.1952692 (200)	total: 1.3s	remaining: 44.2s
300:	learn: 0.1884244	test: 0.1868748	best: 0.1868748 (300)	total: 1.94s	remaining: 43.2s
400:	learn: 0.1837890	test: 0.1827972	best: 0.1827972 (400)	total: 2.76s	remaining: 45.4s
500:	learn: 0.1805586	test: 0.1803172	best: 0.1803172 (500)	total: 3.57s	remaining: 46.3s
600:	learn: 0.1779863	test: 0.1784174	best: 0.1784174 (600)	total: 4.57s	remaining: 48.7s
700:	learn: 0.1757665	test: 0.1768621	best: 0.1768611 (699)	total: 5.55s	remaining: 49.9s
800:	learn: 0.1737174	test: 0.1754082	best: 0.1754082 (800)	total: 6.31s	remaining: 48.8s
900:	learn: 0.1719221	test: 0.1743347	best: 0.1743347 (900)	total: 7.19s	remaining: 48.7s
1000:	learn: 0.1703156	test: 0.1733613	best: 0.1733613 (1000)	total: 8.09

<catboost.core.CatBoostRegressor at 0x7faff527fcd0>

In [22]:
# Predict prices for the hold-out rows with the CatBoost model and report MAPE as a percentage
test_predict_catboost = model.predict(X_test)
print(f"TEST mape: {(mape(y_test, test_predict_catboost))*100:0.2f}%")

TEST mape: 16.23%


Catboost показал результаты хуже, чем RandomForest

## Model 5: Catboost CV

Организуем обучение модели на 5 фолдах, с дальнейшим объединением предсказаний от каждой модели.

In [23]:
def cat_model(y_train, X_train, X_test, y_test):
    """Fit a CatBoost regressor on one train/evaluation pair and return it.

    Note the argument order: the training target comes first.

    Args:
        y_train: Target values for the training rows.
        X_train: Feature rows used for fitting.
        X_test: Feature rows passed to CatBoost as the evaluation set.
        y_test: Target values for the evaluation set.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Depth, learning rate and task type are intentionally left at CatBoost defaults
    settings = {
        'iterations': 7000,
        'random_seed': RANDOM_SEED,
        'eval_metric': 'MAPE',
        'custom_metric': ['RMSE', 'MAE'],
        'od_wait': 500,
    }
    regressor = CatBoostRegressor(**settings)

    # Progress is logged every 100 iterations; use_best_model is enabled
    # against the evaluation set
    regressor.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        verbose_eval=100,
        use_best_model=True,
    )
    return regressor

In [24]:
# One column per fold model; each column holds that model's predictions for
# every row of X.
# NOTE(review): each fold model was fitted on X.iloc[train_idx], so most of the
# values stored here are in-sample predictions, not hold-out ones.
submissions = pd.DataFrame(0, index=df.index, columns=["sub_1"])
score_ls = []
splits = list(
    KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, y)
)

for idx, (train_idx, test_idx) in tqdm(enumerate(splits), total=len(splits)):
    # Slice this fold's training and evaluation parts by position
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_test, y_test = X.iloc[test_idx], y[test_idx]

    # Train a fresh model for the fold
    model = cat_model(y_train, X_train, X_test, y_test)

    # Score it on the rows it was not fitted on
    test_predict = model.predict(X_test)
    test_score = mape(y_test, test_predict)
    score_ls.append(test_score)
    print(f"{idx+1} Fold Test MAPE: {mape(y_test, test_predict):0.3f}")

    # Keep full-dataset predictions and persist the model
    submissions[f'sub_{idx+1}'] = model.predict(X)
    model.save_model(f'catboost_fold_{idx+1}.model')

# Summary of the per-fold scores
print(f'Mean Score: {np.mean(score_ls):0.3f}')
print(f'Std Score: {np.std(score_ls):0.4f}')
print(f'Max Score: {np.max(score_ls):0.3f}')
print(f'Min Score: {np.min(score_ls):0.3f}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Learning rate set to 0.025198
0:	learn: 0.4476575	test: 0.4493186	best: 0.4493186 (0)	total: 14.8ms	remaining: 1m 43s
100:	learn: 0.2240187	test: 0.2222793	best: 0.2222793 (100)	total: 719ms	remaining: 49.1s
200:	learn: 0.1969992	test: 0.1953606	best: 0.1953606 (200)	total: 1.42s	remaining: 48s
300:	learn: 0.1880496	test: 0.1868910	best: 0.1868910 (300)	total: 2.04s	remaining: 45.3s
400:	learn: 0.1833518	test: 0.1827151	best: 0.1827151 (400)	total: 3.07s	remaining: 50.6s
500:	learn: 0.1800014	test: 0.1800000	best: 0.1800000 (500)	total: 3.86s	remaining: 50s
600:	learn: 0.1775002	test: 0.1781819	best: 0.1781819 (600)	total: 4.66s	remaining: 49.6s
700:	learn: 0.1753625	test: 0.1766504	best: 0.1766504 (700)	total: 5.39s	remaining: 48.4s
800:	learn: 0.1735508	test: 0.1754997	best: 0.1754997 (800)	total: 6.04s	remaining: 46.8s
900:	learn: 0.1717889	test: 0.1743939	best: 0.1743939 (900)	total: 6.83s	remaining: 46.3s
1000:	learn: 0.1701092	test: 0.1733268	best: 0.1733252 (999)	total: 7.59s	re

1800:	learn: 0.1595768	test: 0.1751345	best: 0.1751345 (1800)	total: 14.2s	remaining: 40.9s
1900:	learn: 0.1587025	test: 0.1747575	best: 0.1747575 (1900)	total: 15s	remaining: 40.2s
2000:	learn: 0.1578041	test: 0.1743827	best: 0.1743827 (2000)	total: 15.7s	remaining: 39.3s
2100:	learn: 0.1569062	test: 0.1739840	best: 0.1739838 (2099)	total: 16.6s	remaining: 38.8s
2200:	learn: 0.1560603	test: 0.1736655	best: 0.1736588 (2195)	total: 17.5s	remaining: 38.1s
2300:	learn: 0.1552238	test: 0.1733383	best: 0.1733383 (2300)	total: 18.3s	remaining: 37.3s
2400:	learn: 0.1545140	test: 0.1730693	best: 0.1730693 (2400)	total: 19.1s	remaining: 36.7s
2500:	learn: 0.1537559	test: 0.1727692	best: 0.1727692 (2500)	total: 19.9s	remaining: 35.8s
2600:	learn: 0.1529605	test: 0.1724971	best: 0.1724951 (2599)	total: 20.8s	remaining: 35.2s
2700:	learn: 0.1522642	test: 0.1722517	best: 0.1722508 (2696)	total: 21.7s	remaining: 34.5s
2800:	learn: 0.1515647	test: 0.1720208	best: 0.1720208 (2800)	total: 22.6s	remaini

3500:	learn: 0.1478403	test: 0.1695625	best: 0.1695625 (3500)	total: 29.8s	remaining: 29.8s
3600:	learn: 0.1472705	test: 0.1694877	best: 0.1694821 (3595)	total: 30.5s	remaining: 28.8s
3700:	learn: 0.1467138	test: 0.1694211	best: 0.1694180 (3693)	total: 31.3s	remaining: 27.9s
3800:	learn: 0.1461697	test: 0.1693031	best: 0.1692949 (3797)	total: 32.3s	remaining: 27.2s
3900:	learn: 0.1456519	test: 0.1691620	best: 0.1691620 (3900)	total: 33.2s	remaining: 26.4s
4000:	learn: 0.1451273	test: 0.1691009	best: 0.1690996 (3992)	total: 33.9s	remaining: 25.4s
4100:	learn: 0.1446162	test: 0.1689941	best: 0.1689941 (4100)	total: 34.9s	remaining: 24.6s
4200:	learn: 0.1441179	test: 0.1688693	best: 0.1688593 (4192)	total: 35.7s	remaining: 23.8s
4300:	learn: 0.1436283	test: 0.1687415	best: 0.1687415 (4300)	total: 37.2s	remaining: 23.4s
4400:	learn: 0.1431266	test: 0.1686333	best: 0.1686325 (4399)	total: 38.2s	remaining: 22.5s
4500:	learn: 0.1426185	test: 0.1686044	best: 0.1685965 (4489)	total: 38.9s	remai

5300:	learn: 0.1377336	test: 0.1680194	best: 0.1680194 (5300)	total: 44.9s	remaining: 14.4s
5400:	learn: 0.1372924	test: 0.1679935	best: 0.1679935 (5400)	total: 45.7s	remaining: 13.5s
5500:	learn: 0.1368527	test: 0.1679801	best: 0.1679744 (5460)	total: 46.4s	remaining: 12.7s
5600:	learn: 0.1364171	test: 0.1679242	best: 0.1679242 (5600)	total: 47.2s	remaining: 11.8s
5700:	learn: 0.1359627	test: 0.1678149	best: 0.1678146 (5699)	total: 47.9s	remaining: 10.9s
5800:	learn: 0.1355680	test: 0.1677487	best: 0.1677480 (5798)	total: 48.7s	remaining: 10.1s
5900:	learn: 0.1351796	test: 0.1677197	best: 0.1677168 (5848)	total: 49.5s	remaining: 9.21s
6000:	learn: 0.1347469	test: 0.1676175	best: 0.1676107 (5997)	total: 50.2s	remaining: 8.36s
6100:	learn: 0.1343666	test: 0.1675882	best: 0.1675812 (6097)	total: 51s	remaining: 7.51s
6200:	learn: 0.1339512	test: 0.1675466	best: 0.1675428 (6186)	total: 51.7s	remaining: 6.67s
6300:	learn: 0.1335746	test: 0.1675106	best: 0.1675031 (6298)	total: 52.5s	remaini

5 Fold Test MAPE: 0.169

Mean Score: 0.167
Std Score: 0.0024
Max Score: 0.169
Min Score: 0.162


In [25]:
submissions.head(10)

Unnamed: 0,sub_1,sub_2,sub_3,sub_4,sub_5
0,6150055.0,6420567.0,7830331.0,6739648.0,6355821.0
1,6664767.0,6343361.0,6702276.0,6559411.0,6709008.0
2,10740020.0,10885230.0,10788240.0,10570010.0,10951060.0
3,4823804.0,4305026.0,4819319.0,5041234.0,5062820.0
4,6543354.0,6530025.0,6656011.0,6705112.0,6794609.0
5,7529061.0,7794898.0,8086521.0,7916178.0,7654107.0
6,3861869.0,3579131.0,3660145.0,3561473.0,3538923.0
7,6091849.0,6783414.0,6499555.0,6205268.0,6508734.0
8,6800348.0,6472557.0,6901472.0,6562097.0,6784395.0
9,4324761.0,4585461.0,4623920.0,4229054.0,4899326.0


In [26]:
# Average only the per-fold prediction columns. Summing every column and
# dividing by the column count gives the same result on the first run, but on a
# re-run it would fold the existing 'blend' column back into the average.
fold_columns = [col for col in submissions.columns if str(col).startswith('sub_')]
submissions['blend'] = submissions[fold_columns].mean(axis=1)

In [27]:
# NOTE(review): y covers the whole dataset and every fold model was fitted on
# part of it, so this is not a hold-out score despite the "TEST" label.
print(f"TEST mape: {(mape(y, submissions['blend']))*100:0.2f}%")

TEST mape: 13.70%


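Благодаря кросс-валидации нам удалось улучшить результаты работы catboost на 2.53 % по метрике mape. Однако эта оценка (13.70 %) посчитана на всём датасете, включая объекты, на которых обучались модели фолдов, поэтому она завышает качество; честная оценка на отложенных фолдах — в среднем 16.7 % (Mean Score выше).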

# Общие выводы:


В результате проведенной работы нам удалось добиться наилучшего результата, используя кросс-валидацию для catboost. Но если смотреть на скорость обучения, то для прототипа нам может подойти и случайный лес.