## Улучшение модели

Используя данные из предыдущего файла, попробуем ограничить количество признаков и сравнить качество предсказания для модели с ограниченным количеством признаков.  
Для этого вначале используем обычный препроцессинг, а затем обрежем датасет с учетом определенной важности признаков.

In [1]:
import numpy as np
import pandas as pd

In [12]:
pd.options.mode.chained_assignment = None

In [2]:
import pickle

In [3]:
import catboost

In [4]:
from ipynb.fs.full import process_functions as func

In [5]:
import importlib

In [6]:
importlib.reload(func)

<module 'ipynb.fs.full.process_functions' (D:\Learning\GeekBrains\_Final Project\process_functions.ipynb)>

In [224]:
from sklearn.model_selection import train_test_split

Загрузим и проанализируем важность признаков для последующего удаления малозначимых.

In [185]:
importances = pd.read_csv('feature_importances.csv')

In [186]:
importances.head()

Unnamed: 0,features,importance
0,full_sq,40.565687
1,life_sq,0.757903
2,floor,1.038487
3,material,0.284146
4,build_year,1.832796


In [187]:
importances = importances.sort_values(by=['importance'], ascending=False)

In [188]:
importances.shape

(288, 2)

In [189]:
importances.head(10)

Unnamed: 0,features,importance
0,full_sq,40.565687
7,state,2.486626
6,kitch_sq,1.836963
4,build_year,1.832796
5,num_room,1.714835
281,cafe_count_5000_price_high,1.621309
227,cafe_sum_2000_max_price_avg,1.595274
225,cafe_count_2000,1.390217
105,ttk_km,1.192754
233,cafe_count_2000_price_2500,1.160971


На основании выведенного фрагмента датафрейма четко видно, что наиболее значительное влияние имеет площадь жилья (full_sq). 
Остальные факторы намного менее значимы.
Поставим условную границу для отсечения малозначимых признаков в виде 1.0

In [190]:
importances = importances[importances['importance'] > 1.0]

In [191]:
importances.shape

(11, 2)

In [192]:
features_list = importances['features'].to_list()

In [193]:
importances.head(12)

Unnamed: 0,features,importance
0,full_sq,40.565687
7,state,2.486626
6,kitch_sq,1.836963
4,build_year,1.832796
5,num_room,1.714835
281,cafe_count_5000_price_high,1.621309
227,cafe_sum_2000_max_price_avg,1.595274
225,cafe_count_2000,1.390217
105,ttk_km,1.192754
233,cafe_count_2000_price_2500,1.160971


#### Обработка датасета с учетом важности признаков.

Загрузим еще раз исходный датасет и повторим препроцессинг, обрезав малозначимые признаки.

In [183]:
df = pd.read_csv('project_data/attempt_4/train.csv')

In [184]:
df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [146]:
# Load `means_dict` from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever be one produced by this project - never load a pickle from an untrusted source.
with open('mean_values.pickle', 'rb') as f:
            means_dict = pickle.load(f)

In [147]:
 means_dict

{'full_sq': 54.11184513245251,
 'life_sq': 35.99719165274595,
 'kitch_sq': 6.2605214278464585,
 'floor': 8.0,
 'num_room': 2.0}

In [148]:
features_list

['full_sq',
 'state',
 'kitch_sq',
 'build_year',
 'num_room',
 'cafe_count_5000_price_high',
 'cafe_sum_2000_max_price_avg',
 'cafe_count_2000',
 'ttk_km',
 'cafe_count_2000_price_2500',
 'floor']

In [194]:
features_list.append('timestamp')

In [195]:
features_list.append('price_doc')

In [196]:
df = df[features_list]

In [151]:
df.head()

Unnamed: 0,full_sq,state,kitch_sq,build_year,num_room,cafe_count_5000_price_high,cafe_sum_2000_max_price_avg,cafe_count_2000,ttk_km,cafe_count_2000_price_2500,floor,timestamp
0,43,,,,,0,1042.86,36,10.918587,2,4.0,2011-08-20
1,34,,,,,0,1190.48,21,3.103996,2,3.0,2011-08-23
2,43,,,,,0,977.27,24,2.927487,1,2.0,2011-08-27
3,89,,,,,1,1120.0,25,14.606501,1,9.0,2011-09-01
4,77,,,,,17,1269.23,483,1.721834,50,4.0,2011-09-05


Применим функции к датасету еще раз.

In [152]:
def _collapse_array(value):
    """Return the rounded mean of *value* if it is a NumPy array, else *value* unchanged."""
    if isinstance(value, np.ndarray):
        # Same arithmetic as the original cell: divide element-wise, sum, round.
        return round(sum(value / len(value)), 0)
    return value


def num_values_process(df, means):
    """Collapse array cells, drop outlier rows and patch implausibly small values.

    Steps, in order:

    1. Every ``num_room`` cell that holds a ``np.ndarray`` is replaced by the
       rounded mean of that array.
    2. Only rows with ``full_sq < 1000``, ``floor < 50``, ``num_room < 12`` and
       ``kitch_sq < 100`` are kept. A NaN in any of these columns compares as
       false, so such rows are dropped too.
    3. Values below a per-column minimum (``full_sq`` 14, ``floor`` 1,
       ``num_room`` 1, ``kitch_sq`` 2) are overwritten with ``means[column]``.

    Args:
        df: DataFrame containing the columns ``full_sq``, ``floor``,
            ``num_room`` and ``kitch_sq``. It is not modified.
        means: Mapping with a replacement value for each of those four columns.

    Returns:
        A new DataFrame with the surviving rows (original index labels kept).
    """
    # Element-wise apply works with any index; the previous chained
    # ``df[col].iloc[label] = ...`` mixed labels with positions and relied on
    # chained assignment, which does nothing under pandas copy-on-write.
    df = df.assign(num_room=df['num_room'].apply(_collapse_array))

    # trim too large values
    keep = (
        (df['full_sq'] < 1000)
        & (df['floor'] < 50)
        & (df['num_room'] < 12)
        & (df['kitch_sq'] < 100)
    )
    # Explicit copy so the writes below never target a view of the input.
    df = df.loc[keep].copy()

    minimums = (('full_sq', 14), ('floor', 1), ('num_room', 1), ('kitch_sq', 2))
    for column, minimum in minimums:
        too_small = df[column] < minimum
        if not too_small.any():
            continue
        replacement = means[column]
        if (pd.api.types.is_integer_dtype(df[column])
                and not float(replacement).is_integer()):
            # Writing a fractional value into an integer column used to upcast
            # implicitly; do it explicitly so newer pandas does not reject it.
            df[column] = df[column].astype('float64')
        df.loc[too_small, column] = replacement

    return df

In [197]:
# Replace every `num_room` cell that holds a NumPy array with the rounded mean
# of that array. Done element-wise so it does not depend on the index being a
# default RangeIndex and does not rely on chained assignment
# (`df[col].iloc[i] = ...`), which is a no-op under pandas copy-on-write.
df['num_room'] = df['num_room'].apply(
    lambda x: round(sum(x / len(x)), 0) if isinstance(x, np.ndarray) else x
)

In [198]:
# Replace every `floor` cell that holds a NumPy array with the rounded mean of
# that array. Done element-wise so it does not depend on the index being a
# default RangeIndex and does not rely on chained assignment
# (`df[col].iloc[i] = ...`), which is a no-op under pandas copy-on-write.
df['floor'] = df['floor'].apply(
    lambda x: round(sum(x / len(x)), 0) if isinstance(x, np.ndarray) else x
)

In [199]:
# Replace every `state` cell that holds a NumPy array with the rounded mean of
# that array. Done element-wise so it does not depend on the index being a
# default RangeIndex and does not rely on chained assignment
# (`df[col].iloc[i] = ...`), which is a no-op under pandas copy-on-write.
df['state'] = df['state'].apply(
    lambda x: round(sum(x / len(x)), 0) if isinstance(x, np.ndarray) else x
)

In [200]:
df=num_values_process(df, means_dict)

In [201]:
features_list

['full_sq',
 'state',
 'kitch_sq',
 'build_year',
 'num_room',
 'cafe_count_5000_price_high',
 'cafe_sum_2000_max_price_avg',
 'cafe_count_2000',
 'ttk_km',
 'cafe_count_2000_price_2500',
 'floor',
 'timestamp',
 'price_doc']

In [202]:
mode_state = df['state'].mode().to_list()[0]
mode_state

2.0

In [203]:
df['state'] = df['state'].fillna(mode_state)

In [204]:
df.head()

Unnamed: 0,full_sq,state,kitch_sq,build_year,num_room,cafe_count_5000_price_high,cafe_sum_2000_max_price_avg,cafe_count_2000,ttk_km,cafe_count_2000_price_2500,floor,timestamp,price_doc
7672,73.0,2.0,11.0,,2.0,0,1166.67,3,10.868389,0,17.0,2013-04-27,10100000
8056,54.111845,3.0,12.0,1907.0,1.0,26,1503.45,474,2.301037,69,2.0,2013-05-21,2750000
8111,85.0,2.0,6.260521,,3.0,0,1500.0,3,17.691722,1,13.0,2013-05-23,7574496
8135,53.0,3.0,8.0,1980.0,2.0,0,1141.03,41,11.018216,2,10.0,2013-05-25,9000000
8144,41.0,2.0,6.260521,,1.0,0,1500.0,2,20.604213,0,13.0,2013-05-27,4457400


In [205]:
means_by_state = dict()

In [206]:
# For every feature column (all but the trailing `timestamp` and `price_doc`)
# store the most frequent value per `state`. Despite the name, this table
# holds modes, not means.
for column in features_list[:-2]:
    # `Series.mode()` returns several values when there is a tie; averaging
    # them keeps each entry a scalar, so the later `fillna` can never put an
    # array into a cell. A group with no non-null values yields NaN.
    mode_by_state = df.groupby('state')[column].agg(lambda s: s.mode().mean())
    means_by_state[column] = mode_by_state

In [207]:
# Fill the gaps in each column with the value stored for the row's `state`.
for column_name, per_state in means_by_state.items():
    state_based = df['state'].apply(per_state.get)
    df[column_name] = df[column_name].fillna(state_based)

In [208]:
df.shape

(20887, 13)

In [209]:
def year_population(value):
    """Normalise a raw ``build_year`` value to a four-digit year.

    Rules, applied in order:

    * missing (``None`` or NaN) -> returned unchanged;
    * ``value <= 22`` -> ``value + 2000``;
    * ``22 < value < 100`` -> ``value + 1900``;
    * ``100 <= value < 1000`` -> ``None`` (cannot be interpreted as a year);
    * anything else -> returned unchanged.

    The boundary value 22 used to match neither two-digit branch and was
    returned as the year 22; it is now mapped to 2022.
    """
    # NaN never equals None, so the previous `is None` test let NaN fall
    # through every comparison; handle missing values explicitly instead.
    if pd.isna(value):
        return value
    if value <= 22:
        return value + 2000
    if value < 100:
        return value + 1900
    if value < 1000:
        return None
    return value

In [210]:
# Normalise every raw build year element-wise with `year_population`.
df['build_year'] = df['build_year'].apply(year_population)

In [211]:
df.isna().sum()

full_sq                        0
state                          0
kitch_sq                       0
build_year                     1
num_room                       0
cafe_count_5000_price_high     0
cafe_sum_2000_max_price_avg    0
cafe_count_2000                0
ttk_km                         0
cafe_count_2000_price_2500     0
floor                          0
timestamp                      0
price_doc                      0
dtype: int64

In [212]:
build_year_mode = df['build_year'].mode().to_list()[0]
build_year_mode

1970.0

In [213]:
df['build_year']=df['build_year'].fillna(build_year_mode)

In [214]:
df.isna().sum()

full_sq                        0
state                          0
kitch_sq                       0
build_year                     0
num_room                       0
cafe_count_5000_price_high     0
cafe_sum_2000_max_price_avg    0
cafe_count_2000                0
ttk_km                         0
cafe_count_2000_price_2500     0
floor                          0
timestamp                      0
price_doc                      0
dtype: int64

In [215]:
# Feature columns stored as 32-bit integers; every other feature column
# (all but the trailing `timestamp` and `price_doc`) is cast to float.
integer_columns = (
    'build_year',
    'state',
    'num_room',
    'cafe_count_5000_price_high',
    'cafe_count_2000',
    'floor',
    'cafe_count_2000_price_2500',
)
for feature in features_list[:-2]:
    target_dtype = 'int32' if feature in integer_columns else 'float'
    df[feature] = df[feature].astype(target_dtype)

In [216]:
df.head()

Unnamed: 0,full_sq,state,kitch_sq,build_year,num_room,cafe_count_5000_price_high,cafe_sum_2000_max_price_avg,cafe_count_2000,ttk_km,cafe_count_2000_price_2500,floor,timestamp,price_doc
7672,73.0,2,11.0,1970,2,0,1166.67,3,10.868389,0,17,2013-04-27,10100000
8056,54.111845,3,12.0,1907,1,26,1503.45,474,2.301037,69,2,2013-05-21,2750000
8111,85.0,2,6.260521,1970,3,0,1500.0,3,17.691722,1,13,2013-05-23,7574496
8135,53.0,3,8.0,1980,2,0,1141.03,41,11.018216,2,10,2013-05-25,9000000
8144,41.0,2,6.260521,1970,1,0,1500.0,2,20.604213,0,13,2013-05-27,4457400


### Обучение и проверка модели без учета макроэкономических показателей

In [218]:
y = df['price_doc']

In [220]:
y.head()

7672    10100000
8056     2750000
8111     7574496
8135     9000000
8144     4457400
Name: price_doc, dtype: int64

In [219]:
X = df[features_list[:-2]]

In [221]:
X.head()

Unnamed: 0,full_sq,state,kitch_sq,build_year,num_room,cafe_count_5000_price_high,cafe_sum_2000_max_price_avg,cafe_count_2000,ttk_km,cafe_count_2000_price_2500,floor
7672,73.0,2,11.0,1970,2,0,1166.67,3,10.868389,0,17
8056,54.111845,3,12.0,1907,1,26,1503.45,474,2.301037,69,2
8111,85.0,2,6.260521,1970,3,0,1500.0,3,17.691722,1,13
8135,53.0,3,8.0,1980,2,0,1141.03,41,11.018216,2,10
8144,41.0,2,6.260521,1970,1,0,1500.0,2,20.604213,0,13


In [225]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.33, random_state = 33)

In [226]:
cat_features = ['build_year', 'state', 'num_room', 'cafe_count_5000_price_high', 
                  'cafe_count_2000', 'floor', 'cafe_count_2000_price_2500']

In [227]:
model_cb_best = catboost.CatBoostRegressor(depth = 8,
                                           iterations = 100,
                                           learning_rate = 0.1,
                                           l2_leaf_reg = 1,
                                           cat_features = cat_features)

In [229]:
model_cb_best.fit(X_train, 
             y_train, 
             eval_set = (X_valid, y_valid),
             verbose=False, 
             plot = True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x2319ded3790>

Используем тестовый датасет для оценки изменений.

In [230]:
valid_df = pd.read_csv('project_data/attempt_4/test.csv')

In [236]:
X_test = func.process(valid_df, 'test')

In [237]:
X_test = X_test[features_list[:-2]]

In [238]:
y_test = model_cb_best.predict(X_test)

In [239]:
result = pd.DataFrame(valid_df['id'])

In [240]:
result['price_doc'] = y_test.tolist()

In [241]:
result = result.set_index('id')

In [242]:
result.to_csv('submission.csv')

#### Результат
Объем датасета значительно уменьшился, но точность при этом стала хуже.

### Обучение и проверка модели с учетом макроэкономических показателей