In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

from scipy.stats import mode

from IPython.display import Image

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, GradientBoostingClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error as mse, r2_score as r2, accuracy_score

from sklearn.svm import SVC

from sklearn.preprocessing import MinMaxScaler, RobustScaler

from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.cluster import KMeans, DBSCAN

In [None]:
def evaluate_predict(true_values, pred_values, save=False):
    """Print the R2 score and show a true-vs-predicted scatter plot.

    Parameters
    ----------
    true_values : array-like
        Ground-truth target values.
    pred_values : array-like
        Predicted values, same length as ``true_values``.
    save : bool, default False
        Kept for backward compatibility; it is not used (nothing is written to disk).
    """
    print(f'all sample R2: {r2(true_values, pred_values):.3f}')

    plt.figure(figsize=(8, 8))

    sns.scatterplot(x=pred_values, y=true_values)

    # Reference diagonal "prediction == truth". The previous bare
    # plt.plot(linestyle=..., color=...) call was given no data and drew nothing.
    low = min(np.min(true_values), np.min(pred_values))
    high = max(np.max(true_values), np.max(pred_values))
    plt.plot([low, high], [low, high], linestyle='--', color='black')

    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('True vs Predicted values')

    plt.show()

In [None]:
def preprocess_data(data):
    """Return ``data`` without rows that look corrupted or contain missing values.

    A row is removed when any of the plausibility checks below fires, or when
    any of its columns is NaN. The input frame itself is not modified.
    """
    rooms = data['Rooms']
    square = data['Square']
    life_square = data['LifeSquare']
    kitchen_square = data['KitchenSquare']
    house_floor = data['HouseFloor']
    house_year = data['HouseYear']

    # Each entry flags one kind of implausible value.
    checks = [
        data['DistrictId'] == 0,
        rooms == 0,
        rooms > 5,
        square > 300,
        square < 10,
        life_square > square,
        life_square > 100,
        life_square < 5,
        kitchen_square < 3,
        kitchen_square / square > 0.5,
        house_floor > 50,
        house_floor == 0,
        house_floor < data['Floor'],
        house_year > 2021,
        house_year == 1977,
        data['Social_3'] > 140,
    ]

    suspicious = checks[0]
    for check in checks[1:]:
        suspicious = suspicious | check

    rejected_index = data[suspicious].index
    return data.drop(rejected_index).dropna()

In [None]:
def gs_gbr(data, column, test_param, criterion='squared_error', random_state=42):
    """Tune and fit a GradientBoostingRegressor that predicts ``column`` from all other columns.

    The function holds out 33% of ``data`` for validation, grid-searches
    ``test_param`` with 5-fold CV (R2 scoring) on the remaining rows, prints the
    train and validation R2 of the best configuration, then refits that
    configuration on all of ``data`` and plots the fit with ``evaluate_predict``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the feature columns.
    column : str
        Name of the target column (converted with ``str``).
    test_param : dict
        ``param_grid`` passed to ``GridSearchCV``.
    criterion : str, default 'squared_error'
        Split criterion used both during the search and for the final model.
        The old spelling ``'mse'`` was removed in scikit-learn 1.2; pass
        ``criterion='mse'`` explicitly only on scikit-learn < 1.0.
    random_state : int, default 42
        Seed for the train/validation split and for the estimators.

    Returns
    -------
    GradientBoostingRegressor
        Model with the best found parameters, fitted on every row of ``data``.
    """
    column = str(column)
    x = data.drop(columns=column)
    y = data[column]

    x_train, x_val, y_train, y_val = train_test_split(x, y,
                                                      test_size=0.33,
                                                      shuffle=True,
                                                      random_state=random_state)

    # The search must use the same criterion as the final model; otherwise the
    # reported CV score and the selected parameters describe a different model.
    search = GridSearchCV(estimator=GradientBoostingRegressor(criterion=criterion,
                                                              random_state=random_state),
                          param_grid=test_param,
                          scoring='r2',
                          cv=5)

    search.fit(x_train, y_train)

    param = search.best_params_
    print(param)

    print(f'GridSearchCV train sample R2: {search.best_score_:.3f}')

    # Values coming from the grid take precedence over the defaults given here.
    final_params = {'criterion': criterion, 'random_state': random_state, **param}
    model = GradientBoostingRegressor(**final_params)

    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)

    print(f'train sample R2: {r2(y_train, y_train_pred):.3f}')

    y_valid_pred = model.predict(x_val)

    print(f'valid sample R2: {r2(y_val, y_valid_pred):.3f}')

    # Final refit on all rows: the returned model is the one used for imputation.
    model.fit(x, y)

    y_all_pred = model.predict(x)

    evaluate_predict(y, y_all_pred)

    return model

In [None]:
def gs_gbc(data, column, test_param, criterion='squared_error', random_state=42):
    """Tune and fit a GradientBoostingClassifier that predicts ``column`` from all other columns.

    The function holds out 33% of ``data`` for validation, grid-searches
    ``test_param`` with 5-fold CV (accuracy scoring) on the remaining rows,
    prints the train and validation accuracy of the best configuration, then
    refits that configuration on all of ``data`` and passes the predictions to
    ``evaluate_predict`` (which reports R2 on the class labels and plots them).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the feature columns.
    column : str
        Name of the target column (converted with ``str``).
    test_param : dict
        ``param_grid`` passed to ``GridSearchCV``.
    criterion : str, default 'squared_error'
        Split criterion used both during the search and for the final model.
        The old spelling ``'mse'`` was removed in scikit-learn 1.2; pass
        ``criterion='mse'`` explicitly only on scikit-learn < 1.0.
    random_state : int, default 42
        Seed for the train/validation split and for the estimators.

    Returns
    -------
    GradientBoostingClassifier
        Model with the best found parameters, fitted on every row of ``data``.
    """
    column = str(column)
    x = data.drop(columns=column)
    y = data[column]

    x_train, x_val, y_train, y_val = train_test_split(x, y,
                                                      test_size=0.33,
                                                      shuffle=True,
                                                      random_state=random_state)

    # The search must use the same criterion as the final model; otherwise the
    # reported CV score and the selected parameters describe a different model.
    search = GridSearchCV(estimator=GradientBoostingClassifier(criterion=criterion,
                                                               random_state=random_state),
                          param_grid=test_param,
                          scoring='accuracy',
                          cv=5)

    search.fit(x_train, y_train)

    param = search.best_params_
    print(param)

    print(f'GridSearchCV train sample accuracy: {search.best_score_:.3f}')

    # Values coming from the grid take precedence over the defaults given here.
    final_params = {'criterion': criterion, 'random_state': random_state, **param}
    model = GradientBoostingClassifier(**final_params)

    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)

    print(f'train sample accuracy: {accuracy_score(y_train, y_train_pred):.3f}')

    y_valid_pred = model.predict(x_val)

    print(f'valid sample accuracy: {accuracy_score(y_val, y_valid_pred):.3f}')

    # Final refit on all rows: the returned model is the one used for imputation.
    model.fit(x, y)

    y_all_pred = model.predict(x)

    evaluate_predict(y, y_all_pred)

    return model

In [None]:
def svc(data, column):
    """Sweep the SVC regularisation strength ``C`` and return the best classifier.

    ``data`` is split 67/33 into train and validation parts, the features are
    min-max scaled (scaler fitted on the training part only), and one
    ``SVC(gamma="auto")`` is fitted for each of 36 log-spaced ``C`` values in
    [1e-2, 1e5]. Accuracies are printed for every fifth value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the feature columns.
    column : str
        Name of the target column (converted with ``str``).

    Returns
    -------
    sklearn.svm.SVC
        The fitted model with the highest validation accuracy (the first one on
        ties, i.e. the smallest ``C``). Previously the model fitted last was
        returned regardless of its score.
        Note: the model was trained on min-max scaled features and the scaler
        is not returned, so callers must scale new data the same way.
    """
    column = str(column)
    x = data.drop(columns=column)
    y = data[column]

    x_train, x_val, y_train, y_val = train_test_split(x, y,
                                                      test_size=0.33,
                                                      shuffle=True,
                                                      random_state=42)
    scaler = MinMaxScaler()

    x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)

    x_val = pd.DataFrame(scaler.transform(x_val), columns=x_val.columns)

    c_values = np.logspace(-2, 5, 36)

    best_model = None
    best_c = None
    best_valid_accuracy = -np.inf

    for i, value in enumerate(c_values):
        model = SVC(C=value, gamma="auto")

        model.fit(x_train, y_train)

        valid_accuracy = accuracy_score(y_val, model.predict(x_val))
        train_accuracy = accuracy_score(y_train, model.predict(x_train))

        if i % 5 == 0:
            print('C = {}'.format(value))

            print(f'valid sample accuracy: {valid_accuracy:.3f}')

            print(f'train sample accuracy: {train_accuracy:.3f}')

        # Strict ">" keeps the earliest (most regularised) model on ties.
        if valid_accuracy > best_valid_accuracy:
            best_model = model
            best_c = value
            best_valid_accuracy = valid_accuracy

    print(f'best C = {best_c} (valid sample accuracy: {best_valid_accuracy:.3f})')

    return best_model

In [None]:
def reduce_dims(df, dims, method='pca', perplexity=30):
    """Project ``df`` onto ``dims`` components with PCA or t-SNE.

    Parameters
    ----------
    df : array-like or DataFrame
        Data handed to the reducer's ``fit_transform``.
    dims : int
        Number of output components.
    method : {'pca', 'tsne'}, default 'pca'
        Which reducer to use; any other value fails the assertion below.
    perplexity : default 30
        Passed to ``TSNE`` only.

    Returns
    -------
    tuple
        ``(fitted_reducer, components)``, where ``components`` is a DataFrame
        with columns ``component_1`` ... ``component_<dims>``.
    """
    assert method in ['pca', 'tsne'], 'Неверно указан метод'

    if method == 'pca':
        dim_reducer = PCA(n_components=dims, random_state=42)
    elif method == 'tsne':
        dim_reducer = TSNE(n_components=dims, learning_rate=250, random_state=42, perplexity=perplexity)
    else:
        # Only reachable when assertions are disabled.
        print('Error')

    components = dim_reducer.fit_transform(df)

    colnames = [f'component_{number}' for number in range(1, dims + 1)]
    return dim_reducer, pd.DataFrame(data=components, columns=colnames)

In [None]:
def display_components_in_2D_space(components_df, labels=None):
    """Scatter-plot ``component_1`` against ``component_2``.

    When ``labels`` is given it is appended as the last column of the plotted
    frame and used to colour the points with the 'jet' colormap.
    """
    plot_df = pd.concat([components_df, pd.DataFrame(labels)], axis=1)

    scatter_options = dict(kind='scatter',
                           x='component_1',
                           y='component_2',
                           alpha=0.5,
                           figsize=(10, 7))
    if labels is not None:
        # The label column is the last one after the concat above.
        scatter_options['c'] = plot_df.iloc[:, -1]
        scatter_options['cmap'] = plt.get_cmap('jet')

    plot_df.plot(**scatter_options)

    plt.xlabel('component_1')
    plt.ylabel('component_2')
    plt.title('2D mapping of objects')
    plt.show()

In [None]:
def corr(data):
    """Draw a heatmap of the pairwise correlation matrix of ``data``.

    The colour scale is fixed to [-1, 1] and centred at 0; x tick labels are
    right-aligned.
    """
    correlation_matrix = data.corr()
    palette = sns.diverging_palette(20, 220, n=200)

    heatmap_ax = sns.heatmap(correlation_matrix,
                             vmin=-1,
                             vmax=1,
                             center=0,
                             cmap=palette,
                             square=True)

    tick_labels = heatmap_ax.get_xticklabels()
    heatmap_ax.set_xticklabels(tick_labels, horizontalalignment='right')

In [None]:
# Load the training set and split its columns into groups.
data = pd.read_csv('train.csv')

# Columns set aside here are removed from `data` at the end of this cell, so
# the imputation models fitted in later cells never see them.
# NOTE(review): 'Helthcare_2' is presumably the spelling used in the CSV header — confirm.
no_teach_column = ['Id', 
                   'Ecology_1', 
                   'Social_1', 
                   'Helthcare_2', 
                   'Shops_1',
                   'Ecology_2', 
                   'Ecology_3', 
                   'Shops_2',
                   'Price']

data_recov = data[no_teach_column]
# Subset of the set-aside columns without Id and the three columns that are
# one-hot encoded below.
no_teach_column_noobj = ['Ecology_1', 
                         'Social_1', 
                         'Helthcare_2', 
                         'Shops_1',
                         'Price']

data_recov_noobj = data[no_teach_column_noobj]

# These three columns are one-hot encoded with get_dummies.
obj = ['Ecology_2', 'Ecology_3', 'Shops_2']
data_obj = data[obj]
data_obj = pd.get_dummies(data_obj)
data_id = data['Id']
# From here on `data` holds only the columns that are cleaned/imputed below.
data = data.drop(no_teach_column, axis=1)

In [None]:
# Column dtypes and non-null counts of the columns that were set aside.
data_recov.info()

In [None]:
%%time

# Distribution of Healthcare_1 (this column is imputed later in the notebook).
sns.distplot(data['Healthcare_1']);
# sns_plot = sns.pairplot(data_recov_noobj);  # takes a very long time to compute
# sns_plot.savefig('./output/pairplot_bad_data.png')

# plt.clf()  

# Image(filename='./output/pairplot_data_recov_noobj.png')

In [None]:
broken_column = 'LifeSquare'

data_work = preprocess_data(data).drop(columns='Healthcare_1')

test_param = {'n_estimators': [170], 
              'min_samples_leaf': [12],
              'max_depth': [4]}

ls_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
data_work.info()

In [None]:
data_correct = data.dropna(subset=['LifeSquare']).drop(data[(data['LifeSquare'] > data['Square']) |
                                                            (data['LifeSquare'] > 100) |
                                                            (data['LifeSquare'] < 5)].index)

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1).drop(columns='Healthcare_1')

data_predicted[broken_column] = ls_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()

In [None]:
broken_column = 'Healthcare_1'

data_work = preprocess_data(data)

test_param = {'n_estimators': [300], 
              'min_samples_leaf': [9],
              'max_depth': [6]}

h1_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
data_correct = data.dropna()

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = h1_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()

In [None]:
# Fit a model that predicts DistrictId from the other columns; it is used
# further down to replace DistrictId == 0.
# NOTE(review): gs_gbr fits a regressor, so the predicted district ids are
# real-valued rather than existing ids — confirm this is intended.
broken_column = 'DistrictId'

data_work = preprocess_data(data)

test_param = {'n_estimators': [100], 
              'min_samples_leaf': [13],
              'max_depth': [18]}

d_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
# Number of rows per district id.
data['DistrictId'].value_counts()

## GradientBoostingClassifier

In [None]:
# broken_column = 'DistrictId'

# data_work = preprocess_data(data)

# test_param = {'n_estimators': [100], 
#               'min_samples_leaf': [10],
#               'max_depth': [20]}

# dc_model = gs_gbc(data_work, broken_column, test_param)

# takes a very long time to compute
# {'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 100}
# GridSearchCV train sample accuracy: 0.243
# train sample accuracy: 0.400
# valid sample accuracy: 0.265
# all sample R2: -0.030

# no words

## Support Vector Machine

In [None]:
# broken_column = 'DistrictId'

# data_work = preprocess_data(data)

# d_svc_model = svc(data_work, broken_column)

# C = 0.01
# valid sample accuracy: 0.040
# train sample accuracy: 0.038
# C = 0.1
# valid sample accuracy: 0.040
# train sample accuracy: 0.038
# C = 1.0
# valid sample accuracy: 0.084
# train sample accuracy: 0.096
# C = 10.0
# valid sample accuracy: 0.203
# train sample accuracy: 0.237
# C = 100.0
# valid sample accuracy: 0.416
# train sample accuracy: 0.490
# C = 1000.0
# valid sample accuracy: 0.561
# train sample accuracy: 0.747
# C = 10000.0
# valid sample accuracy: 0.615
# train sample accuracy: 0.897
# C = 100000.0
# valid sample accuracy: 0.614
# train sample accuracy: 0.960

# # GBC works better
# # long calculation

In [None]:
# Rows with a non-zero DistrictId are kept as they are.
data_correct = data.drop(data[(data['DistrictId'] == 0)].index)

# The remaining rows get DistrictId from d_model; the column is dropped first
# so the feature matrix matches what the model was fitted on.
data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = d_model.predict(data_predicted)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([data_correct, data_predicted]).sort_index()
data.describe()

In [None]:
broken_column = 'Rooms'

data_work = preprocess_data(data)

test_param = {'n_estimators': [100], 
              'min_samples_leaf': [20],
              'max_depth': [20]}

r_model = gs_gbc(data_work, broken_column, test_param)

In [None]:
data_correct = data.drop(data[(data['Rooms'] == 0) | (data['Rooms'] > 5)].index)

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = r_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()


In [None]:
broken_column = 'Square'

data_work = preprocess_data(data)

test_param = {'n_estimators': [120], 
              'min_samples_leaf': [3],
              'max_depth': [4]}

sq_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
data_correct = data.drop(data[(data['Square'] < 10) | (data['Square'] > 300)].index)

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = sq_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()

In [None]:
# Fit the regressor that is used further down to replace implausible
# KitchenSquare values.
broken_column = 'KitchenSquare'

data_work = preprocess_data(data)

test_param = {'n_estimators': [150], 
              'min_samples_leaf': [13],
              'max_depth': [6]}

ksr_model = gs_gbr(data_work, broken_column, test_param)

# Recorded output of the classifier alternative, kept for comparison:
# GradientBoostingClassifier
# ksc_model = gs_gbc(data_work, broken_column, test_param) 
# 
# long calc
# {'max_depth': 5, 'min_samples_leaf': 17, 'n_estimators': 150}
# GridSearchCV train sample accuracy: 0.585
# train sample accuracy: 0.842
# valid sample accuracy: 0.589
# all sample R2: 0.635

In [None]:
# ks_svc_model = svc(data_work, broken_column)

# Result
# C = 0.01
# valid sample accuracy: 0.200
# train sample accuracy: 0.194
# C = 0.1
# valid sample accuracy: 0.368
# train sample accuracy: 0.351
# C = 1.0
# valid sample accuracy: 0.473
# train sample accuracy: 0.464
# C = 10.0
# valid sample accuracy: 0.504
# train sample accuracy: 0.498
# C = 100.0
# valid sample accuracy: 0.517
# train sample accuracy: 0.527
# C = 1000.0
# valid sample accuracy: 0.543
# train sample accuracy: 0.583
# C = 10000.0
# valid sample accuracy: 0.548
# train sample accuracy: 0.639
# C = 100000.0
# valid sample accuracy: 0.550
# train sample accuracy: 0.714

# GBR works better

In [None]:
# Rows whose kitchen is at least 3 and at most half of the total area are kept.
data_correct = data.drop(data[(data['KitchenSquare'] < 3) |
                              (data['KitchenSquare'] / data['Square'] > 0.5)].index)

# The remaining rows get KitchenSquare from ksr_model; the column is dropped
# first so the feature matrix matches what the model was fitted on.
data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = ksr_model.predict(data_predicted)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([data_correct, data_predicted]).sort_index()
data.describe()

In [None]:
# Fit the regressor that is used further down to replace implausible
# HouseFloor values.
broken_column = 'HouseFloor'

# Train on rows that pass every plausibility check and have no missing values.
data_work = preprocess_data(data)

# Single-value grid: the search only cross-validates this one configuration.
test_param = {'n_estimators': [200], 
              'min_samples_leaf': [40],
              'max_depth': [10]}

hf_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
# hf_svc_model = svc(data_work, broken_column)

# Result:
# C = 0.01
# valid sample accuracy: 0.189
# train sample accuracy: 0.192
# C = 0.1
# valid sample accuracy: 0.319
# train sample accuracy: 0.335
# C = 1.0
# valid sample accuracy: 0.370
# train sample accuracy: 0.386
# C = 10.0
# valid sample accuracy: 0.462
# train sample accuracy: 0.499
# C = 100.0
# valid sample accuracy: 0.485
# train sample accuracy: 0.519
# C = 1000.0
# valid sample accuracy: 0.512
# train sample accuracy: 0.579
# C = 10000.0
# valid sample accuracy: 0.520
# train sample accuracy: 0.652
# C = 100000.0
# valid sample accuracy: 0.503
# train sample accuracy: 0.728

# GBR better

In [None]:
data_correct = data.drop(data[(data['HouseFloor'] > 50) |
                              (data['HouseFloor'] == 0) |
                              (data['HouseFloor'] < data['Floor'])].index)

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = hf_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()

In [None]:
data[(data['Floor'] > data['HouseFloor'])].describe()

In [None]:
broken_column = 'HouseYear'

data_work = preprocess_data(data)

test_param = {'n_estimators': [150], 
              'min_samples_leaf': [20],
              'max_depth': [9]}

hy_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
data_correct = data.drop(data[(data['HouseYear'] > 2020) |
                              (data['HouseYear'] == 1977)].index)

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = hy_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()

In [None]:
data[(data['HouseYear'] > 2020)]

In [None]:
broken_column = 'Social_3'

data_work = preprocess_data(data)

test_param = {'n_estimators': [200], 
              'min_samples_leaf': [10],
              'max_depth': [10]}

s3_model = gs_gbr(data_work, broken_column, test_param)

In [None]:
data_correct = data.drop(data[(data['Social_3'] > 140)].index)

data_predicted = data.drop(data_correct.index).drop(broken_column, axis=1)

data_predicted[broken_column] = s3_model.predict(data_predicted)

data = data_correct.append(data_predicted).sort_index()
data.describe()

In [None]:
data_v1 = pd.concat([data, data_recov_noobj], axis=1)
data_v1.describe()

In [None]:
# %%time

# sns_plot = sns.pairplot(data);
# sns_plot.savefig("pairplot_data_r.png")

# plt.clf()  

# Image(filename='pairplot_data_r.png')

awesome and beautiful

In [None]:
# Correlation heatmap of the cleaned data together with the re-attached columns.
corr(data_v1)

In [None]:
# Robust-scaled copy of data_v1. In the cells visible here it is referenced
# only by the commented-out dimensionality-reduction experiments.
r_scaler = RobustScaler()

colnames = data_v1.columns
data_v1_scaled = pd.DataFrame(r_scaler.fit_transform(data_v1), columns=colnames)

data_v1_scaled.head()

In [None]:
# data_v2 = data_v1

In [None]:
# data_sq = data_v1_scaled[['Rooms', 'Square', 'LifeSquare']]

# data_s12 = data_v1_scaled[['Social_1', 'Social_2']]

# pca = PCA(n_components=1, random_state=42)

# data_v2['data_sq'] = pca.fit_transform(data_sq)

# data_v2['data_s12'] = pca.fit_transform(data_s12)


# data_v2 = data_v2.drop(['Rooms', 'Square', 'LifeSquare', 'Social_1', 'Social_2'], axis=1)
# data_v2.describe()

In [None]:
# data_v1 = data_v1.drop(['data_sq', 'data_s12'], axis=1)

In [None]:
# corr(data_v2)

In [None]:
# data_5c = data_v1_scaled[['Rooms', 'Square', 'LifeSquare', 'Social_1', 'Social_2']]
# dim_reducer5c, components_5c = reduce_dims(data_5c, 2, method='tsne')

In [None]:
# data_v3 = pd.concat([data_v1, components_5c], axis=1)
# data_v3 = data_v3.drop(['Rooms', 'Square', 'LifeSquare', 'Social_2'], axis=1)
# data_v3.describe()

In [None]:
# colnames = data_v3.columns
# data_v3_scaled = pd.DataFrame(r_scaler.fit_transform(data_v3), columns=colnames)

In [None]:
# corr(data_v3)

In [None]:
# target = data_v3['Price']
# display_components_in_2D_space(components_5c, target)

In [None]:
# data_8c = data_v1_scaled[['Rooms', 'Square', 'LifeSquare', 'Social_1', 'Social_2', 'Healthcare_1', 'Shops_1']]
# dim_reducer8c, components_8c = reduce_dims(data_8c, 3, method='tsne')

In [None]:
# data_v4 = pd.concat([data, components_8c], axis=1)
# data_v4 = data_v4.drop(['Rooms', 'Square', 'LifeSquare', 'Social_2', 'Healthcare_1'], axis=1)
# data_v4.describe()

In [None]:
# corr(data_v4)

In [None]:
# data_neg_cor_1 = data_v1_scaled[['HouseYear', 'DistrictId']]
# dim_reducer_neg_cor_1, components_neg_cor_1 = reduce_dims(data_neg_cor_1, 1, method='tsne')
# data_v5 = pd.concat([data_v4, components_neg_cor_1], axis=1)
# data_v5 = data_v4.drop(['HouseYear', 'DistrictId'], axis=1)
# data_v5.describe()

In [None]:
# corr(data_v5)

In [None]:
# data_neg_cor_2 = data_v3_scaled[['HouseYear', 'DistrictId', 'component_1', 'KitchenSquare']]
# dim_reducer_neg_cor_2, components_neg_cor_2 = reduce_dims(data_neg_cor_2, 2, method='tsne')
# data_v6 = pd.concat([data_v4, components_neg_cor_2], axis=1)
# data_v6 = data_v4.drop(['HouseYear', 'DistrictId', 'component_1', 'KitchenSquare'], axis=1)
# data_v6.describe()

In [None]:
# data_v1.columns

In [None]:
data_r_s = data_v1['Square'] / data_v1['Rooms']
data_l_k = data_v1['LifeSquare'] / data_v1['KitchenSquare']

data_v7 = data_v1[['DistrictId', 
                  'Floor',
                  'HouseFloor', 
                  'HouseYear', 
                  'Social_2', 
                  'Social_3', 
                  'Healthcare_1',
                  'Price', 
                  'Ecology_1', 
                  'Social_1', 
                  'Helthcare_2', 
                  'Shops_1']]
data_v7 = pd.concat([data_v7, data_r_s, data_l_k], axis=1)
data_v7

In [None]:
data_v8 = data_v1[['DistrictId',
                   'Rooms', 
                   'Square', 
                   'LifeSquare', 
                   'KitchenSquare',
                   'Floor',
                   'HouseFloor', 
                   'HouseYear', 
                   'Social_2', 
                   'Social_3', 
                   'Healthcare_1',
                   'Price', 
                   'Ecology_1', 
                   'Social_1', 
                   'Helthcare_2', 
                   'Shops_1']]
data_v8 = pd.concat([data_v8, data_r_s, data_l_k], axis=1)
data_v8

In [None]:
data_v1_o = pd.concat([data_v1, data_obj], axis=1)
# data_v2_o = pd.concat([data_v2, data_obj], axis=1)
# data_v3_o = pd.concat([data_v3, data_obj], axis=1)
# data_v4_o = pd.concat([data_v4, data_obj], axis=1)
# data_v5_o = pd.concat([data_v5, data_obj], axis=1)
# data_v6_o = pd.concat([data_v6, data_obj], axis=1)
data_v7_o = pd.concat([data_v7, data_obj], axis=1)
data_v8_o = pd.concat([data_v8, data_obj], axis=1)

In [None]:
# broken_column = 'Price'

# data_work = data_v1

# test_param = {'n_estimators': [400], 
#               'min_samples_leaf': [20],
#               'max_depth': (3, 4)}

# d1_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.726
# train sample R2: 0.875
# valid sample R2: 0.733
# all sample R2: 0.859

# v_8_o is better

In [None]:
# broken_column = 'Price'

# data_work = data_v2

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [5, 10],
#               'max_depth': (5, 10)}

# d2_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 300}
# GridSearchCV train sample R2: 0.731
# train sample R2: 0.913
# valid sample R2: 0.722
# all sample R2: 0.888
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v3

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [5, 10],
#               'max_depth': (5, 10)}

# d3_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.726
# train sample R2: 0.889
# valid sample R2: 0.728
# all sample R2: 0.868
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v4

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [5, 10],
#               'max_depth': (5, 10)}

# d4_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 300}
# GridSearchCV train sample R2: 0.678
# train sample R2: 0.882
# valid sample R2: 0.675
# all sample R2: 0.864
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v5

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d5_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.639
# train sample R2: 0.943
# valid sample R2: 0.645
# all sample R2: 0.921
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v6

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d6_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.556
# train sample R2: 0.904
# valid sample R2: 0.576
# all sample R2: 0.895
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v7

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d7_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.610
# train sample R2: 0.819
# valid sample R2: 0.617
# all sample R2: 0.784
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v8

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d8_model = gs_gbr(data_work, broken_column, test_param)

# no significant difference from data_v8

In [None]:
# broken_column = 'Price'

# data_work = data_v1_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d1o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.724
# train sample R2: 0.881
# valid sample R2: 0.735
# all sample R2: 0.863
    
# No significant difference with dummies

In [None]:
# broken_column = 'Price'

# data_work = data_v2_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d2o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 300}
# GridSearchCV train sample R2: 0.724
# train sample R2: 0.900
# valid sample R2: 0.726
# all sample R2: 0.878
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v3_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d3o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 300}
# GridSearchCV train sample R2: 0.724
# train sample R2: 0.902
# valid sample R2: 0.725
# all sample R2: 0.882
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v4_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d4o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 300}
# GridSearchCV train sample R2: 0.678
# train sample R2: 0.873
# valid sample R2: 0.680
# all sample R2: 0.850
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v5_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d5_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.640
# train sample R2: 0.941
# valid sample R2: 0.649
# all sample R2: 0.923
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v6_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d6o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 200}
# GridSearchCV train sample R2: 0.559
# train sample R2: 0.907
# valid sample R2: 0.580
# all sample R2: 0.896
    
# Пыжня

In [None]:
# broken_column = 'Price'

# data_work = data_v7_o

# test_param = {'n_estimators': [200, 300], 
#               'min_samples_leaf': [10],
#               'max_depth': (5, 10)}

# d7o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 300}
# GridSearchCV train sample R2: 0.611
# train sample R2: 0.856
# valid sample R2: 0.616
# all sample R2: 0.821
    
# Пыжня

## working model

In [None]:
# broken_column = 'Price'

# data_work = data_v8_o

# test_param = {'n_estimators': [275, 300, 325], 
#               'min_samples_leaf': [10],
#               'max_depth': (4, 6)}

# d8o_model = gs_gbr(data_work, broken_column, test_param)

# {'max_depth': 5, 'min_samples_leaf': 10, 'n_estimators': 250}
# GridSearchCV train sample R2: 0.724
# train sample R2: 0.895
# valid sample R2: 0.739
# all sample R2: 0.879
    
# working model, long calculation

In [None]:
# Final price model, trained on every row of the v8 feature set with dummies.
# The column order of `x` is the feature order the model expects at predict time.
x = data_v8_o.drop(['Price'], axis=1)
y = data_v8_o['Price']

# 'squared_error' is the current name of the former 'mse' criterion, which was
# removed in scikit-learn 1.2. random_state makes the fit reproducible.
final_model = GradientBoostingRegressor(max_depth=5,
                                        min_samples_leaf=10,
                                        n_estimators=250,
                                        criterion='squared_error',
                                        random_state=42)

final_model.fit(x, y)

# In-sample fit only: this R2 is measured on the training rows.
y_r = final_model.predict(x)

evaluate_predict(y, y_r)

### Preprocess test data

In [None]:
# Load the test set; `answer` starts as the Id column and receives the
# predicted Price at the end of the notebook.
data_test = pd.read_csv('test.csv')

answer = pd.DataFrame(data_test['Id'])

In [None]:
data_test.head()

In [None]:
# Same set-aside columns as for the training data, without 'Price'.
# NOTE: this rebinds the name no_teach_column defined in the training section.
no_teach_column = ['Id', 
                   'Ecology_1', 
                   'Social_1', 
                   'Helthcare_2', 
                   'Shops_1',
                   'Ecology_2', 
                   'Ecology_3', 
                   'Shops_2']

# Numeric set-aside columns, re-attached after the imputation steps.
data_test_recov = data_test[['Ecology_1', 
                       'Social_1', 
                       'Helthcare_2', 
                       'Shops_1']]

In [None]:
# One-hot encode the categorical columns, then keep only the columns that the
# imputation models were trained on.
data_test_obj = data_test[['Ecology_2', 'Ecology_3', 'Shops_2']]
data_test_obj = pd.get_dummies(data_test_obj)

data_test = data_test.drop(no_teach_column, axis=1)


def impute_column(frame, bad_rows, model, column, ignored=()):
    """Return ``frame`` with ``column`` re-predicted by ``model`` in the flagged rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to repair; it is not modified.
    bad_rows : boolean pandas.Series
        True for rows whose ``column`` value must be replaced.
    model : fitted estimator
        Its ``predict`` is called on the flagged rows without ``column`` and
        without the ``ignored`` columns.
    column : str
        Column to overwrite.
    ignored : iterable of str, default ()
        Columns left out of the feature matrix but kept in the result.
    """
    if not bad_rows.any():
        # Nothing to repair; predict() would fail on an empty feature matrix.
        return frame

    kept = frame[~bad_rows]
    repaired = frame[bad_rows].copy()

    features = repaired.drop(columns=[column, *ignored])
    repaired[column] = model.predict(features)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([kept, repaired]).sort_index()


# LifeSquare (Healthcare_1 is not a feature of ls_model, but its known values
# must stay in the frame instead of being turned into NaN)
data_test = impute_column(
    data_test,
    data_test['LifeSquare'].isna() |
    (data_test['LifeSquare'] > data_test['Square']) |
    (data_test['LifeSquare'] > 100) |
    (data_test['LifeSquare'] < 5),
    ls_model, 'LifeSquare', ignored=('Healthcare_1',))

# Healthcare_1
data_test = impute_column(
    data_test,
    data_test.isna().any(axis=1) | (data_test['Healthcare_1'] < 0),
    h1_model, 'Healthcare_1')

# DistrictId
data_test = impute_column(
    data_test,
    data_test['DistrictId'] == 0,
    d_model, 'DistrictId')

# Rooms
data_test = impute_column(
    data_test,
    (data_test['Rooms'] == 0) | (data_test['Rooms'] > 5),
    r_model, 'Rooms')

# Square
data_test = impute_column(
    data_test,
    (data_test['Square'] < 10) | (data_test['Square'] > 300),
    sq_model, 'Square')

# KitchenSquare
data_test = impute_column(
    data_test,
    (data_test['KitchenSquare'] < 3) |
    (data_test['KitchenSquare'] / data_test['Square'] > 0.5),
    ksr_model, 'KitchenSquare')

# HouseFloor
data_test = impute_column(
    data_test,
    (data_test['HouseFloor'] > 50) |
    (data_test['HouseFloor'] == 0) |
    (data_test['HouseFloor'] < data_test['Floor']),
    hf_model, 'HouseFloor')

# HouseYear
data_test = impute_column(
    data_test,
    (data_test['HouseYear'] > 2020) | (data_test['HouseYear'] == 1977),
    hy_model, 'HouseYear')

# Social_3
data_test = impute_column(
    data_test,
    data_test['Social_3'] > 140,
    s3_model, 'Social_3')

# Re-attach the numeric columns that were set aside before imputation.
data_test = pd.concat([data_test, data_test_recov], axis=1)

data_test.describe()

In [None]:
# Sanity check: rows whose Healthcare_1 is negative after imputation.
data_test[(data_test['Healthcare_1'] < 0)]

In [None]:
# Same two ratio features as in the training frame, followed by the one-hot columns.
data_s_r = data_test['Square'] / data_test['Rooms']
data_l_k = data_test['LifeSquare'] / data_test['KitchenSquare']
data_test['s_r'] = data_s_r
data_test['l_k'] = data_l_k
data_test = pd.concat([data_test, data_test_obj], axis=1)
data_test.describe()

In [None]:
# NOTE(review): final_model was fitted on data_v8_o without 'Price'; the columns
# of data_test must line up with that frame — confirm both order and names.
y_r = final_model.predict(data_test)

answer['Price'] = y_r

In [None]:
# Write the submission file (Id, Price) without the index column.
answer.to_csv('answer.csv', sep=',', index=False)