#### Импортируем данные и необходимые модули

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score as r2
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv('input/train.csv')

In [5]:
data.shape

(10000, 20)

#### Удаляем все квартиры, где больше 7 комнат, и где площадь меньше 16м2, и больше 200м2, а цена между  30000 и 60000 долларов

In [6]:
data.loc[(data['Square'] < 15) & (data['LifeSquare'] > 15), 'Square'] = data['LifeSquare']

In [7]:
data = data.loc[(data['Rooms'] <= 7) & (data['Square'].between(16, 200)) & (data['Price'].between(30000, 600000)), :]
data.shape

(9961, 20)

In [8]:
data.loc[data['Square'] < data['LifeSquare'], 'Square'] = data['LifeSquare']

In [9]:
data.loc[(data['Square'] > 15) & (data['LifeSquare'] < 10), 'LifeSquare'] = data['Square']
data['LifeSquare'] = data['LifeSquare'].fillna(data['Square'])

In [10]:
data = data.drop('Healthcare_1', axis=1)

In [11]:
data.loc[data['HouseFloor'] == 0, 'HouseFloor'] = data['Floor']

In [12]:
data = pd.get_dummies(data)

In [13]:
data.loc[data['Floor'] > data['HouseFloor'], 'HouseFloor'] = data['Floor']

In [14]:
data.head(20)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,5,0,11,184966.93073,0,1,0,1,0,1
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,1,1,16,300009.450063,0,1,0,1,0,1
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,0,1,3,220925.908524,0,1,0,1,0,1
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,3,0,5,175616.227217,0,1,0,1,0,1
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,1,2,4,150226.531644,0,1,0,1,0,1
5,12915,59,3.0,80.384479,46.68372,12.0,5,17.0,2011,0.309479,...,4,0,6,215898.447742,0,1,0,1,0,1
6,14549,154,2.0,62.254114,37.160377,7.0,3,5.0,1960,0.460556,...,14,1,5,296021.204377,0,1,0,1,0,1
7,11993,74,2.0,80.312926,80.312926,0.0,14,14.0,1977,0.075779,...,3,0,2,221244.156664,0,1,0,1,0,1
8,5172,1,2.0,64.511437,64.511437,1.0,9,17.0,1977,0.007122,...,0,0,1,229102.795999,0,1,0,1,0,1
9,8649,23,1.0,46.461409,18.915552,8.0,13,17.0,2014,0.075779,...,3,0,2,95380.220993,0,1,0,1,0,1


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9961 entries, 0 to 9999
Data columns (total 22 columns):
Id               9961 non-null int64
DistrictId       9961 non-null int64
Rooms            9961 non-null float64
Square           9961 non-null float64
LifeSquare       9961 non-null float64
KitchenSquare    9961 non-null float64
Floor            9961 non-null int64
HouseFloor       9961 non-null float64
HouseYear        9961 non-null int64
Ecology_1        9961 non-null float64
Social_1         9961 non-null int64
Social_2         9961 non-null int64
Social_3         9961 non-null int64
Helthcare_2      9961 non-null int64
Shops_1          9961 non-null int64
Price            9961 non-null float64
Ecology_2_A      9961 non-null uint8
Ecology_2_B      9961 non-null uint8
Ecology_3_A      9961 non-null uint8
Ecology_3_B      9961 non-null uint8
Shops_2_A        9961 non-null uint8
Shops_2_B        9961 non-null uint8
dtypes: float64(7), int64(9), uint8(6)
memory usage: 1.7 MB


In [16]:
data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = 2011
data["OldHouse"] = (data['HouseYear'] <= 1920).astype(int)

In [17]:
train, test = train_test_split(data, test_size=0.3, random_state=32)

In [18]:
train.shape

(6972, 23)

In [19]:
test.shape

(2989, 23)

In [20]:
from sklearn.metrics import confusion_matrix

In [21]:
district_stat = train.groupby(['DistrictId', 'Rooms'])[['Price']].mean().reset_index().rename(columns={'Price':'D_mean_price'})

In [22]:
train_reg = pd.merge(train, district_stat, on=['DistrictId', 'Rooms'])

In [23]:
train_reg['D_mean_price'].isnull().sum()

0

In [24]:
test_reg = pd.merge(test, district_stat, on=['DistrictId', 'Rooms'])

In [25]:
test_reg['D_mean_price'].isnull().sum()

0

In [26]:
Train_cor = train_reg.corr()
hight_corr_reg = Train_cor['Price']
hight_corr_reg = hight_corr_reg.drop('Price')
hight_corr_reg = hight_corr_reg.loc[abs(hight_corr_reg) > 0.01]

In [27]:
fts_reg = list(hight_corr_reg.keys())
fts_reg.remove('DistrictId')
fts_reg

['Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Ecology_2_A',
 'Ecology_2_B',
 'Ecology_3_A',
 'Ecology_3_B',
 'Shops_2_A',
 'Shops_2_B',
 'OldHouse',
 'D_mean_price']

In [28]:
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.neighbors import KNeighborsRegressor as KNN

In [29]:
scaler = StandardScaler()

In [30]:
train_reg_scaled = scaler.fit_transform(train_reg.loc[:, fts_reg])
test_reg_scaled = scaler.transform(test_reg.loc[:, fts_reg])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [31]:
best_train = 0
best_test = 0
for i in range(1, 30, 2):
    knn = KNN(n_neighbors=i, n_jobs=-1, weights='distance')
    knn.fit(train_reg_scaled, train_reg['Price'])
    pred_reg = knn.predict(train_reg_scaled)
    train_iter = r2(train_reg['Price'], pred_reg)
    pred_reg_test = knn.predict(test_reg_scaled)
    test_iter = r2(test_reg['Price'], pred_reg_test)
    if test_iter > best_test:
        best_train = train_iter
        best_test = test_iter
        best_neighbors = i
print(f"Best train r2 = {best_train}\nBest test r2 = {best_test}\nfor n_neighbors = {best_neighbors}")

Best train r2 = 1.0
Best test r2 = 0.6730983824809629
for n_neighbors = 9


In [32]:
Train_cor = train.corr()
hight_corr = Train_cor['Price']
hight_corr = hight_corr.drop('Price')
hight_corr = hight_corr.loc[abs(hight_corr) > 0.01]

In [33]:
fts = list(hight_corr.keys())
fts

['DistrictId',
 'Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Ecology_2_A',
 'Ecology_2_B',
 'Ecology_3_A',
 'Ecology_3_B',
 'Shops_2_A',
 'Shops_2_B',
 'OldHouse']

In [34]:
parameters = [{
    'n_estimators': [100, 150, 200], 
    'max_features': np.arange(5, 15, 2),
    'max_depth': np.arange(2, 15, 2)
              }]

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
clf = GridSearchCV(
    estimator=RF(random_state=42), 
    param_grid=parameters,
    scoring='r2',
    cv=5)

In [37]:
# clf.fit(train.loc[:, fts], train['Price'])

In [38]:
# clf.best_params_
train.shape

(6972, 23)

In [39]:
rf = RF(max_depth=14, n_estimators=150, random_state=42, max_features=5, n_jobs=-1)

In [40]:
rf.fit(train.loc[:, fts], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [41]:
pred = rf.predict(train.loc[:, fts])

In [42]:
r2(train['Price'], pred)

0.9286006923710686

In [43]:
pred_test = rf.predict(test.loc[:, fts])

In [44]:
r2(test['Price'], pred_test)

0.7310831670412755

#### Проверка на тестовый данных

In [45]:
test_data = data = pd.read_csv('input/test.csv')
test_data.shape

(5000, 19)

In [46]:
test_data.loc[(test_data['Square'] < 15) & (test_data['LifeSquare'] > 15), 'Square'] = test_data['LifeSquare']

In [47]:
test_data.loc[test_data['Square'] < test_data['LifeSquare'], 'Square'] = test_data['LifeSquare']

In [48]:
test_data['LifeSquare'] = test_data['LifeSquare'].fillna(test_data['Square'])

In [49]:
test_data.loc[test_data['HouseFloor'] == 0, 'HouseFloor'] = test_data['Floor']

In [50]:
test_data = pd.get_dummies(test_data)

In [51]:
test_data = test_data.drop('Healthcare_1', axis=1)

In [52]:
test_data.loc[test_data['Floor'] > test_data['HouseFloor'], 'HouseFloor'] = test_data['Floor']

In [53]:
test_data["OldHouse"] = (test_data['HouseYear'] <= 1920).astype(int)

In [54]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Ecology_2_A      5000 non-null uint8
Ecology_2_B      5000 non-null uint8
Ecology_3_A      5000 non-null uint8
Ecology_3_B      5000 non-null uint8
Shops_2_A        5000 non-null uint8
Shops_2_B        5000 non-null uint8
OldHouse         5000 non-null int64
dtypes: float64(6), int64(10), uint8(6)
memory usage: 654.4 KB


In [55]:
test_pred = rf.predict(test_data.loc[:, fts])

In [67]:
test_pred.size

5000

In [73]:
test_data["Price"] = test_pred
test_data[["Id", "Price"]].to_csv('DVdovin_predictions.csv', sep=',', index=False)

NameError: name 'DVdovin_predictions' is not defined