In [148]:
#IMPORT_LIB
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

from sklearn.metrics import r2_score as r2
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.manifold import TSNE
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [149]:
# Load the training and test sets from CSV files in the working directory.
data, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Quick look at the training frame (left disabled):
# data.info()
# data.describe()

In [150]:
# Hypotheses for data cleaning.
# The queries below sit inside a string literal, so none of them are executed;
# the cell merely evaluates to (and echoes) the string itself.
'''
data.loc[data['HouseYear']>2020]
data.loc[data['Rooms']>6]
data.loc[data['Rooms']==0]
data.loc[data['Square']>150]
data.loc[data['KitchenSquare']>20]
data.loc[data['KitchenSquare']<5]
data.loc[data['LifeSquare']<8]
'''

"\ndata.loc[data['HouseYear']>2020]\ndata.loc[data['Rooms']>6]\ndata.loc[data['Rooms']==0]\ndata.loc[data['Square']>150]\ndata.loc[data['KitchenSquare']>20]\ndata.loc[data['KitchenSquare']<5]\ndata.loc[data['LifeSquare']<8]\n"

In [151]:
# Data cleaning for the training frame.

# Drop Healthcare_1. Rebinding (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without raising KeyError.
data = data.drop(columns=['Healthcare_1'], errors='ignore')

# Replace implausible room counts with hand-picked values.
data['Rooms'] = data['Rooms'].replace({10: 2, 19: 1, 6: 2, 0: 1})

# Repair mistyped construction years.
data['HouseYear'] = data['HouseYear'].replace({4968: 1968, 20052011: 2011})

# Scale down extreme LifeSquare values by 100.
# The original used `> 641` to pick the target rows but `> 7480` to pick the
# source values; any row matched only by the first mask would have been
# assigned NaN through index alignment. One mask is now used on both sides.
life_square_outliers = data['LifeSquare'] > 641
data.loc[life_square_outliers, 'LifeSquare'] = (
    data.loc[life_square_outliers, 'LifeSquare'] / 100
)

In [152]:
def fillna_lifesquare(df, df_source):
    """Fill missing ``LifeSquare`` values with the median taken from ``df_source``.

    The ``LifeSquare`` column of ``df`` is replaced in place and the same
    frame object is returned.

    Parameters
    ----------
    df : DataFrame whose ``LifeSquare`` gaps are filled.
    df_source : DataFrame whose ``LifeSquare`` median is used as the fill value.

    Returns
    -------
    The input ``df`` (mutated).
    """
    median_life_square = df_source['LifeSquare'].median()
    filled = df['LifeSquare'].fillna(median_life_square)
    df['LifeSquare'] = filled
    return df

In [153]:
def join_stats(df, stats1, source_df, mode='train'):
    """Left-join the per-(DistrictId, Rooms) statistics onto ``df``.

    Parameters
    ----------
    df : frame to enrich; must contain ``DistrictId`` and ``Rooms``.
    stats1 : frame keyed by ``DistrictId`` and ``Rooms`` whose remaining
        columns are appended to ``df``.
    source_df : frame holding a ``mean_price`` column; only read when
        ``mode == 'test'``.
    mode : when equal to ``'test'``, rows left without a ``mean_price`` after
        the join receive the mean of ``source_df['mean_price']``. Any other
        value leaves such gaps untouched.

    Returns
    -------
    A new merged DataFrame.
    """
    merged = df.merge(stats1, how='left', on=['DistrictId', 'Rooms'])
    if mode != 'test':
        return merged
    fallback_price = source_df['mean_price'].mean()
    merged['mean_price'] = merged['mean_price'].fillna(fallback_price)
    return merged

In [154]:
def prepare_square(df, random_state=42):
    """Clamp implausible area values in ``df`` in place and return it.

    Rules, applied in this order:

    1. ``Square`` is capped at ``Rooms * 40 + 20`` and floored at ``Rooms * 15``.
    2. ``LifeSquare`` larger than ``Square`` is set to ``Square``.
    3. ``KitchenSquare`` above 150, or above ``Square``, is set to 10.
    4. ``KitchenSquare`` above half of ``Square`` becomes ``Square * k``, where
       ``k`` is one draw from ``uniform(0.2, 0.4)`` shared by all such rows.
    5. ``LifeSquare`` below 20% of ``Square`` becomes ``Square * l``, where
       ``l`` is one draw from ``uniform(0.45, 0.8)`` shared by all such rows.
    6. ``LifeSquare`` still below 8 is set to ``Square``.

    Parameters
    ----------
    df : DataFrame with ``Rooms``, ``Square``, ``LifeSquare`` and
        ``KitchenSquare`` columns; it is modified in place.
    random_state : seed for the two random factors. The default makes the
        cleaning reproducible between runs (previously the unseeded global
        ``random`` module was used, so results changed on every execution).
        Pass ``None`` to get a fresh, non-deterministic draw.

    Returns
    -------
    The input ``df`` (mutated).
    """
    # A private generator keeps the draws reproducible without touching the
    # global ``random`` state. Both factors are always drawn, in a fixed order,
    # so the values do not depend on which rows happen to match.
    rng = random.Random(random_state)
    kitchen_share = rng.uniform(0.2, 0.4)
    life_share = rng.uniform(0.45, 0.8)

    # Rooms is never modified here, so the bounds can be computed once.
    max_square = df['Rooms'] * 40 + 20
    min_square = df['Rooms'] * 15

    df.loc[df['Square'] > max_square, 'Square'] = max_square
    df.loc[df['Square'] < min_square, 'Square'] = min_square
    df.loc[df['Square'] < df['LifeSquare'], 'LifeSquare'] = df['Square']
    df.loc[df['KitchenSquare'] > 150, 'KitchenSquare'] = 10
    df.loc[df['KitchenSquare'] > df['Square'], 'KitchenSquare'] = 10
    df.loc[df['KitchenSquare'] > df['Square'] * 0.5, 'KitchenSquare'] = df['Square'] * kitchen_share
    df.loc[df['LifeSquare'] < df['Square'] * 0.2, 'LifeSquare'] = df['Square'] * life_share
    df.loc[df['LifeSquare'] < 8, 'LifeSquare'] = df['Square']
    return df

In [155]:
# Clamp implausible area values in the training frame.
data = data.pipe(prepare_square)

In [156]:
# Fill the remaining LifeSquare gaps with the training median.
data = data.pipe(fillna_lifesquare, data)

In [157]:
# Mean Price for every (DistrictId, Rooms) pair, exposed as `mean_price`.
# NOTE(review): this is computed on all of `data` before the later
# train_test_split, so the hold-out rows contribute to their own feature and
# the validation score is likely optimistic — confirm whether that is intended.
stats1 = (
    data
    .groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price'})
)

In [158]:
# Attach the per-(DistrictId, Rooms) mean price to the training frame.
# The helper is defined as `join_stats`; `join_stats1` does not exist and
# raises NameError on a fresh kernel.
data = join_stats(data, stats1, data)

In [159]:
# Model inputs (note that the raw Id is included) and the regression target.
features = data.loc[:, [
    'Id',
    'DistrictId',
    'Rooms',
    'Square',
    'LifeSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'mean_price',
]]
target = data.loc[:, 'Price']

In [160]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

In [161]:
# Random forest regressor: 100 trees, depth capped at 12, at least 5 samples per leaf.
model1 = RFR(
    n_estimators=100,
    max_depth=12,
    min_samples_leaf=5,
    random_state=42,
)

In [162]:
%%time
# Fit the forest on the training split; the cell magic above reports the elapsed time.
model1.fit(X_train,y_train)

Wall time: 1.7 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [163]:
# Predict prices for the hold-out split.
y_pred = model1.predict(X_test)

In [164]:
# Hold-out metrics: mean squared error first, then R^2.
# The metric is imported as `MSE` (upper case); lower-case `mse` is undefined
# and raises NameError on a fresh kernel.
print(MSE(y_test, y_pred), r2(y_test, y_pred))

2296548255.611943 0.7393214545550568


In [166]:
# Importance of each input column of the fitted forest, in the column order of `features`.
print(model1.feature_importances_)

[0.01592009 0.01916477 0.01708925 0.12122368 0.02243915 0.01633585
 0.01721283 0.03037559 0.74023879]


In [167]:
# Column dtypes and non-null counts of the raw test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [168]:
# Summary statistics of the test set before any cleaning.
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,4832.674037,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,1.0,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [169]:
# Apply the same area clamping to the test frame.
# NOTE(review): the Rooms / HouseYear corrections applied to `data` earlier are
# not applied to `test`; its describe() output shows Rooms ranging from 0 to 17,
# which feeds directly into the Square bounds — confirm whether that is intended.
test = test.pipe(prepare_square)

In [170]:
# Summary statistics of the test set after prepare_square.
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.246558,37.085456,5.736877,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,4832.674037,44.179466,0.838594,18.724156,16.505724,3.929042,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,1.0,0.0,0.0,15.0,9.495367,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.740605,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.92134,33.439783,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,65.97819,45.714884,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,255.0,191.421728,42.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [171]:
# Fill LifeSquare gaps in the test frame with the median taken from the training frame.
test = test.pipe(fillna_lifesquare, data)

In [172]:
# Attach mean_price to the test frame; (DistrictId, Rooms) pairs not seen in
# training fall back to the mean of the training frame's mean_price column.
# The helper is defined as `join_stats`; `join_stats1` does not exist and
# raises NameError on a fresh kernel.
test = join_stats(test, stats1, data, mode='test')

In [173]:
# Summary statistics of the test set after the LifeSquare fill and the mean_price join.
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,mean_price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.246558,36.285284,5.736877,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428,214295.229152
std,4832.674037,44.179466,0.838594,18.724156,14.769608,3.929042,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365,71418.589638
min,1.0,0.0,0.0,15.0,9.495367,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,84250.557673
25%,4221.75,21.0,1.0,41.906231,27.798639,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0,166014.055413
50%,8320.5,37.0,2.0,52.92134,33.242167,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0,198777.278269
75%,12598.25,77.0,2.0,65.97819,42.411448,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0,246262.924161
max,16795.0,212.0,17.0,255.0,191.421728,42.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,593618.746096


In [174]:
# Select the same columns, in the same order, that the model was trained on.
test1 = test.loc[:, [
    'Id',
    'DistrictId',
    'Rooms',
    'Square',
    'LifeSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'mean_price',
]]

In [175]:
# Dtypes and non-null counts of the frame passed to the model.
test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 9 columns):
Id            5000 non-null int64
DistrictId    5000 non-null int64
Rooms         5000 non-null float64
Square        5000 non-null float64
LifeSquare    5000 non-null float64
Floor         5000 non-null int64
HouseFloor    5000 non-null float64
HouseYear     5000 non-null int64
mean_price    5000 non-null float64
dtypes: float64(5), int64(4)
memory usage: 390.6 KB


In [176]:
# Store the model's predictions for the test set in a new Price column.
test['Price'] = model1.predict(test1)

In [177]:
# Write the submission file with the Id and Price columns only.
# `index` is a boolean parameter; False is the explicit way to omit the row
# index (None only worked because it is falsy).
test[['Id', 'Price']].to_csv('ACulkin_predictions.csv', index=False)