In [6]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR, Ridge
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2,mean_squared_error as MSE
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score, GridSearchCV

In [7]:
df=pd.read_csv('train.csv')

In [241]:
data=df.copy()
data.drop(['Healthcare_1'],axis=1,inplace=True)

In [242]:
#гипотеза №2
data.loc[377,'Rooms']=2
data.loc[1454,'Rooms']=1
data.loc[8849,'Rooms']=2
data.loc[1397,'Rooms']=4
data.loc[1981,'Rooms']=5
data.loc[2269,'Rooms']=1
data.loc[3911,'Rooms']=1
data.loc[4366,'Rooms']=3
data.loc[6149,'Rooms']=1
data.loc[8834,'Rooms']=3
data.loc[4853,'Rooms']=1

In [243]:
def prepare_square(df):
    df.loc[df['Square']<(df['Rooms']*15),'Square']=df['Rooms']*15
   # df.loc[df['Square']>(df['Rooms']*40+20),'Square']=df['Rooms']*40+20
    return df

In [244]:
data=prepare_square(data)

In [245]:
data['price_square'] = data['Price'] / data['Square']

In [246]:
data['LifeSquare'] = data['LifeSquare'].fillna(data['LifeSquare'].mean())

In [247]:
train,test=train_test_split(data,test_size=0.3, random_state=54)

In [248]:
price_mean_by_distr_square=train.groupby(['DistrictId','Rooms'],as_index=False).agg({'price_square':'mean'}).rename(columns={'price_square':'mean_price_by_ds'})

In [203]:
price_mean_by_distr_square.head()

Unnamed: 0,DistrictId,mean_price_by_ds
0,0,3906.105818
1,1,3241.334714
2,2,4572.065582
3,3,3677.887042
4,4,5035.442365


In [249]:
def join_price_mean(df,stats1,source_df):
    df=pd.merge(df,stats1,on=['DistrictId','Rooms'],how='left')
    df['mean_price_by_ds']=df['mean_price_by_ds'].fillna(source_df['price_square'].mean())
    return df

In [250]:
train = join_price_mean(train,price_mean_by_distr_square,train)

In [93]:
correlation = train.corr()
print(correlation['Price'].sort_values(ascending=False))

Price               1.000000
price_square        0.601041
Square              0.584870
Rooms               0.574945
mean_price_by_ds    0.465918
LifeSquare          0.314351
Social_1            0.269122
Helthcare_2         0.257421
DistrictId          0.255619
Social_2            0.243029
Shops_1             0.182023
Floor               0.114123
Social_3            0.070582
HouseFloor          0.069637
KitchenSquare       0.018351
Id                  0.010019
HouseYear           0.005143
Ecology_1          -0.060091
Name: Price, dtype: float64


In [251]:
X_train=train[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]
y_train=train['Price']

In [252]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
#X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [253]:
def rfr_model(X, y):
# Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RF(),
        param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10, 50, 75, 100),
            'min_samples_leaf':[3,5,7,10]
        },
        cv=3, scoring='neg_mean_squared_error', verbose=0,                         n_jobs=-1)
    
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    print(best_params)
    
rfr_model(X_train_scaled,y_train)

{'max_depth': 6, 'min_samples_leaf': 5, 'n_estimators': 75}


In [254]:
model3=RF(random_state=54,max_depth=6,n_estimators=75,min_samples_leaf=5)

In [255]:
from sklearn.cluster import KMeans


In [256]:
kmeans = KMeans(n_clusters=2,random_state=54)

labels_train = kmeans.fit_predict(X_train_scaled)



In [257]:
%%time
model3.fit(X_train_scaled,y_train)

Wall time: 503 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=None,
           oob_score=False, random_state=54, verbose=0, warm_start=False)

In [258]:
test = join_price_mean(test,price_mean_by_distr_square,train)

In [742]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 21 columns):
Id                  3000 non-null int64
DistrictId          3000 non-null int64
Rooms               3000 non-null float64
Square              3000 non-null float64
LifeSquare          3000 non-null float64
KitchenSquare       3000 non-null float64
Floor               3000 non-null int64
HouseFloor          3000 non-null float64
HouseYear           3000 non-null int64
Ecology_1           3000 non-null float64
Ecology_2           3000 non-null object
Ecology_3           3000 non-null object
Social_1            3000 non-null int64
Social_2            3000 non-null int64
Social_3            3000 non-null int64
Helthcare_2         3000 non-null int64
Shops_1             3000 non-null int64
Shops_2             3000 non-null object
Price               3000 non-null float64
price_square        3000 non-null float64
mean_price_by_ds    3000 non-null float64
dtypes: float64(9), int64(9), ob

In [259]:
X_test=test[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]
y_test=test['Price']

In [260]:
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [261]:
X_test['y_pred_3']=model3.predict(X_test_scaled)

In [144]:
X_test.y_pred_3.value_counts()

173843.617184    52
125803.696937    49
156631.902804    45
129297.091232    36
191087.625653    35
142244.098884    30
125613.768545    28
177877.586752    21
153775.142265    21
195486.079235    19
127209.086127    19
153590.383626    18
139056.617253    17
193485.298217    17
192769.728866    17
175296.183936    17
142399.367146    16
175536.781263    16
177227.995981    16
175615.775542    14
127213.864020    14
154478.437112    14
174324.883623    13
177411.619496    12
173705.181063    12
191953.808971    12
211741.870467    12
157514.679008    12
139518.701384    11
173967.705544    11
                 ..
215539.482981     1
447047.590045     1
330471.957193     1
197530.001855     1
260228.213906     1
326386.079995     1
238916.131554     1
243338.969843     1
215740.310408     1
337515.471622     1
225621.972255     1
174917.815589     1
305844.922770     1
290584.182525     1
214237.409568     1
190883.440499     1
301553.051237     1
281528.973687     1
350211.462505     1


In [262]:
print('Случайный лес',MSE(y_test,y_pred_3),r2(y_test,X_test['y_pred_3']))

Случайный лес 2849538244.6877747 0.6673059409742638


In [263]:
r2(y_train,model3.predict(X_train_scaled))

0.7955143444163503

In [264]:
labels_test = kmeans.predict(X_test_scaled)

In [265]:
model3.fit(X_train_scaled.loc[labels_train == 0], y_train[labels_train == 0])

y_test_pred_0 = model3.predict(X_test_scaled.loc[labels_test == 0])

In [266]:
model3.fit(X_train_scaled.loc[labels_train == 1], y_train[labels_train == 1])

y_test_pred_1 = model3.predict(X_test_scaled.loc[labels_test == 1])

In [267]:
y_test_all = np.hstack([y_test[labels_test == 0], y_test[labels_test == 1]])
y_test_pred_all = np.hstack([y_test_pred_0, y_test_pred_1])

r2(y_test_all, y_test_pred_all)

0.6684499904135277

In [268]:
y_train_all = np.hstack([y_train[labels_train == 0], y_train[labels_train == 1]])
y_train_pred_all = np.hstack([model3.predict(X_train_scaled.loc[labels_train == 0]), 
                              model3.predict(X_train_scaled.loc[labels_train == 1])])

r2(y_train_all, y_train_pred_all)

0.7065561176166566

In [269]:
price_mean_by_distr_square=data.groupby(['DistrictId','Rooms'],as_index=False).agg({'price_square':'mean'}).rename(columns={'price_square':'mean_price_by_ds'})

In [270]:
data = join_price_mean(data,price_mean_by_distr_square,data)

In [271]:
X = data[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]
y = data['Price']

In [272]:
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [273]:
labels_train = kmeans.fit_predict(X_scaled)

In [150]:
model3.fit(X_scaled,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=7, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=54, verbose=0, warm_start=False)

# Загружаем тест

In [274]:
test=pd.read_csv('test.csv')

In [77]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [52]:
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,4832.674037,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,1.0,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [277]:
test[test.Rooms > 5]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2071,10793,23,6.0,110.750226,,0.0,2,2.0,2015,0.014073,B,B,2,475,0,,0,0,B
3217,4058,27,6.0,223.453689,104.113552,16.0,2,2.0,2017,0.041116,B,B,53,14892,4,,1,4,B
3398,1435,111,17.0,255.0,32.528342,8.0,15,17.0,1987,0.093443,B,B,23,4635,5,3300.0,2,4,B


In [278]:
data.loc[2406,'Rooms']=4
data.loc[2524,'Rooms']=3
data.loc[2071,'Rooms']=4
data.loc[3398,'Rooms']=6

In [279]:
test=prepare_square(test)

In [280]:
test=join_price_mean(test,price_mean_by_distr_square,data)

In [281]:
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns)

ValueError: could not convert string to float: 'B'

In [650]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id                  5000 non-null int64
DistrictId          5000 non-null int64
Rooms               5000 non-null float64
Square              5000 non-null float64
LifeSquare          3959 non-null float64
KitchenSquare       5000 non-null float64
Floor               5000 non-null int64
HouseFloor          5000 non-null float64
HouseYear           5000 non-null int64
Ecology_1           5000 non-null float64
Ecology_2           5000 non-null object
Ecology_3           5000 non-null object
Social_1            5000 non-null int64
Social_2            5000 non-null int64
Social_3            5000 non-null int64
Healthcare_1        2623 non-null float64
Helthcare_2         5000 non-null int64
Shops_1             5000 non-null int64
Shops_2             5000 non-null object
mean_price_by_ds    5000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 820.3+ KB


In [85]:
test=join_stats1(test,stats1,data,mode='test')

In [86]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
mean_price       5000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 820.3+ KB


In [282]:
temp=test[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]

In [283]:
temp_scaled = pd.DataFrame(scaler.transform(temp), columns=temp.columns)

In [652]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 7 columns):
Rooms               5000 non-null float64
Square              5000 non-null float64
DistrictId          5000 non-null int64
Social_1            5000 non-null int64
Helthcare_2         5000 non-null int64
Social_2            5000 non-null int64
mean_price_by_ds    5000 non-null float64
dtypes: float64(3), int64(4)
memory usage: 312.5 KB


In [284]:
labels_test = kmeans.fit_predict(temp_scaled)

In [288]:
model3.fit(X_scaled.loc[labels_train == 0], y[labels_train == 0])

y_test_pred_0 = model3.predict(temp_scaled.loc[labels_test == 0])

In [289]:
model3.fit(X_scaled.loc[labels_train == 1], y[labels_train == 1])

y_test_pred_1 = model3.predict(temp_scaled.loc[labels_test == 1])

In [290]:
test['Price']=np.hstack([y_test_pred_0, y_test_pred_1])

In [293]:
test[['Id','Price']].to_csv('EChasovskih_predictions.csv',index=False)

In [291]:
test.Price.value_counts()

141914.550195    70
142035.881226    67
142424.548117    62
143101.143335    61
203110.431793    46
209150.061143    40
195710.190112    33
143840.375829    26
134374.416937    23
166100.504185    21
177232.753219    21
168097.738137    21
209507.592358    20
156604.134157    18
204541.390526    18
143330.377908    17
164299.355510    17
267652.588346    15
134037.430867    14
197437.958600    13
167691.327012    13
271018.199698    13
268084.032750    13
202105.279219    12
156177.446861    11
154934.844277    11
203467.963008    11
144516.971048    10
193112.294791    10
201947.464946    10
                 ..
294364.925841     1
265500.559664     1
301642.714047     1
375364.585631     1
410501.684182     1
274013.804895     1
273838.170045     1
188778.998872     1
336847.057681     1
174168.793184     1
176098.009532     1
156927.540864     1
183113.280813     1
366193.131881     1
195196.588595     1
234676.637332     1
273534.687830     1
235459.623684     1
192993.950944     1


In [292]:
test

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,mean_price_by_ds,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,B,11,2748,1,,0,0,B,2962.604453,189585.058536
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,B,6,1437,3,,0,2,B,3289.878362,232883.996281
2,5480,190,1.0,15.000000,15.948246,12.0,2,5.0,1909,0.000000,...,B,30,7538,87,4702.0,5,5,B,3901.513059,143840.375829
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,B,23,4583,3,,3,3,B,3841.195532,216703.033335
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,B,2,629,1,,0,0,A,2923.192115,268109.696000
5,7633,53,1.0,40.675627,,1.0,21,21.0,1977,0.049637,...,B,34,7759,0,229.0,1,3,B,5112.250316,268084.032750
6,13329,23,2.0,68.099538,64.843025,1.0,2,17.0,1977,0.075779,...,B,6,1437,3,,0,2,B,2218.491245,141761.912295
7,5502,32,2.0,48.193897,32.857644,6.0,5,14.0,1972,0.135650,...,B,46,7960,6,350.0,3,11,B,4724.292735,267526.591155
8,4220,96,3.0,72.277106,45.968758,9.0,17,17.0,1997,0.041116,...,B,53,14892,4,,1,4,B,3929.317080,142425.949464
9,11538,6,3.0,80.219400,47.660260,9.0,13,17.0,2014,0.243205,...,B,5,1564,0,540.0,0,0,B,2671.567170,267526.591155
