In [614]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR, Ridge
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2,mean_squared_error as MSE
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score, GridSearchCV

In [655]:
df=pd.read_csv('train.csv')

In [743]:
data=df.copy()
data.drop(['Healthcare_1'],axis=1,inplace=True)

In [744]:
#гипотеза №2
data.loc[377,'Rooms']=2
data.loc[1454,'Rooms']=1
data.loc[8849,'Rooms']=2
data.loc[1397,'Rooms']=4
data.loc[1981,'Rooms']=5
data.loc[2269,'Rooms']=1
data.loc[3911,'Rooms']=1
data.loc[4366,'Rooms']=3
data.loc[6149,'Rooms']=1
data.loc[8834,'Rooms']=3
data.loc[4853,'Rooms']=1

In [745]:
def prepare_square(df):
    df.loc[df['Square']<(df['Rooms']*15),'Square']=df['Rooms']*15
   # df.loc[df['Square']>(df['Rooms']*40+20),'Square']=df['Rooms']*40+20
    return df

In [746]:
data=prepare_square(data)

In [747]:
data['price_square'] = data['Price'] / data['Square']

In [748]:
data['LifeSquare'] = data['LifeSquare'].fillna(data['LifeSquare'].mean())

In [749]:
train,test=train_test_split(data,test_size=0.3, random_state=54)

In [750]:
price_mean_by_distr_square=train.groupby(['DistrictId'],as_index=False).agg({'price_square':'mean'}).rename(columns={'price_square':'mean_price_by_ds'})

In [673]:
price_mean_by_distr_square.head()

Unnamed: 0,DistrictId,Rooms,mean_price_by_ds
0,0,1.0,4175.774657
1,0,2.0,3560.242068
2,0,3.0,4280.427935
3,1,1.0,3403.793451
4,1,2.0,3202.86063


In [751]:
def join_price_mean(df,stats1,source_df):
    df=pd.merge(df,stats1,on=['DistrictId'],how='left')
    df['mean_price_by_ds']=df['mean_price_by_ds'].fillna(source_df['price_square'].mean())
    return df

In [752]:
train = join_price_mean(train,price_mean_by_distr_square,train)

In [753]:
correlation = train.corr()
print(correlation['Price'].sort_values(ascending=False))

Price               1.000000
price_square        0.594004
Rooms               0.574945
mean_price_by_ds    0.547718
Square              0.501587
LifeSquare          0.314351
Social_1            0.269122
Helthcare_2         0.257421
DistrictId          0.255619
Social_2            0.243029
Shops_1             0.182023
Floor               0.114123
Social_3            0.070582
HouseFloor          0.069637
KitchenSquare       0.018351
Id                  0.010019
HouseYear           0.005143
Ecology_1          -0.060091
Name: Price, dtype: float64


In [754]:
X_train=train[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]
y_train=train['Price']

In [755]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
#X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [756]:
def rfr_model(X, y):
# Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RF(),
        param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10, 50, 100),
            'min_samples_leaf':[3,5,7,10]
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0,                         n_jobs=-1)
    
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    print(best_params)
    
rfr_model(X_train_scaled,y_train)

{'max_depth': 6, 'min_samples_leaf': 7, 'n_estimators': 100}


In [757]:
model3=RF(random_state=54,max_depth=6,n_estimators=100,min_samples_leaf=7)

In [681]:
from sklearn.cluster import KMeans


In [758]:
kmeans = KMeans(n_clusters=2,random_state=54)

labels_train = kmeans.fit_predict(X_train_scaled)



In [759]:
%%time
model3.fit(X_train_scaled,y_train)

Wall time: 644 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=7, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=54, verbose=0, warm_start=False)

In [760]:
test = join_price_mean(test,price_mean_by_distr_square,train)

In [742]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 21 columns):
Id                  3000 non-null int64
DistrictId          3000 non-null int64
Rooms               3000 non-null float64
Square              3000 non-null float64
LifeSquare          3000 non-null float64
KitchenSquare       3000 non-null float64
Floor               3000 non-null int64
HouseFloor          3000 non-null float64
HouseYear           3000 non-null int64
Ecology_1           3000 non-null float64
Ecology_2           3000 non-null object
Ecology_3           3000 non-null object
Social_1            3000 non-null int64
Social_2            3000 non-null int64
Social_3            3000 non-null int64
Helthcare_2         3000 non-null int64
Shops_1             3000 non-null int64
Shops_2             3000 non-null object
Price               3000 non-null float64
price_square        3000 non-null float64
mean_price_by_ds    3000 non-null float64
dtypes: float64(9), int64(9), ob

In [761]:
X_test=test[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]
y_test=test['Price']

In [762]:
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [763]:
y_pred_3=model3.predict(X_test_scaled)

In [764]:
print('Случайный лес',MSE(y_test,y_pred_3),r2(y_test,y_pred_3))

Случайный лес 2668039084.288804 0.687705909190593


In [765]:
r2(y_train,model3.predict(X_train_scaled))

0.7673982820122058

In [713]:
labels_test = kmeans.predict(X_test_scaled)

In [766]:
model3.fit(X_train_scaled.loc[labels_train == 0], y_train[labels_train == 0])

y_test_pred_0 = model3.predict(X_test_scaled.loc[labels_test == 0])

In [767]:
model3.fit(X_train_scaled.loc[labels_train == 1], y_train[labels_train == 1])

y_test_pred_1 = model3.predict(X_test_scaled.loc[labels_test == 1])

In [768]:
y_test_all = np.hstack([y_test[labels_test == 0], y_test[labels_test == 1]])
y_test_pred_all = np.hstack([y_test_pred_0, y_test_pred_1])

r2(y_test_all, y_test_pred_all)

0.687944613913841

In [769]:
price_mean_by_distr_square=data.groupby(['DistrictId'],as_index=False).agg({'price_square':'mean'}).rename(columns={'price_square':'mean_price_by_ds'})

In [770]:
data = join_price_mean(data,price_mean_by_distr_square,data)

In [771]:
X = data[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]
y = data['Price']

In [772]:
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [773]:
model3.fit(X_scaled,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=7, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=54, verbose=0, warm_start=False)

# Загружаем тест

In [774]:
test=pd.read_csv('test.csv')

In [77]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [80]:
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.246558,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,4832.674037,44.179466,0.838594,18.724156,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,1.0,0.0,0.0,15.0,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,65.97819,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,255.0,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [775]:
test=prepare_square(test)

In [649]:
test=join_price_mean(test,price_mean_by_distr_square,data)

In [650]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id                  5000 non-null int64
DistrictId          5000 non-null int64
Rooms               5000 non-null float64
Square              5000 non-null float64
LifeSquare          3959 non-null float64
KitchenSquare       5000 non-null float64
Floor               5000 non-null int64
HouseFloor          5000 non-null float64
HouseYear           5000 non-null int64
Ecology_1           5000 non-null float64
Ecology_2           5000 non-null object
Ecology_3           5000 non-null object
Social_1            5000 non-null int64
Social_2            5000 non-null int64
Social_3            5000 non-null int64
Healthcare_1        2623 non-null float64
Helthcare_2         5000 non-null int64
Shops_1             5000 non-null int64
Shops_2             5000 non-null object
mean_price_by_ds    5000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 820.3+ KB


In [85]:
test=join_stats1(test,stats1,data,mode='test')

In [86]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
mean_price       5000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 820.3+ KB


In [651]:
temp=test[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]

In [652]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 7 columns):
Rooms               5000 non-null float64
Square              5000 non-null float64
DistrictId          5000 non-null int64
Social_1            5000 non-null int64
Helthcare_2         5000 non-null int64
Social_2            5000 non-null int64
mean_price_by_ds    5000 non-null float64
dtypes: float64(3), int64(4)
memory usage: 312.5 KB


In [653]:
test['Price']=model3.predict(temp)

In [654]:
test[['Id','Price']].to_csv('EChasovskih_predictions.csv',index=False)