In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR, Ridge
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2,mean_squared_error as MSE
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score, GridSearchCV

In [393]:
# Load the labelled training data; `df` is kept untouched and copied below.
df=pd.read_csv('train.csv')

In [394]:
# Working copy of the raw frame without the Healthcare_1 column.
# `drop` returns a new DataFrame, so `df` itself is left unchanged.
data = df.drop(columns=['Healthcare_1'])

In [395]:
# Hypothesis #2: hand-picked room-count corrections, keyed by row label.
rooms_corrections = {
    377: 2,
    1454: 1,
    8849: 2,
    1397: 4,
    1981: 5,
    2269: 1,
    3911: 1,
    4366: 3,
    6149: 1,
    8834: 3,
    4853: 1,
}
for row_label, rooms_value in rooms_corrections.items():
    data.loc[row_label, 'Rooms'] = rooms_value

In [396]:
def prepare_square(df, min_square_per_room=15):
    """Raise too-small ``Square`` values to a per-room floor.

    Every row whose ``Square`` is below ``Rooms * min_square_per_room`` has
    its ``Square`` replaced by that floor. Rows with ``Rooms == 0`` are never
    changed by this rule (the floor is 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Rooms`` and ``Square`` columns. Modified in place.
    min_square_per_room : number, default 15
        Minimum allowed ``Square`` per room.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenient re-assignment.
    """
    min_square = df['Rooms'] * min_square_per_room
    too_small = df['Square'] < min_square
    # The right-hand Series is aligned on the index, so each flagged row
    # receives its own floor value.
    df.loc[too_small, 'Square'] = min_square
    return df

In [397]:
# Apply the per-room Square floor to the whole working frame (before the split).
data=prepare_square(data)

In [398]:
# Hold out 30% of the labelled rows for validation; the seed is fixed for reproducibility.
train,test=train_test_split(data,test_size=0.3, random_state=54)

In [399]:
# Price per unit of Square, used below to build the district/rooms statistic.
# `.assign` returns a new frame instead of writing a column into the object
# handed back by train_test_split, which avoids the chained-assignment
# warning that `warnings.filterwarnings('ignore')` would otherwise hide.
train = train.assign(price_square=train['Price'] / train['Square'])

In [400]:
# Mean price per unit of Square for every (DistrictId, Rooms) pair,
# computed from the training split only.
price_mean_by_distr_square = (
    train
    .groupby(['DistrictId', 'Rooms'], as_index=False)['price_square']
    .mean()
    .rename(columns={'price_square': 'mean_price_by_ds'})
)

In [401]:
# Preview the first rows of the (DistrictId, Rooms) statistics table.
price_mean_by_distr_square.head()

Unnamed: 0,DistrictId,Rooms,mean_price_by_ds
0,0,1.0,4031.170747
1,0,2.0,3560.242068
2,0,3.0,4280.427935
3,1,1.0,3401.340728
4,1,2.0,3202.86063


In [440]:
def join_price_mean(df,stats1):
    """Attach the ``mean_price_by_ds`` feature to ``df``.

    ``stats1`` is left-joined on ``DistrictId`` and ``Rooms``. Rows without a
    matching pair get the mean of the values that did match; if nothing
    matched at all, the mean of ``stats1['mean_price_by_ds']`` is used so the
    feature never stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``DistrictId`` and ``Rooms`` columns. Not modified.
    stats1 : pandas.DataFrame
        Lookup table with ``DistrictId``, ``Rooms`` and ``mean_price_by_ds``.

    Returns
    -------
    pandas.DataFrame
        A new merged frame containing a fully populated ``mean_price_by_ds``.
    """
    feature = 'mean_price_by_ds'
    # Calling this twice on the same frame would otherwise create suffixed
    # `_x`/`_y` columns in the merge and fail with a KeyError below.
    df = df.drop(columns=[feature], errors='ignore')
    df = pd.merge(df, stats1, on=['DistrictId', 'Rooms'], how='left')
    fill_value = df[feature].mean()
    if pd.isna(fill_value):
        # No (DistrictId, Rooms) pair matched: fall back to the lookup table.
        fill_value = stats1[feature].mean()
    df[feature] = df[feature].fillna(fill_value)
    return df

In [403]:
# Add the mean_price_by_ds feature to the training split (returns a new merged frame).
train = join_price_mean(train,price_mean_by_distr_square)

In [404]:
# Correlation of every numeric column with Price, computed on the full
# `data` frame. The numeric columns are selected explicitly because
# `DataFrame.corr()` no longer drops object columns silently on recent pandas.
correlation = data.select_dtypes(include=[np.number]).corr()
print(correlation['Price'].sort_values(ascending=False))

Price            1.000000
Rooms            0.571121
Square           0.521041
DistrictId       0.265100
Social_1         0.263286
Helthcare_2      0.253090
Social_2         0.239226
Shops_1          0.180876
Floor            0.128715
HouseFloor       0.088280
LifeSquare       0.081292
Social_3         0.074878
KitchenSquare    0.028864
Id               0.009880
HouseYear        0.004305
Ecology_1       -0.058381
Name: Price, dtype: float64


In [405]:
# Model inputs: the six raw columns most correlated with Price plus the
# engineered district/rooms statistic. Target: Price.
X_train = train[[
    'Rooms',
    'Square',
    'DistrictId',
    'Social_1',
    'Helthcare_2',
    'Social_2',
    'mean_price_by_ds',
]]
y_train = train.loc[:, 'Price']

In [406]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of every feature from the training split only, then
# rescale the training features while keeping the column names.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)

In [407]:
def rfr_model(X, y, random_state=54):
    """Grid-search random-forest hyperparameters with 5-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Target values passed to ``GridSearchCV.fit``.
    random_state : int or None, default 54
        Seed of the forest being tuned. Without it every run may select a
        different parameter set; 54 matches the seed used elsewhere in this
        notebook.

    Returns
    -------
    dict
        The best parameter combination by negative mean squared error.
        It is also printed.
    """
    gsc = GridSearchCV(
        estimator=RF(random_state=random_state),
        param_grid={
            'max_depth': range(3, 7),
            'n_estimators': (10, 50, 100),
            'min_samples_leaf': [3, 5, 7, 10],
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    print(best_params)
    return best_params

# Keep the selected parameters so they can be reused instead of retyped.
best_rf_params = rfr_model(X_train_scaled, y_train)

{'max_depth': 6, 'min_samples_leaf': 10, 'n_estimators': 50}


In [422]:
# Random forest configured with the parameters printed by the grid search above.
model3=RF(random_state=54,max_depth=6,n_estimators=50,min_samples_leaf=10)

In [409]:
from sklearn.cluster import KMeans


In [410]:
# Split the scaled training rows into two clusters; the labels are used
# further down to train one forest per cluster.
kmeans = KMeans(n_clusters=2, random_state=54)
kmeans.fit(X_train_scaled)
labels_train = kmeans.labels_



In [423]:
%%time
# Fit the forest on the complete scaled training split; %%time reports the wall time.
model3.fit(X_train_scaled,y_train)

Wall time: 326 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=54, verbose=0, warm_start=False)

In [412]:
# Add mean_price_by_ds to the hold-out split using the statistics built from `train`.
test = join_price_mean(test,price_mean_by_distr_square)

In [413]:
# Hold-out inputs and target, using the same columns and order as X_train.
X_test = test[[
    'Rooms',
    'Square',
    'DistrictId',
    'Social_1',
    'Helthcare_2',
    'Social_2',
    'mean_price_by_ds',
]]
y_test = test.loc[:, 'Price']

In [414]:
# Rescale the hold-out features with the scaler fitted on the training split (transform only, no refit).
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [424]:
# Hold-out predictions of the forest fitted on the full training split.
y_pred_3=model3.predict(X_test_scaled)

In [425]:
# Report hold-out MSE and R^2 for the random forest (the printed label means "Random forest").
print('Случайный лес',MSE(y_test,y_pred_3),r2(y_test,y_pred_3))

Случайный лес 2610607009.8314176 0.6771230942861428


In [426]:
# R^2 on the training split, for comparison with the hold-out score printed above.
r2(y_train,model3.predict(X_train_scaled))

0.7908188893133905

In [427]:
# Assign every hold-out row to one of the two clusters learned on the training split.
labels_test = kmeans.predict(X_test_scaled)

In [428]:
# Train a separate forest on cluster 0 only. A fresh estimator with model3's
# hyperparameters is used instead of refitting model3 itself, because model3
# is used again further down to predict the submission and must keep its fit.
model3_cluster_0 = RF(**model3.get_params())
model3_cluster_0.fit(X_train_scaled.loc[labels_train == 0], y_train[labels_train == 0])

y_test_pred_0 = model3_cluster_0.predict(X_test_scaled.loc[labels_test == 0])

In [429]:
# Train a separate forest on cluster 1 only. A fresh estimator with model3's
# hyperparameters is used instead of refitting model3 itself; otherwise the
# model later used for the submission would have seen only cluster-1 rows.
model3_cluster_1 = RF(**model3.get_params())
model3_cluster_1.fit(X_train_scaled.loc[labels_train == 1], y_train[labels_train == 1])

y_test_pred_1 = model3_cluster_1.predict(X_test_scaled.loc[labels_test == 1])

# Загружаем тест

In [441]:
# Load the unlabelled file to predict on. This rebinds `test`, which until
# now held the hold-out split produced by train_test_split.
test=pd.read_csv('test.csv')

In [77]:
# Column dtypes and non-null counts of the freshly loaded file.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [80]:
# Summary statistics of the numeric columns, to spot outliers before predicting.
test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.246558,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,4832.674037,44.179466,0.838594,18.724156,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,1.0,0.0,0.0,15.0,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,65.97819,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,255.0,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [442]:
# Apply the same per-room Square floor that was applied to the training data.
test=prepare_square(test)

In [443]:
# Add mean_price_by_ds using the statistics built from the training split.
test=join_price_mean(test,price_mean_by_distr_square)

In [444]:
# Check that mean_price_by_ds was added and contains no missing values.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id                  5000 non-null int64
DistrictId          5000 non-null int64
Rooms               5000 non-null float64
Square              5000 non-null float64
LifeSquare          3959 non-null float64
KitchenSquare       5000 non-null float64
Floor               5000 non-null int64
HouseFloor          5000 non-null float64
HouseYear           5000 non-null int64
Ecology_1           5000 non-null float64
Ecology_2           5000 non-null object
Ecology_3           5000 non-null object
Social_1            5000 non-null int64
Social_2            5000 non-null int64
Social_3            5000 non-null int64
Healthcare_1        2623 non-null float64
Helthcare_2         5000 non-null int64
Shops_1             5000 non-null int64
Shops_2             5000 non-null object
mean_price_by_ds    5000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 820.3+ KB


In [85]:
# NOTE(review): neither `join_stats1` nor `stats1` is defined in the visible
# part of this notebook, and the execution count (85) is far lower than the
# surrounding cells. This looks like a leftover from an earlier session and
# would raise NameError on Restart & Run All. TODO confirm and remove.
test=join_stats1(test,stats1,data,mode='test')

In [86]:
# NOTE(review): the recorded output lists a `mean_price` column rather than
# `mean_price_by_ds`, so it appears to come from the earlier join_stats1 run.
# Re-run to refresh.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 20 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
mean_price       5000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 820.3+ KB


In [445]:
# Feature columns for the submission, in the same order as X_train.
temp=test[['Rooms','Square','DistrictId','Social_1','Helthcare_2','Social_2','mean_price_by_ds']]

In [439]:
# NOTE(review): the recorded output shows 4891 rows while `test` has 5000, and
# its execution count (439) precedes the cell that builds `temp` (445). The
# saved output is stale; re-run to refresh.
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4891 entries, 0 to 4890
Data columns (total 7 columns):
Rooms               4891 non-null float64
Square              4891 non-null float64
DistrictId          4891 non-null int64
Social_1            4891 non-null int64
Helthcare_2         4891 non-null int64
Social_2            4891 non-null int64
mean_price_by_ds    4891 non-null float64
dtypes: float64(3), int64(4)
memory usage: 305.7 KB


In [446]:
# model3 was fitted on MinMax-scaled features, so the submission features
# must go through the same fitted scaler before prediction. Feeding the raw
# values would compare unscaled numbers against thresholds learned in [0, 1].
temp_scaled = pd.DataFrame(scaler.transform(temp), columns=temp.columns)
test['Price'] = model3.predict(temp_scaled)

In [91]:
# Write the submission file: one row per Id with its predicted Price, without the index column.
test[['Id','Price']].to_csv('EChasovskih_predictions.csv',index=False)