In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Read the training CSV and store DistrictId as strings rather than numbers,
# so that pd.get_dummies one-hot encodes it later instead of leaving it numeric.
data = pd.read_csv('./data/train_clear_.csv')
data['DistrictId'] = data['DistrictId'].astype(str)

# Same treatment for the exam CSV.
data_exam = pd.read_csv('./data/test_clear_.csv')
data_exam['DistrictId'] = data_exam['DistrictId'].astype(str)

In [3]:
def set_dummies(ds):
    """Return a one-hot encoded copy of ``ds``.

    Thin wrapper around ``pd.get_dummies`` with default arguments: string /
    object / categorical columns are expanded into indicator columns, all
    other columns are passed through unchanged.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; ``ds`` itself is not modified.
    """
    encoded = pd.get_dummies(data=ds)
    return encoded

In [4]:
data = set_dummies(data)

# The exam frame is encoded on its own, so any categorical level (e.g. a
# DistrictId) that occurs in only one of the two files would leave the frames
# with different dummy columns, and a model fitted on the training features
# could not score the exam set. Force the exam frame onto the training columns
# (minus the target): levels missing from the exam set become all-zero
# columns, and levels never seen in training are dropped.
data_exam = set_dummies(data_exam).reindex(
    columns=data.columns.drop('Price', errors='ignore'),
    fill_value=0,
)

In [5]:
# Target is the `Price` column; features are every remaining column except `Id`.
y = data.loc[:, 'Price']
X = data.drop(['Id', 'Price'], axis=1)

In [6]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

#### LinearRegression

In [7]:
# Linear model with default settings, fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Linear-model predictions for the held-out rows.
y_predict = lr.predict(X=X_test)

In [9]:
# Side-by-side table of actual vs. predicted values, indexed like y_test,
# plus the signed error (prediction minus actual).
compare_ds = y_test.to_frame(name='y_test')
compare_ds['y_pred'] = y_predict.flatten()
compare_ds['error'] = compare_ds['y_pred'] - compare_ds['y_test']

In [10]:
# R^2 of the linear model on the held-out rows.
r2 = r2_score(y_true=compare_ds['y_test'], y_pred=compare_ds['y_pred'])
r2

0.6428803789725346

#### RandomForest

In [11]:
# Random forest of 1000 trees, each capped at depth 12; seeded for reproducibility.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=12,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
# Forest predictions for the held-out rows.
y_predict = model.predict(X=X_test)

# Actual vs. predicted values, indexed like y_test, plus the signed error
# (prediction minus actual).
compare_ds = y_test.to_frame(name='y_test')
compare_ds['y_pred'] = y_predict.flatten()
compare_ds['error'] = compare_ds['y_pred'] - compare_ds['y_test']
compare_ds.head()

Unnamed: 0,y_test,y_pred,error
6252,181530.459031,189618.555791,8088.09676
4684,260456.004692,292227.337505,31771.332813
1731,219945.30464,216701.893607,-3243.411033
4742,66883.280318,67035.877718,152.597399
4521,114086.065201,126962.265134,12876.199933


In [13]:
# R^2 of the random forest on the held-out rows.
r2_forest = r2_score(y_true=compare_ds['y_test'], y_pred=compare_ds['y_pred'])
r2_forest

0.7159051788030102