In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Read the training frame and the separate "exam" frame (the one without a
# local hold-out split) from the ./data directory in a single pass.
data, data_exam = map(
    pd.read_csv,
    ('./data/train_clear.csv', './data/test_clear.csv'),
)

In [3]:
def set_dummies(ds, columns=('Ecology_2', 'Ecology_3', 'Shops_2')):
    """One-hot encode the named categorical columns of ``ds``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame that contains every column listed in ``columns``.
    columns : iterable of str, optional
        Columns to expand into indicator columns. Each name is also used as
        the prefix of the generated columns (``<column>_<value>``).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``columns`` are replaced by their indicator
        columns; every other column is passed through unchanged.

    Raises
    ------
    KeyError
        If any name in ``columns`` is not a column of ``ds``.
    """
    # Name the columns explicitly. Without ``columns=`` pandas encodes every
    # object/category column and pairs ``prefix`` with them by position, so an
    # extra text column raises a length-mismatch ValueError and a different
    # column order silently attaches the wrong prefix.
    columns = list(columns)  # a tuple would be treated as a single key
    return pd.get_dummies(ds, columns=columns, prefix=columns)

In [4]:
# Encode both frames with the same helper so they share one encoding scheme.
# NOTE: this rebinds `data` / `data_exam`, so the cell is not meant to be
# re-run without reloading the CSVs first.
data, data_exam = (set_dummies(frame) for frame in (data, data_exam))

In [5]:
# Target first, then the feature matrix: everything except the row
# identifier and the target itself.
y = data['Price']
X = data.drop(['Id', 'Price'], axis=1)

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### LinearRegression

In [7]:
# `fit` returns the estimator itself, so construction and training can be
# chained; the bare name on the last line keeps the estimator repr as the
# cell output.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Predictions of the fitted linear model for the held-out rows in X_test.
y_predict = lr.predict(X_test)

In [9]:
# Side-by-side table of actual vs. predicted values on the hold-out split.
# `error` is signed (prediction minus truth), so positive means over-estimate.
compare_ds = (
    pd.DataFrame(
        {'y_test': y_test, 'y_pred': y_predict.flatten()},
        columns=['y_test', 'y_pred'],
    )
    .assign(error=lambda frame: frame['y_pred'] - frame['y_test'])
)

In [10]:
# Coefficient of determination of the predictions on the hold-out split.
r2 = r2_score(y_true=compare_ds['y_test'], y_pred=compare_ds['y_pred'])
r2

0.5194964157274393

#### RandomForest

In [12]:
# Random forest of 1000 trees capped at depth 12; the seed makes the fitted
# ensemble reproducible. The trailing bare name keeps the estimator repr as
# the cell output.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=12,
    random_state=42,
).fit(X_train, y_train)
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [13]:
# Random-forest predictions for the hold-out rows. This rebinds `y_predict`
# and `compare_ds`, replacing the values computed for the linear model.
y_predict = model.predict(X_test)
compare_ds = pd.DataFrame(
    {'y_test': y_test, 'y_pred': y_predict.flatten()},
    columns=['y_test', 'y_pred'],
)
# Signed residual: prediction minus ground truth.
compare_ds['error'] = compare_ds['y_pred'].sub(compare_ds['y_test'])
compare_ds.head()