In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn import preprocessing

In [2]:
%matplotlib inline

In [3]:
# Load the training and test tables from CSV files under ./data (relative to the working directory).
data_train = pd.read_csv('./data/train.csv')
# NOTE(review): data_test is not referenced in the cells visible here — presumably used
# later for the final predictions; confirm.
data_test = pd.read_csv('./data/test.csv')

### Train/validation split

In [4]:
# Features are every column except the target. Index.difference returns the
# names in sorted order, so the feature columns end up alphabetically ordered.
feature_columns = data_train.columns.difference(['Price'])
X = data_train.loc[:, feature_columns]
y = data_train['Price']

# Hold out 20% of the rows for validation; the seed is fixed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 9254 to 7270
Data columns (total 19 columns):
DistrictId       8000 non-null int64
Ecology_1        8000 non-null float64
Ecology_2        8000 non-null object
Ecology_3        8000 non-null object
Floor            8000 non-null int64
Healthcare_1     4148 non-null float64
Helthcare_2      8000 non-null int64
HouseFloor       8000 non-null float64
HouseYear        8000 non-null int64
Id               8000 non-null int64
KitchenSquare    8000 non-null float64
LifeSquare       6299 non-null float64
Rooms            8000 non-null float64
Shops_1          8000 non-null int64
Shops_2          8000 non-null object
Social_1         8000 non-null int64
Social_2         8000 non-null int64
Social_3         8000 non-null int64
Square           8000 non-null float64
dtypes: float64(7), int64(9), object(3)
memory usage: 1.2+ MB


### Data preparation

In [5]:
def drop_healthcare(ds):
    """Return a copy of ``ds`` without the ``Healthcare_1`` column.

    The input frame is left untouched. Raises ``KeyError`` if the column is
    not present.
    """
    unwanted = ['Healthcare_1']
    return ds.drop(columns=unwanted)

In [6]:
def drop_kitchen(ds):
    """Return a copy of ``ds`` without the ``KitchenSquare`` column.

    The input frame is left untouched. Raises ``KeyError`` if the column is
    not present.
    """
    unwanted = ['KitchenSquare']
    return ds.drop(columns=unwanted)

In [7]:
def set_dummies(ds):
    """One-hot encode the non-numeric columns of ``ds`` with ``pd.get_dummies``.

    Numeric columns pass through unchanged; each encoded column is replaced by
    one indicator column per value, named ``<column>_<value>``.

    NOTE: the indicator columns are derived from the values present in ``ds``
    only, so two frames encoded separately can end up with different columns.
    """
    encoded = pd.get_dummies(data=ds)
    return encoded

In [8]:
def fill_lifesquare(ds, min_valid=8):
    """Return a copy of ``ds`` with implausible ``LifeSquare`` values repaired.

    A ``LifeSquare`` value is replaced by the row's ``Square`` when it is
    missing, smaller than ``min_valid``, or larger than ``Square``. All other
    values are kept as they are.

    Unlike assigning through ``ds.LifeSquare = ...``, this does not modify the
    frame passed in, so the caller's data (and any frame it is a slice of)
    stays intact and the function can be applied repeatedly.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must contain the columns ``LifeSquare`` and ``Square``.
    min_valid : number, default 8
        Smallest ``LifeSquare`` value that is accepted as-is.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, as ``ds``.
    """
    life = ds['LifeSquare']
    total = ds['Square']
    # Missing values are flagged explicitly instead of being routed through a
    # sentinel that merely happens to be below the threshold.
    implausible = life.isna() | (life < min_valid) | (life > total)
    return ds.assign(LifeSquare=np.where(implausible, total, life))

In [9]:
def prepare(ds):
    """Run the full feature-preparation pipeline on ``ds`` and return the result.

    Steps, in order: drop ``Healthcare_1``, drop ``KitchenSquare``, one-hot
    encode the remaining non-numeric columns, then repair ``LifeSquare``.
    """
    return (
        ds
        .pipe(drop_healthcare)
        .pipe(drop_kitchen)
        .pipe(set_dummies)
        .pipe(fill_lifesquare)
    )

In [10]:
# Apply the same preparation pipeline to both splits.
# NOTE: each name is rebound to its prepared version, so this cell is not
# re-runnable on its own: a second run raises KeyError in drop_healthcare
# because 'Healthcare_1' is already gone. Re-run the split cell first.
# NOTE(review): the splits are one-hot encoded independently — verify both end
# up with the same columns.
X_train = prepare(X_train)
X_valid = prepare(X_valid)

In [11]:
# Preview the first rows of the prepared training features.
X_train.head()

Unnamed: 0,DistrictId,Ecology_1,Floor,Helthcare_2,HouseFloor,HouseYear,Id,LifeSquare,Rooms,Shops_1,Social_1,Social_2,Social_3,Square,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
9254,58,0.437885,3,0,1.0,1977,12473,62.798045,1.0,5,23,5735,3,65.271225,0,1,0,1,0,1
1561,146,0.236108,5,3,18.0,2017,14050,43.816601,2.0,10,16,3893,27,45.091598,0,1,0,1,0,1
1670,5,0.150818,3,4,5.0,1960,3779,20.561823,1.0,5,16,3433,4,34.463114,0,1,0,1,0,1
6087,90,0.265089,4,3,5.0,1966,3762,46.126389,3.0,2,37,5288,0,61.931107,0,1,0,1,0,1
6669,1,0.007122,8,0,17.0,1977,358,42.67084,1.0,1,1,264,0,42.67084,0,1,0,1,0,1


### Model: RandomForestRegressor

In [12]:
# Columns fed to the model. 'Id' is present in the prepared frame but is not
# listed here, so it is not used as a feature.
feats = [
    # numeric columns
    'DistrictId',
    'Ecology_1',
    'Floor',
    'Helthcare_2',
    'HouseFloor',
    'HouseYear',
    'LifeSquare',
    'Rooms',
    'Shops_1',
    'Social_1',
    'Social_2',
    'Social_3',
    'Square',
    # one-hot indicator columns
    'Ecology_2_A',
    'Ecology_2_B',
    'Ecology_3_A',
    'Ecology_3_B',
    'Shops_2_A',
    'Shops_2_B',
]

In [13]:
# Random forest with 100 trees, depth capped at 20, fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42,
)
# Fit on the selected feature columns; fit() is left as the last expression so
# the fitted estimator's repr is shown as the cell output.
model.fit(X_train[feats], y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [14]:
def get_prediction(ds, y=None, r2=False, estimator=None, features=None):
    """Predict targets for ``ds`` and optionally print the R² score.

    Parameters
    ----------
    ds : pandas.DataFrame
        Prepared feature frame; must contain every column in ``features``.
    y : array-like, optional
        True target values. Only needed when ``r2=True``.
    r2 : bool, default False
        When true, print ``r2_score(y, pred)`` before returning.
    estimator : object with ``predict``, optional
        Fitted estimator to use. Defaults to the notebook-level ``model``.
    features : list of str, optional
        Columns of ``ds`` passed to the estimator. Defaults to the
        notebook-level ``feats``.

    Returns
    -------
    The estimator's predictions for ``ds``.

    Raises
    ------
    ValueError
        If ``r2=True`` but no ``y`` was supplied.
    """
    if r2 and y is None:
        raise ValueError("y is required when r2=True")

    # Fall back to the notebook globals only when no explicit override is given,
    # so existing calls keep working unchanged.
    fitted = model if estimator is None else estimator
    columns = feats if features is None else features

    pred = fitted.predict(ds.loc[:, columns])
    if r2:
        print(r2_score(y, pred))
    return pred

In [15]:
# In-sample check: predict on the rows the model was fitted on and print R².
pred_train = get_prediction(ds=X_train, y=y_train, r2=True)

0.9606163073117645


In [16]:
# Held-out check: predict on the validation split and print R².
pred_valid = get_prediction(ds=X_valid, y=y_valid, r2=True)

0.7170523342490556
