In [2]:
# Load required libraries and datasets

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

# Read the train and test splits from the CSV files under `path`.
path = "./"
train, test = (
    pd.read_csv(path + file_name) for file_name in ("train.csv", "test.csv")
)

## Combine train and test, and drop columns that are not required

In [3]:
# Stack train and test into a single frame so that later transformations are
# applied to both at once. The `dataset` column records where each row came
# from so the two parts can be separated again afterwards.
train['dataset'] = "train"
test['dataset'] = "test"
data = pd.concat([train, test], axis=0)

# DataFrame.drop returns a new frame instead of modifying `data` in place, so
# the result must be assigned back -- otherwise the columns are never removed.
columns_to_drop = ['description', 'name', 'neighbourhood', 'thumbnail_url']
data = data.drop(columns=columns_to_drop)

In [None]:
# One-hot-encode categorical variables.
# The dummy dtype is pinned to uint8 on purpose: pandas >= 2.0 produces bool
# dummy columns by default, and bool is not part of the numeric dtype list used
# to select feature columns, so the encoded columns would be silently left out
# of the feature matrices.
categorical = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city']
data = pd.get_dummies(data, columns=categorical, dtype=np.uint8)

In [6]:
# Build the model inputs: keep only columns with a numeric dtype, remove the
# target column, and replace missing values with 0.
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']


def _feature_matrix(rows):
    """Return the numeric, non-target columns of `rows` as an array with NaNs set to 0."""
    numeric_cols = rows.select_dtypes(include=numerics)
    features = numeric_cols.drop(columns=["log_price"])
    return features.fillna(0).values


# Separate the combined frame back into its two origins.
train_rows = data[data.dataset == "train"]
test_rows = data[data.dataset == "test"]

# NOTE(review): every numeric column except log_price becomes a feature; if the
# `id` column is numeric it is included as well -- confirm that is intended.
train_x = _feature_matrix(train_rows)
test_x = _feature_matrix(test_rows)

# Target values, taken from the training rows only.
train_y = train_rows.log_price.values

In [None]:
from sklearn.model_selection import KFold

# Candidate values for the forest's min_samples_leaf hyper-parameter.
sample_leaf_options = [1,5,10,50,100,300,500,700,1000]

# The splitter does not depend on the candidate, so build it once. Without
# shuffling, KFold yields the same folds for every candidate, which keeps the
# per-split RMSEs comparable across leaf sizes.
cv_groups = KFold(n_splits=10)

# Mean cross-validated RMSE for each candidate, filled in by the loop below.
cv_rmse_by_leaf_size = {}

for leaf_size in sample_leaf_options:
    print('Leaf Size: {}'.format(leaf_size))

    # max_features=1.0 (consider all features at each split) is what "auto"
    # meant for regressors; the "auto" spelling was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it raises an error.
    regr = RandomForestRegressor(
        n_estimators=200,
        oob_score=True,
        n_jobs=-1,
        random_state=50,
        max_features=1.0,
        min_samples_leaf=leaf_size,
    )

    split_rmses = []
    for train_index, test_index in cv_groups.split(train_x):

        # Train the model on the training part of the split
        regr.fit(train_x[train_index], train_y[train_index])

        # Make predictions on the held-out part of the split
        pred_rf = regr.predict(train_x[test_index])

        # Calculate RMSE for current cross-validation split
        split_rmse = np.sqrt(np.mean((train_y[test_index] - pred_rf) ** 2))
        split_rmses.append(split_rmse)
        rmse = str(split_rmse)

        print("RMSE for current split: " + rmse)

    # Summarise the candidate with a single number so that leaf sizes can be
    # compared directly instead of by eyeballing ten values each.
    cv_rmse_by_leaf_size[leaf_size] = np.mean(split_rmses)
    print("Mean RMSE: " + str(cv_rmse_by_leaf_size[leaf_size]))

# Candidate with the lowest mean cross-validated RMSE.
best_leaf_size = min(cv_rmse_by_leaf_size, key=cv_rmse_by_leaf_size.get)
print('Best Leaf Size: {}'.format(best_leaf_size))

Leaf Size: 1
RMSE for current split: 0.414398200688
RMSE for current split: 0.390431579841
RMSE for current split: 0.396590483988
RMSE for current split: 0.404026339999
RMSE for current split: 0.387322741676
RMSE for current split: 0.395154636037
RMSE for current split: 0.403544358305
RMSE for current split: 0.403117079208
RMSE for current split: 0.397940898955
RMSE for current split: 0.409196291087
Leaf Size: 5
RMSE for current split: 0.411393163314
RMSE for current split: 0.387167425046
RMSE for current split: 0.394739765627
RMSE for current split: 0.401223368293
RMSE for current split: 0.385406915374
RMSE for current split: 0.392627301948
RMSE for current split: 0.399267096317
RMSE for current split: 0.397889454815
RMSE for current split: 0.395358468653
RMSE for current split: 0.405571869642
Leaf Size: 10
RMSE for current split: 0.411920662091
RMSE for current split: 0.388421526585
RMSE for current split: 0.395006967568
RMSE for current split: 0.401950303251
RMSE for current split: 

In [None]:
# Create submission file

# Build the final model explicitly instead of refitting whatever estimator the
# cross-validation loop happened to leave behind (that one carries the last
# candidate tried, min_samples_leaf=1000, not the best one).
# In the recorded cross-validation output, leaf size 5 had a lower RMSE than
# leaf sizes 1 and 10 on every split shown; that log is truncated, so
# re-check this value against a complete run before relying on it.
FINAL_MIN_SAMPLES_LEAF = 5

# max_features=1.0 (all features) is the regressor meaning of the removed
# "auto" option, so the remaining hyper-parameters match the CV models.
regr = RandomForestRegressor(
    n_estimators=200,
    oob_score=True,
    n_jobs=-1,
    random_state=50,
    max_features=1.0,
    min_samples_leaf=FINAL_MIN_SAMPLES_LEAF,
)
regr.fit(train_x, train_y)
final_prediction = regr.predict(test_x)

# Build the frame column by column so `id` keeps its own dtype. Stacking it
# with the float predictions in one array would coerce the ids to float and
# they would be written as e.g. "123.0".
submission = pd.DataFrame(
    {'id': test.id.values, 'log_price': final_prediction},
    columns=['id', 'log_price'],
)
submission.to_csv("sample_submission.csv", index=False)