In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [41]:
# Directory that holds the competition CSV files.
path = "./"

# Read the training and test sets, treating the markers 't' / 'f' as booleans.
# pandas documents `true_values` / `false_values` as lists of strings, so pass
# lists rather than bare strings.
train = pd.read_csv(path + "train.csv", true_values=['t'], false_values=['f'])
test = pd.read_csv(path + "test.csv", true_values=['t'], false_values=['f'])

## Checking & Modifying Column Data Types

In [56]:
# Parse the date-like columns of the training frame into datetime64 values.
# Only `train` is converted in this cell.
for date_col in ('first_review', 'host_since', 'last_review'):
    train[date_col] = pd.to_datetime(train[date_col])

# Show the resulting dtype of every column.
train.dtypes

id                                 int64
log_price                        float64
property_type                     object
room_type                         object
amenities                         object
accommodates                       int64
bathrooms                        float64
bed_type                          object
cancellation_policy               object
cleaning_fee                        bool
city                              object
description                       object
first_review              datetime64[ns]
host_has_profile_pic              object
host_identity_verified            object
host_response_rate                object
host_since                datetime64[ns]
instant_bookable                    bool
last_review               datetime64[ns]
latitude                         float64
longitude                        float64
name                              object
neighbourhood                     object
number_of_reviews                  int64
review_scores_ra

## Calculating NaN %

In [51]:
# Percentage of missing values per column of `train`, keeping only the
# columns that actually contain at least one missing value.
train_nan = (
    train.isnull().sum()
    .div(train.shape[0])
    .mul(100)
    .loc[lambda pct: pct > 0]
)
train_nan

bathrooms                  0.269865
first_review              21.405729
host_has_profile_pic       0.253674
host_identity_verified     0.253674
host_response_rate        24.691341
host_since                 0.253674
last_review               21.355804
neighbourhood              9.272578
review_scores_rating      22.563452
thumbnail_url             11.086074
zipcode                    1.303450
bedrooms                   0.122789
beds                       0.176762
dtype: float64

## One-hot-encode categorical variables

In [None]:
# Tag every row with its origin so the combined frame can be split again later.
train['dataset'] = "train"
test['dataset'] = "test"
data = pd.concat([train,test], axis = 0)

# Collect the distinct amenity names found in the `amenities` column, in
# first-seen order. Each value is split on commas and curly braces are removed
# (presumably the raw format is "{a,b,c}" -- confirm against the CSV).
amenitie_options = []
seen_amenities = set()  # constant-time membership test alongside the ordered list
# dropna(): a missing amenities value would not be a string and has no .split()
for raw_amenities in data.amenities.dropna():
    for token in raw_amenities.split(','):
        token = token.replace('{','').replace('}','')
        # Skip empty tokens, untranslated placeholder entries and repeats.
        if not token or 'translation missing' in token or token in seen_amenities:
            continue
        seen_amenities.add(token)
        amenitie_options.append(token)

for option in amenitie_options:
    print(option)

In [None]:
# Columns holding categorical labels that are expanded into indicator columns.
categorical = ['property_type','room_type','bed_type','cancellation_policy','city']
# Request uint8 indicators explicitly: newer pandas releases default to bool,
# which the numeric-dtype selection used to build the feature matrix would
# silently discard.
data = pd.get_dummies(data, columns = categorical, dtype = np.uint8)
data.columns

In [53]:
# Keep only the numeric columns and replace missing values with 0
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']


def _numeric_matrix(rows):
    """Return the numeric columns of `rows`, minus the target, as an array with NaN -> 0."""
    return (
        rows.select_dtypes(include=numerics)
        .drop("log_price", axis = 1)
        .fillna(0)
        .values
    )


# Feature matrices for the rows that came from each original file.
train_x = _numeric_matrix(data[data.dataset == "train"])
test_x = _numeric_matrix(data[data.dataset == "test"])

# Target values for the training rows.
train_y = data[data.dataset == "train"].log_price.values

In [54]:
# Evaluate a Random Forest regressor with 5-fold cross-validation

from sklearn.model_selection import KFold
regr = RandomForestRegressor(n_estimators = 10, random_state = 0)
cv_groups = KFold(n_splits=5)

for fit_rows, holdout_rows in cv_groups.split(train_x):
    # Fit on the rows assigned to training for this split
    regr.fit(train_x[fit_rows], train_y[fit_rows])

    # Predict the rows held out of this split
    pred_rf = regr.predict(train_x[holdout_rows])

    # Root-mean-squared error on the held-out rows
    residuals = train_y[holdout_rows] - pred_rf
    rmse = str(np.sqrt(np.mean(residuals ** 2)))

    print("RMSE for current split: " + rmse)

RMSE for current split: 0.422514098591
RMSE for current split: 0.419582744545
RMSE for current split: 0.413761925371
RMSE for current split: 0.421351916657
RMSE for current split: 0.424273559012


In [14]:
# Create submission file
# Refit on the full training set, then predict the test rows.
regr.fit(train_x, train_y)
final_prediction = regr.predict(test_x)

# Build the frame column by column so each column keeps its own dtype.
# np.column_stack would merge ids and float predictions into one float array,
# so integer ids (id is int64 in train; presumably the same in test) would be
# written as e.g. "123.0".
submission = pd.DataFrame(
    {'id': test.id.values, 'log_price': final_prediction},
    columns = ['id','log_price'],
)
submission.to_csv("sample_submission.csv", index = False)