https://ga.greyatom.com/learn/competitions/stayze-rent-predcition-tr1


##  Stayze Hackathon   Team NewShoots

### Version 5.0  -V.40  2020.10.09   -  Multiple Algorithms

In [138]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os.path

In [139]:
# for Linear regression using scikit learn

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [140]:
# For XG Boost algo

from xgboost import XGBRegressor 
import xgboost as xgb

###  Change the below details as per your installation

In [141]:
filepath     = 'd:/DataScience/GREY-ATOM/09-25-Hackaton-Due-Oct-10-stayze-rent/'
trainfile    = 'train.csv'
testfile     = 'test.csv'
enggdata     = 'engg-features.csv'           # to be created via another code, with just engineered values from train dataset
enggdatatest = 'engg-features-test.csv'      # to be created via another code, with just engineered values from test dataset
trackresults = 'modelresults.csv'

### EDA Routines 

In [142]:
#EDA Missing data

def missing_data( dfr) :      # While this is done, it is redendant as the columns will be dropped
    #EDA 1
    dfr.name.fillna('*** Get Name Updated ***', inplace = True)    # Fixed the missing names in the data set

    #EDA 2
    dfr.last_review.fillna('01-01-2000', inplace = True)           # FIXED missing last review

    #EDA 3
    dfr.reviews_per_month.fillna(0, inplace = True)                # Fixed missing reviews/month

    #EDA 4
    dfr.host_name.fillna('*** Enter Host Name Here ***', inplace = True)
    
    return dfr


In [143]:
#EDA Outliers

def outlier_data ( dfr, rmax, rmin, daymax):
    #EDA 6 - Remove rent outliers
    dfr.drop((dfr[dfr.price >= rmax].index), axis = 0, inplace = True)  
    dfr.drop (dfr[dfr.price <= rmin].index, axis = 0, inplace = True)
   
    #EDA 7  - Remove day stay outliers
    dfr.drop(dfr[dfr.minimum_nights > daymax].index, axis = 0, inplace = True)
    
    return dfr


##  
## When cycling with different options START FROM HERE

#### if run for the first time, then create a empty template of model results, else load the previous results

In [144]:
# Load Previous Results 

if os.path.isfile(filepath + trackresults) :
    modelresults = pd.read_csv (filepath + trackresults)
    print (' Previous ', modelresults.shape[0], '  results loaded')
else :
    modelresults = pd.DataFrame(columns= ('Model','Comments', 'Engg_Feat', 'Train_RSME', 'Train_R2', 'Test_RMSE', 'Test_R2','One Hot','Features'))
    print (' New Results Tracking file created ')

 Previous  7   results loaded


In [145]:
# Read Data set

file_name = filepath + trainfile

dfr = pd.read_csv(file_name)

org_count = dfr.shape[0]

print ('\n\n', 'Number of observations in original data file', org_count, '\n\n')



 Number of observations in original data file 34226 




### Set the below semaphore to true, if the engineered features are to be included

In [146]:
add_engg_features = False
remove_outliers   = True

comments = 'Rent 20 to 500, min days < 150 '

# ENSURE THAT THE OUTLIER NOTES FIELD IS UPDATED WITH MEANINGFUL COMMENTS, THIS gets written to the output file for analysis

#### Model info for CLEANSING & TRACKING purposes

In [147]:
# UPDATE THIS CELL 

rentmax = 500
rentmin = 20
daysmax = 150
comments = 'Rent 20 to 500, min days < 150 '
comments_ols = comments 
# ENSURE THAT THE OUTLIER NOTES FIELD IS UPDATED WITH MEANINGFUL COMMENTS, THIS gets written to the output file for analysis

####  Data Preperation

In [148]:
if add_engg_features :      # Add the featured engineered columns
    engg_f = pd.read_csv(filepath + enggdata)
    dfr = pd.concat([dfr, engg_f], axis = 1)

In [149]:
# fill missing values
dfr =  missing_data (dfr)

# remove outliers
if  remove_outliers :
    dfr = outlier_data(dfr, rentmax, rentmin, daysmax)            # dataset, max rent, min rent, max days
    print ('Dropped ', org_count - dfr.shape[0], ' observations; about ~ ', round((1-(dfr.shape[0]/org_count))*100,2) ,' % of the train data')


Dropped  1025  observations; about ~  2.99  % of the train data


In [150]:
df = dfr.reset_index().copy()  # reset the index, as dropping records leaves gaps in the index. This eliminates a warning 

In [151]:
# The above df is clean; no obs removed;

# The approach for LR is as follows;
#- One hot neighbourhood group
#- One hot room type
#- ignore (drop) neighbourhood
#- drop non value adding col; 'id', 'name', 'host_id', 'host_name',
#- Drop col neighbourhood group & room type as they are One Hotted

# rest are numeric and can be used for LR

#### Identify Features and Usage

In [152]:
allfeatures = df.columns
dep_col =    ['price']
onehot_col = ['neighbourhood_group', 'room_type']

drop_col =   ['id', 'name', 'host_id', 'host_name', 'price',  'last_review', 'index', 'neighbourhood'] 
# with reset_index the old index is added as additional col with name 'index'. This is also dropped

ind_col = list(set(allfeatures) - set(dep_col) - set(onehot_col) - set(drop_col))


#### Drop columns that are not relevant, index, and dep feature

In [153]:
# seperate the dependent variable
dfy = df[dep_col]  

# remove features that are not contributing to the outcome.
df = df.drop(drop_col , axis = 1)


##### One hot ordinal features.

In [154]:
onehot_df = pd.get_dummies(df[onehot_col])

In [155]:
df.drop(onehot_col, axis = 1, inplace = True)

In [156]:
df = pd.concat([df, onehot_df], axis = 1)

#### Drop the Dummy Variable from one hot

In [157]:
df.drop(['neighbourhood_group_Staten Island'], axis = 1, inplace = True)  
df.drop([ 'room_type_Shared room'], axis = 1, inplace = True)  

#### setup Sklearn for Linear Regression

#### Evaluation Metrics: Evaluate the performance of the model using two metrics - R-squared value and Root Mean Squared Error (RMSE).

R-squared values range from 0 to 1 and are commonly stated as percentages. It is a statistical measure that represents the proportion of the variance for a target variable that is explained by the independent variables. The other commonly used metric for regression problems is RMSE, that measures the average magnitude of the residuals or error. We will be using both these metrics to evaluate the model performance.

Ideally, lower RMSE and higher R-squared values are indicative of a good model.

In [158]:
print('Percentage of original data used :', round(df.shape[0]/org_count * 100, 2))

Percentage of original data used : 97.01


#####  Apply Ordinary Least Square Linear Regression Model from SciKit Learn

In [159]:
X_train, X_test, y_train, y_test = train_test_split(df,dfy, test_size=0.2, random_state=100)

In [160]:
# Model OLS
algo = 'OLS Linear Regression'

In [161]:
# Instantiate LR model
lrm = LinearRegression()

In [162]:
lrm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [163]:
ols_pred = lrm.predict(X_test)

In [164]:
rmserr = np.sqrt(mean_squared_error(y_test,ols_pred))

In [165]:
print ('\n'*2, '_'*120, '\n\n',
       '    The error after running Ordinary Least Square Linear Regression ', round((rmserr),2),
      '\n', '_'*120, '\n'*3,)



 ________________________________________________________________________________________________________________________ 

     The error after running Ordinary Least Square Linear Regression  65.43 
 ________________________________________________________________________________________________________________________ 





In [166]:
#Determine the R2 value

In [167]:
r2 = r2_score(y_test, ols_pred)

In [168]:
print ('\n'*2, '_'*120, '\n\n',
       '    The R2 Score is ', round(r2,2),
      '\n', '_'*120, '\n'*3,)



 ________________________________________________________________________________________________________________________ 

     The R2 Score is  0.43 
 ________________________________________________________________________________________________________________________ 





In [169]:
train_e = np.sqrt(mean_squared_error(y_train, lrm.predict(X_train)))
train_r = r2_score(y_train,lrm.predict(X_train) )
test_e = np.sqrt(mean_squared_error(y_test, lrm.predict(X_test)))
test_r = r2_score(y_test,lrm.predict(X_test) )


In [170]:
modelresults.loc[len(modelresults)] = [algo,comments, add_engg_features, train_e, train_r, test_e, test_r, onehot_col, ind_col] 

In [171]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
7,OLS Linear Regression,"Rent 20 to 500, min days < 150",False,63.75,0.43,65.43,0.43,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


#### Apply RIDGE Regession & Evalation

In [51]:
from sklearn.linear_model import Ridge
algo = 'Ridge'

In [52]:
ridge = Ridge(alpha = .01)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict (X_test)

In [53]:
train_e = np.sqrt(mean_squared_error(y_train, ridge.predict(X_train)))
train_r = r2_score(y_train,ridge.predict(X_train) )
test_e = np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))
test_r = r2_score(y_test,ridge.predict(X_test) )

In [54]:
modelresults.loc[len(modelresults)] = [algo, comments, add_engg_features, train_e, train_r, test_e, test_r, onehot_col, ind_col] 

In [55]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
2,Ridge,"Rent 20 to 500, min days < 150",True,63.72,0.43,65.35,0.43,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


#### Apply LASSO Regession & Evalation

In [56]:
from sklearn.linear_model import Lasso
algo = 'Lasso'

In [57]:
lasso = Lasso(alpha = 0.01)
lasso.fit (X_train, y_train)
lasso_pred = lasso.predict (X_test)

In [58]:
train_e = np.sqrt(mean_squared_error(y_train, lasso.predict(X_train)))
train_r = r2_score(y_train,lasso.predict(X_train) )
test_e = np.sqrt(mean_squared_error(y_test, lasso.predict(X_test)))
test_r = r2_score(y_test,lasso.predict(X_test) )

modelresults.loc[len(modelresults)] = [algo, comments, add_engg_features, train_e, train_r, test_e, test_r, onehot_col, ind_col] 


In [59]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
3,Lasso,"Rent 20 to 500, min days < 150",True,63.72,0.43,65.38,0.43,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


#### Apply ElasticNet Regession & Evaluation





In [60]:
from sklearn.linear_model import ElasticNet
algo = 'ElasticNet'

In [61]:
elast = ElasticNet(alpha = 0.05)
elast.fit (X_train, y_train)
elast_pred = elast.predict (X_test)

In [62]:
train_e = np.sqrt(mean_squared_error(y_train, elast.predict(X_train)))
train_r = r2_score(y_train,elast.predict(X_train) )
test_e  = np.sqrt(mean_squared_error(y_test, elast.predict(X_test)))
test_r  = r2_score(y_test,elast.predict(X_test) )

modelresults.loc[len(modelresults)] = [algo, comments, add_engg_features, train_e, train_r, test_e, test_r, onehot_col, ind_col] 

In [63]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
4,ElasticNet,"Rent 20 to 500, min days < 150",True,65.02,0.41,66.95,0.4,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


#### Apply XG Boost & Evaluation

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [64]:
algo = 'XGBoost'
#comments = "Orginal .7 5 8 3 200 13 dep =12 .1 .1 1 1"
comments = "Reduce Overfit .5 5 8 3 200 chldwt:25 dep:5  Lr:.08 .8 .8"
xgboost = XGBRegressor(objective='reg:squarederror',
                       subsample = 0.5,                        # low value prevent OF, too low leads  to UF  0.5 - 1
                       scale_pos_weight= 5,                    # > 0 used in case of high class imbalance, helps convergence.
                       reg_lambda= 8 ,                         # check usage - 
                       gamma = 3,                              # min loss fun reductio reqd to split the node
                       n_estimators = 200, 
                       min_child_weight= 25,                   # Low values leads to OF 
                       max_depth = 5 ,                         # High value leads to OF  3-10 
                       learning_rate = 0.08,                    # .01 to .2
                       colsample_bytree = .8 ,
                       colsample_bynode= .8,
                       eval_metric = 'rmse')


xgboost.fit(X_train,y_train)
xgboost_pred = xgboost.predict(X_test)


In [65]:
train_e = np.sqrt(mean_squared_error(y_train, xgboost.predict(X_train)))
train_r = r2_score(y_train,xgboost.predict(X_train) )
test_e = np.sqrt(mean_squared_error(y_test, xgboost.predict(X_test)))
test_r = r2_score(y_test, xgboost.predict(X_test) )

In [66]:
modelresults.loc[len(modelresults)] = [algo, comments, add_engg_features, train_e, train_r, test_e, test_r, onehot_col, ind_col] 

In [67]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
5,XGBoost,Reduce Overfit .5 5 8 3 200 chldwt:25 dep:5 L...,True,54.83,0.58,59.02,0.53,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


####  XGB with Cross Validation 

In [68]:
xgb_dm = xgb.DMatrix(data=X_train,label= y_train)
algo = 'XGBoost CV - 10 Fold '
#comments = "Reduce Overfit .7 5 8 3 200 13 dep =5  .1 .8 .8"
comments = "Reduce Overfit .5 5 8 3 200 chldwt:25 dep:5  Lr:.08 .8 .8"

In [69]:
xgb_params = {'objective':'reg:squarederror',
              'subsample' : 0.5,
              'scale_pos_weight': 5,
              'lambda' : 8, 
              'gamma': 3 ,
            
              'min_child_weight' :  25, 
              'max_depth': 5,
              'learning_rate': 0.08,              
              'colsample_bytree': .8,
              'colsample_bynode' : .8 }

In [70]:
xgb_cv = xgb.cv(dtrain=xgb_dm, 
                params= xgb_params, 
                nfold=10,
                num_boost_round=100,
                early_stopping_rounds=10,
                metrics="rmse", 
                as_pandas=True, 
                seed=77)

In [71]:
train_e = xgb_cv.tail(1).iloc[0,0]
train_r = 0
test_e = xgb_cv.tail(1).iloc[0,2]
test_r = 0

In [72]:
modelresults.loc[len(modelresults)] = [algo, comments, add_engg_features, train_e, train_r, test_e, test_r, onehot_col, ind_col] 

In [73]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
6,XGBoost CV - 10 Fold,Reduce Overfit .5 5 8 3 200 chldwt:25 dep:5 L...,True,56.5,0.0,58.41,0.0,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


### Save the Results in the TrackResults.Csv file

In [74]:
# if this is the first time the program is run, then just write out the model results into a csv, else first delete
if os.path.isfile(filepath + trackresults) :
    os.remove (filepath + trackresults)

modelresults.to_csv(filepath + trackresults, index =False)

# --------------------------------------------------------------------------------------------------------------
# Validated Till Above Only  - Dont execute the code below
# --------------------------------------------------------------------------------------------------------------


##### USE KFOLD Validation from SciKit Learn to run Linear Regression Multiple times, with different train/test combinations

In [76]:
from sklearn.model_selection import KFold

In [93]:
algo = 'Kfold CV on OLS'
comments = comments_ols                                                               # Revert to original comments
kferr = pd.DataFrame(columns = ('Train_rmse', 'Train_R2', 'Test_rmse', 'Test_R2' ))
cv = KFold (n_splits = 10, random_state = 77)
lrmk = LinearRegression()

for trainind, testind in cv.split (df) :
    
    xtrain = df.iloc[trainind]
    ytrain = dfy.iloc[trainind]
    xtest  = df.iloc[testind]
    ytest  = dfy.iloc[testind]
    
    lrmk.fit(xtrain, ytrain)
    
    ypred = lrmk.predict (xtest)

    train_e = np.sqrt(mean_squared_error(ytrain, lrmk.predict(xtrain)))
    train_r = r2_score(ytrain,lrmk.predict(xtrain) )
    test_e = np.sqrt(mean_squared_error(ytest, lrmk.predict(xtest)))
    test_r = r2_score(ytest,lrmk.predict(xtest) )

    
    kferr.loc[len(kferr)] = [train_e, train_r, test_e, test_r]
    
    

In [94]:
modelresults.loc[len(modelresults)] = [algo, comments, add_engg_features, np.mean(kferr.iloc[:,0]), 
                                                                          np.mean(kferr.iloc[:,1]), 
                                                                          np.mean(kferr.iloc[:,2]), 
                                                                          np.mean(kferr.iloc[:,3]),
                                                                          onehot_col, ind_col]

In [95]:
round(modelresults.tail(1),2)

Unnamed: 0,Model,Comments,Engg_Feat,Train_RSME,Train_R2,Test_RMSE,Test_R2,One Hot,Features
8,Kfold CV on OLS,"Rent 20 to 500, min days < 150",True,64.04,0.43,64.07,0.43,"[neighbourhood_group, room_type]","[longitude, reviews_per_month, number_of_revie..."


### Run the test file through the above model, and check the prediction


In [172]:
dftest = pd.read_csv(filepath + testfile)

In [None]:
if add_engg_features :      # Add the featured engineered columns
    engg_f = pd.read_csv(filepath + enggdatatest)
    dftest = pd.concat([dftest, engg_f], axis = 1)

In [173]:
# fill missing values
dftest = missing_data (dftest)


In [174]:
# remove data with max night outliers
dftest.drop(dftest[dftest.minimum_nights > daysmax].index, axis = 0, inplace = True)
drop_col.remove('price')

dft  = dftest.reset_index().copy()  # reset the index, as dropping fes records leaves gaps in the index. elimates a warning 

dft_org = dft.copy()  # keep a copy to analyse post rent prediction.

dft = dft.drop(drop_col , axis = 1)

# remove features that are not ontributing to the outcome.

onehot_dft = pd.get_dummies(dft[onehot_col])

dft.drop(onehot_col, axis = 1, inplace = True)

dft = pd.concat([dft, onehot_dft], axis = 1)

#### Drop the Dummy Variable from one hot

dft.drop(['neighbourhood_group_Staten Island'], axis = 1, inplace = True)  
dft.drop([ 'room_type_Shared room'], axis = 1, inplace = True)  


####   USe the above processed test data file to predictprice using the lrm LR Model from above

In [183]:
predrent = lrm.predict(dft).squeeze()


In [186]:
predrent = pd.Series(predrent)

In [187]:
dft_org = pd.concat([dft_org, predrent], axis = 1)
dft_org.rename(columns = { 0 : 'price'}, inplace = True)

In [188]:
dft_org.to_csv(filepath + 'Testwithprice.csv' , index = False)

### Analysis of the Preddicted Rent

In [None]:
# there are few -ve rents predicted

In [189]:
dft_org[dft_org.price < 0]['price'].count()  # 13 predicted rents are in the -ve !!!

13

In [190]:
print ('\n'*2, '_'*120, '\n',
       'Negative rent prediction as percent of total predictions : ', 
        round(dft_org[dft_org.price < 0]['price'].count() / dft_org.shape[0]*100,2),'%',
       '\n', '_'*120, '\n\n')



 ________________________________________________________________________________________________________________________ 
 Negative rent prediction as percent of total predictions :  0.09 % 
 ________________________________________________________________________________________________________________________ 




In [191]:
print(dft_org[dft_org.price < 0][['neighbourhood', 'room_type', 'price']].sort_values([ 'room_type', 'price']))

             neighbourhood     room_type      price
11404              Jamaica  Private room -25.464896
7099               Astoria  Private room -24.711014
6121              Bushwick  Private room -22.464179
9412   Springfield Gardens  Private room -16.773838
14058             Bushwick  Private room -13.295838
9460   Springfield Gardens  Private room -12.093461
12009              Jamaica  Private room  -7.375107
1095        Queens Village  Private room  -5.200617
10128          Kew Gardens  Private room  -4.680602
13125         Forest Hills  Private room  -3.128846
3083               Bayside  Private room  -2.133044
5811           East Harlem   Shared room -43.597105
1190               Jamaica   Shared room -18.445399


In [192]:
dft_org[dft_org.price > 0].pivot_table(values = ['price'],
                   index   = ['neighbourhood_group', 'neighbourhood'],
                   columns = ['room_type'],
                   aggfunc = {'price' : [min, max, np.mean]}
                    
                   )

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,price,price,price,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,max,max,max,mean,mean,mean,min,min,min
Unnamed: 0_level_2,room_type,Entire home/apt,Private room,Shared room,Entire home/apt,Private room,Shared room,Entire home/apt,Private room,Shared room
neighbourhood_group,neighbourhood,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Bronx,Allerton,150.096426,60.622618,,129.978288,37.395083,,118.916911,24.815114,
Bronx,Baychester,,49.800700,,,49.800700,,,49.800700,
Bronx,Belmont,145.381788,70.407979,,145.381788,51.906268,,145.381788,36.213095,
Bronx,Bronxdale,134.092132,55.339975,35.867924,128.765798,47.301428,35.867924,123.439464,39.262881,35.867924
Bronx,Castle Hill,134.913425,,,126.196505,,,117.479585,,
Bronx,City Island,107.359373,,,103.902558,,,101.483793,,
Bronx,Claremont Village,77.877351,79.241730,,70.416762,57.373639,,62.956172,36.398468,
Bronx,Clason Point,146.280824,68.296211,,135.969071,58.851345,,125.657318,32.617224,
Bronx,Concourse,166.199302,90.110561,,145.546320,65.551228,,120.752979,46.001682,
Bronx,Concourse Village,146.592523,73.477168,27.988482,143.625279,60.384002,27.988482,140.658035,50.236055,27.988482


deck 
our story line
 - we worked with the given data ; poor results
 - we added multiple models ; not muh improvement

- added the distance to the nearest subway station ; good improvemet but not sufficient

so reco:

- current data is insufficient to prdict rent with a high degree of confidence
- recommend that we use additional features like transporattion, crime rate, etc to find relevant features