In [1]:
import pandas as pd 
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR

In [3]:
df = pd.read_csv('airbnb_final_dataset.csv')
df.head()

Unnamed: 0,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,neighbourhood_cleansed,price,cleaning_fee,...,heating,internet,kitchen,microwave,outdoor parking,private bathroom,self check-in,tv,washer,wifi
0,6,2,3,1,1,2,4,12,139,50,...,1,1,0,1,0,1,0,1,0,1
1,1,0,3,1,1,2,4,12,215,80,...,1,0,1,0,0,0,1,1,0,1
2,6,2,1,1,1,0,4,1,98,30,...,1,1,1,1,0,0,0,1,0,1
3,6,2,4,1,1,2,4,12,199,75,...,1,1,0,1,0,1,0,1,0,1
4,1,0,2,1,1,2,4,1,150,0,...,1,0,1,1,0,0,0,1,1,1


In [4]:
X = df.drop(['price'], axis = 1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Models

Linear Regression Model

In [5]:
model = LinearRegression()

model.fit(X_train, y_train)
print('Training Accuracy: ', model.score(X_train, y_train))
print('Test Accuracy: ', model.score(X_test, y_test))

Training Accuracy:  0.6505125046826657
Test Accuracy:  0.5482975510579896


Random Forest  Model

In [6]:
model = RandomForestRegressor(n_estimators=500, max_depth=11, bootstrap=True, max_features='sqrt', min_samples_leaf=1, min_samples_split=4, random_state=123)

model.fit(X_train, y_train)
print('Training Accuracy: ', model.score(X_train, y_train))
print('Test Accuracy: ', model.score(X_test, y_test))

Training Accuracy:  0.9039228860172568
Test Accuracy:  0.6214825438505851


## Feature selection using SelectFromModel
### Selecting Features based on Importance Weights

Linear Regression Model with Feature Selection

In [7]:
sel = SelectFromModel(model, prefit=True)
select_X_train = sel.transform(X_train)
select_X_test = sel.transform(X_test)
sel_model = LinearRegression()
sel_model.fit(select_X_train, y_train)
print('Training Accuracy: ', sel_model.score(select_X_train, y_train))
print('Test Accuracy: ', sel_model.score(select_X_test, y_test))

Training Accuracy:  0.6120223226664827
Test Accuracy:  0.5004431365357028


In [8]:
feature_idx = sel.get_support()
feature_idx

array([False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

Random Forest Model with Feature Selection

In [9]:
sel = SelectFromModel(model, prefit=True)

select_X_train = sel.transform(X_train)
select_X_test = sel.transform(X_test)

sel_model = RandomForestRegressor(n_estimators=500, max_depth=11, bootstrap=True, max_features='sqrt', min_samples_leaf=1, min_samples_split=4, random_state=123)
sel_model.fit(select_X_train, y_train)
print('Training Accuracy: ', sel_model.score(select_X_train, y_train))
print('Test Accuracy: ', sel_model.score(select_X_test, y_test))

Training Accuracy:  0.86701357983464
Test Accuracy:  0.5996749703586233


In [10]:
select_X_train[0]

array([ 5.,  2.,  3.,  3., 75.,  1., 18.])

In [11]:
len(X_train)

980

In [12]:
feature_idx = sel.get_support()
feature_idx

array([False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

## Recursive feature elimination

In [13]:
estimator = SVR(kernel="linear")
selector = RFE(estimator, 30, step=1)
selector = selector.fit(X_train, y_train)
selector.support_
print("Training accuracy:", selector.score(X_train, y_train))
print("Testing accuracy:", selector.score(X_test, y_test))

Training accuracy: 0.5955274448688165
Testing accuracy: 0.4935414039660088


In [14]:
selector.support_

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])