In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV

%config InlineBackend.figure_format = "svg"

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009,0.0,1.0


In [4]:
data.shape

(74111, 29)

In [5]:
df, validation_df  = train_test_split(data,
                               test_size=0.25,
                               random_state = 101)

In [6]:
df_train, df_test  = train_test_split(df,
                               test_size=0.25,
                               random_state = 101)

# info

In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13896 entries, 4281 to 51739
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      13896 non-null  int64  
 1   log_price               13896 non-null  float64
 2   property_type           13896 non-null  object 
 3   room_type               13896 non-null  object 
 4   amenities               13896 non-null  object 
 5   accommodates            13896 non-null  int64  
 6   bathrooms               13860 non-null  float64
 7   bed_type                13896 non-null  object 
 8   cancellation_policy     13896 non-null  object 
 9   cleaning_fee            13896 non-null  bool   
 10  city                    13896 non-null  object 
 11  description             13896 non-null  object 
 12  first_review            11006 non-null  object 
 13  host_has_profile_pic    13867 non-null  object 
 14  host_identity_verified  13867 non-n

# Finding missing values

In [8]:
df_test.isnull().sum()

id                           0
log_price                    0
property_type                0
room_type                    0
amenities                    0
accommodates                 0
bathrooms                   36
bed_type                     0
cancellation_policy          0
cleaning_fee                 0
city                         0
description                  0
first_review              2890
host_has_profile_pic        29
host_identity_verified      29
host_response_rate        3388
host_since                  29
instant_bookable             0
last_review               2882
latitude                     0
longitude                    0
name                         0
neighbourhood             1300
number_of_reviews            0
review_scores_rating      3046
thumbnail_url             1549
zipcode                    186
bedrooms                    23
beds                        30
dtype: int64

# Correlation

In [9]:
df_test.corr()

Unnamed: 0,id,log_price,accommodates,bathrooms,cleaning_fee,latitude,longitude,number_of_reviews,review_scores_rating,bedrooms,beds
id,1.0,-0.011693,-5e-05,-0.00774,0.00373,0.006692,0.011101,-0.00541,0.006822,0.001691,-0.002001
log_price,-0.011693,1.0,0.565343,0.347248,0.121777,0.003362,-0.04325,-0.031786,0.093349,0.484767,0.452548
accommodates,-5e-05,0.565343,1.0,0.490884,0.18895,-0.073857,-0.08026,0.037285,-0.012337,0.701503,0.811189
bathrooms,-0.00774,0.347248,0.490884,1.0,0.053514,-0.127524,-0.125467,-0.046195,0.011151,0.578114,0.517307
cleaning_fee,0.00373,0.121777,0.18895,0.053514,1.0,-0.065813,-0.074338,0.112034,0.037779,0.114016,0.143235
latitude,0.006692,0.003362,-0.073857,-0.127524,-0.065813,1.0,0.896456,-0.018672,-0.040265,-0.059292,-0.083189
longitude,0.011101,-0.04325,-0.08026,-0.125467,-0.074338,0.896456,1.0,-0.047175,-0.054929,-0.073124,-0.08078
number_of_reviews,-0.00541,-0.031786,0.037285,-0.046195,0.112034,-0.018672,-0.047175,1.0,0.016425,-0.038856,0.030966
review_scores_rating,0.006822,0.093349,-0.012337,0.011151,0.037779,-0.040265,-0.054929,0.016425,1.0,0.021525,-0.022357
bedrooms,0.001691,0.484767,0.701503,0.578114,0.114016,-0.059292,-0.073124,-0.038856,0.021525,1.0,0.707111


# Date

In [10]:
#df["last_review"] = pd.to_datetime(df["last_review"])
#df["first_review"] = pd.to_datetime(df["first_review"])
#df["host_since"] = pd.to_datetime(df["host_since"])

# Replac missing values with previous date

In [11]:
df_test.last_review.fillna(method="ffill",inplace=True)
df_test.first_review.fillna(method="ffill",inplace=True)
df_test.host_since.fillna(method="ffill",inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


# Handling Missing Values 

> #### 1. Bathrooms

In [12]:
df_test.bathrooms.unique()

array([1. , 4. , 2. , 1.5, 0. , 0.5, 2.5, 3. , nan, 3.5, 5. , 8. , 4.5,
       5.5, 6. , 7.5])

^ It is not logical for the bathrooms to be the flutes numbers, because in fact, for example, there is no bathroom and a half!

In [13]:
#df_train["bathrooms"]=df_train["bathrooms"].round()

In [14]:
df_test["bathrooms"].mode()

0    1.0
dtype: float64

In [15]:
df_test["bathrooms"].mean()

1.2413059163059164

In [16]:
df_test["bathrooms"] = df_test['bathrooms'].fillna(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["bathrooms"] = df_test['bathrooms'].fillna(1)


In [17]:
#df_train["bathrooms"] = df_train['bathrooms'].fillna(df_train["bathrooms"].mode())

In [18]:
df_test["bathrooms"].value_counts()

1.0    10890
2.0     1541
1.5      672
2.5      286
3.0      223
3.5       85
4.0       51
0.5       43
0.0       41
4.5       23
5.0       17
8.0       11
6.0        7
5.5        5
7.5        1
Name: bathrooms, dtype: int64

> #### 2. review

Missing values in column "review_scores_rating" are related to column "number_of_reviews".

In [19]:
df_test[["number_of_reviews","review_scores_rating"]][df_test.number_of_reviews == 0]

Unnamed: 0,number_of_reviews,review_scores_rating
36339,0,
72747,0,
20849,0,
25183,0,
17669,0,
...,...,...
22869,0,
37047,0,
30838,0,
73419,0,


In [20]:
df_test["review_scores_rating"] = df_test["review_scores_rating"].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["review_scores_rating"] = df_test["review_scores_rating"].fillna(0)


In [21]:
df_test["review_scores_rating"].unique()

array([ 94.,   0.,  99.,  96.,  85.,  92.,  90.,  91.,  98., 100.,  93.,
        80.,  89.,  97.,  65.,  79.,  95.,  73.,  86.,  77.,  88.,  87.,
        70.,  67.,  84.,  75.,  83.,  76.,  40.,  20.,  53.,  63.,  60.,
        82.,  78.,  81.,  68.,  71.,  58.,  50.,  74.,  62.,  72.,  69.,
        30.,  55.,  64.])

In [22]:
df_test["review_scores_rating"] = df_test["review_scores_rating"]/100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["review_scores_rating"] = df_test["review_scores_rating"]/100


In [23]:
df_test["review_scores_rating"].unique()

array([0.94, 0.  , 0.99, 0.96, 0.85, 0.92, 0.9 , 0.91, 0.98, 1.  , 0.93,
       0.8 , 0.89, 0.97, 0.65, 0.79, 0.95, 0.73, 0.86, 0.77, 0.88, 0.87,
       0.7 , 0.67, 0.84, 0.75, 0.83, 0.76, 0.4 , 0.2 , 0.53, 0.63, 0.6 ,
       0.82, 0.78, 0.81, 0.68, 0.71, 0.58, 0.5 , 0.74, 0.62, 0.72, 0.69,
       0.3 , 0.55, 0.64])

> #### 3. bedrooms

In [24]:
df_test.bedrooms.unique()

array([ 1.,  9.,  2.,  0.,  3.,  4.,  5., nan,  6.,  7., 10.,  8.])

In [25]:
df_test["bedrooms"].mean()

1.262235997981691

In [26]:
df_test["bedrooms"].mode()

0    1.0
dtype: float64

In [27]:
#df["bedrooms"] = df['bedrooms'].fillna(df["bedrooms"].mode())

In [28]:
df_test["bedrooms"] = df_test["bedrooms"].fillna(1.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["bedrooms"] = df_test["bedrooms"].fillna(1.0)


In [29]:
df_test.bedrooms.value_counts()

1.0     9413
2.0     2070
0.0     1259
3.0      787
4.0      259
5.0       83
6.0       14
7.0        6
8.0        3
9.0        1
10.0       1
Name: bedrooms, dtype: int64

> #### 4. Beds

In [30]:
df_test["beds"].unique()

array([ 1.,  9.,  2.,  3.,  4.,  5.,  7.,  6.,  8., nan, 12., 16., 10.,
       13., 15., 11.,  0.])

In [31]:
df_test["beds"].mean()

1.704384826193567

In [32]:
df_test["beds"].mode()

0    1.0
dtype: float64

In [33]:
df_test["beds"] = df_test["beds"].fillna(2.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["beds"] = df_test["beds"].fillna(2.0)


In [34]:
df_test["beds"].value_counts()

1.0     8493
2.0     3157
3.0     1168
4.0      582
5.0      239
6.0      136
7.0       38
8.0       37
10.0      17
9.0       11
11.0       4
12.0       4
16.0       4
0.0        2
15.0       2
13.0       2
Name: beds, dtype: int64

> #### 5. host has profile pic

In [35]:
df_test.host_has_profile_pic.unique()

array(['t', nan, 'f'], dtype=object)

In [36]:
df_test.host_has_profile_pic.value_counts()

t    13820
f       47
Name: host_has_profile_pic, dtype: int64

> #### 6. host_identity_verified

In [37]:
df_test.host_identity_verified.unique()

array(['f', 't', nan], dtype=object)

In [38]:
df_test.host_identity_verified.value_counts()

t    9354
f    4513
Name: host_identity_verified, dtype: int64

the columns host_has_profile_pic and host_identity_verified has boolean values true and false¶


In [39]:
df_test.replace(to_replace = "t", value = 1,inplace=True) 
df_test.replace(to_replace = "f", value = 0,inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [40]:
df_test["host_has_profile_pic"] = df_test["host_has_profile_pic"].astype("bool")
df_test["host_identity_verified"] = df_test["host_identity_verified"].astype("bool")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["host_has_profile_pic"] = df_test["host_has_profile_pic"].astype("bool")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["host_identity_verified"] = df_test["host_identity_verified"].astype("bool")


In [41]:
df_test.host_identity_verified.value_counts()

True     9383
False    4513
Name: host_identity_verified, dtype: int64

In [42]:
df_test.host_has_profile_pic.value_counts()

True     13849
False       47
Name: host_has_profile_pic, dtype: int64

### Changed the datatype to bool of these two columns:

In [43]:
#df["host_has_profile_pic"] = df["host_has_profile_pic"].astype("bool")
#df["host_identity_verified"] = df["host_identity_verified"].astype("bool")

> #### 7.host response rate

In [44]:
df_test.host_response_rate.unique()

array(['100%', nan, '70%', '50%', '80%', '96%', '93%', '89%', '97%',
       '90%', '99%', '20%', '83%', '0%', '94%', '92%', '67%', '33%',
       '75%', '81%', '84%', '85%', '91%', '82%', '95%', '40%', '60%',
       '71%', '57%', '72%', '98%', '86%', '25%', '88%', '78%', '30%',
       '35%', '56%', '69%', '87%', '68%', '54%', '43%', '53%', '55%',
       '29%', '52%', '79%', '36%', '76%', '74%', '65%', '73%', '58%',
       '10%', '63%', '38%', '15%', '41%', '27%', '17%', '14%', '77%',
       '26%', '22%'], dtype=object)

In [45]:
df_test.host_response_rate = df_test.host_response_rate.apply(
    lambda x: int(x[:len(x)-1])/100
     if isinstance(x,str) else x )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [46]:
df_test.host_response_rate.unique()

array([1.  ,  nan, 0.7 , 0.5 , 0.8 , 0.96, 0.93, 0.89, 0.97, 0.9 , 0.99,
       0.2 , 0.83, 0.  , 0.94, 0.92, 0.67, 0.33, 0.75, 0.81, 0.84, 0.85,
       0.91, 0.82, 0.95, 0.4 , 0.6 , 0.71, 0.57, 0.72, 0.98, 0.86, 0.25,
       0.88, 0.78, 0.3 , 0.35, 0.56, 0.69, 0.87, 0.68, 0.54, 0.43, 0.53,
       0.55, 0.29, 0.52, 0.79, 0.36, 0.76, 0.74, 0.65, 0.73, 0.58, 0.1 ,
       0.63, 0.38, 0.15, 0.41, 0.27, 0.17, 0.14, 0.77, 0.26, 0.22])

In [47]:
df_test.host_response_rate = df_test.host_response_rate.fillna(df_test.host_response_rate.mean())

In [48]:
df_test.host_response_rate.value_counts()

1.00000    8159
0.94516    3388
0.90000     414
0.80000     190
0.00000     149
           ... 
0.15000       1
0.17000       1
0.52000       1
0.36000       1
0.26000       1
Name: host_response_rate, Length: 65, dtype: int64

In [49]:
df_test.host_response_rate.unique()

array([1.        , 0.94515988, 0.7       , 0.5       , 0.8       ,
       0.96      , 0.93      , 0.89      , 0.97      , 0.9       ,
       0.99      , 0.2       , 0.83      , 0.        , 0.94      ,
       0.92      , 0.67      , 0.33      , 0.75      , 0.81      ,
       0.84      , 0.85      , 0.91      , 0.82      , 0.95      ,
       0.4       , 0.6       , 0.71      , 0.57      , 0.72      ,
       0.98      , 0.86      , 0.25      , 0.88      , 0.78      ,
       0.3       , 0.35      , 0.56      , 0.69      , 0.87      ,
       0.68      , 0.54      , 0.43      , 0.53      , 0.55      ,
       0.29      , 0.52      , 0.79      , 0.36      , 0.76      ,
       0.74      , 0.65      , 0.73      , 0.58      , 0.1       ,
       0.63      , 0.38      , 0.15      , 0.41      , 0.27      ,
       0.17      , 0.14      , 0.77      , 0.26      , 0.22      ])

> #### 8.drob

In [50]:
#id
#description
 ##first_review
 ##last_review
 ##host_since
#host_has_profile_pic
#name
#thumbnail_url
#zipcode
#neighbourhood

In [51]:
df_test = df_test.drop(['id','description','first_review','last_review','host_since',
                          'host_has_profile_pic','name','thumbnail_url','zipcode','neighbourhood',"amenities"],axis=1)
df_train.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

In [52]:
df_test.isna().sum()

log_price                 0
property_type             0
room_type                 0
accommodates              0
bathrooms                 0
bed_type                  0
cancellation_policy       0
cleaning_fee              0
city                      0
host_identity_verified    0
host_response_rate        0
instant_bookable          0
latitude                  0
longitude                 0
number_of_reviews         0
review_scores_rating      0
bedrooms                  0
beds                      0
dtype: int64

In [53]:
df_test.head()

Unnamed: 0,log_price,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,host_response_rate,instant_bookable,latitude,longitude,number_of_reviews,review_scores_rating,bedrooms,beds
4281,4.477337,Apartment,Private room,2,1.0,Real Bed,strict,True,Boston,False,1.0,0,42.342498,-71.080286,22,0.94,1.0,1.0
36339,7.377759,House,Entire home/apt,12,4.0,Real Bed,strict,False,SF,True,0.94516,0,37.772892,-122.426182,0,0.0,9.0,9.0
53913,4.976734,Apartment,Entire home/apt,3,1.0,Real Bed,moderate,True,NYC,True,0.94516,0,40.719425,-73.9565,8,0.94,1.0,2.0
72747,3.688879,House,Private room,1,1.0,Real Bed,flexible,False,LA,True,1.0,0,34.189818,-118.565122,0,0.0,1.0,1.0
69751,5.241747,Apartment,Entire home/apt,4,1.0,Real Bed,strict,True,NYC,True,0.7,0,40.721229,-73.949986,18,0.99,1.0,2.0


> #### 9.amenities

In [54]:
# df_train.isna().sum()

In [55]:
# df_train['amenities']

In [56]:
# amenities_col = []
# amenities_map = {}
# for s in df_train.amenities:
#     s = s.replace('{','')
#     s = s.replace('}','')
#     s = s.replace('"','')
#     s = s.split(',')
#     amenities_col.append(max(len(s)-1,0))
#     for k in s:
#         if amenities_map.get(k) != None:
#             amenities_map[k] +=1 
#     else:
#         amenities_map[k] = 1

In [57]:
#df_train['amenities_count'] = pd.Series(amenities_col)

In [58]:
#df_train['amenities_count'].isna().sum()

In [59]:
#df1['amenities_count'].unique()

In [60]:
#df1['amenities_count'].value_counts()

In [61]:
#df1 = df1.drop(['amenities'], axis=1)

In [62]:
#df1.head()

# 9.Factorization of categorical columns

### 1- room_type

In [63]:
df_test.room_type.unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [64]:
df_test.room_type.value_counts()

Entire home/apt    7690
Private room       5806
Shared room         400
Name: room_type, dtype: int64

In [65]:
#df1 = pd.concat([df1, pd.get_dummies(df1['room_type'], prefix='room_type')],axis=1)

In [66]:
#df1 = df1.drop(['room_type'],axis=1)

In [67]:
def room_type_cleaning(x):
    if x=='Entire home/apt':
        x= 3
    elif x=='Private room':
        x= 2
    elif x=='Shared room':
        x= 1
    return x
df_test['room_type']=df_test['room_type'].map(lambda x: room_type_cleaning(x))

In [68]:
#df1.room_type = df1.room_type.apply(lambda x: 3 if x=='Entire home/apt' else 2 if x=='Private room' else 1)

In [69]:
df_test.head()

Unnamed: 0,log_price,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_identity_verified,host_response_rate,instant_bookable,latitude,longitude,number_of_reviews,review_scores_rating,bedrooms,beds
4281,4.477337,Apartment,2,2,1.0,Real Bed,strict,True,Boston,False,1.0,0,42.342498,-71.080286,22,0.94,1.0,1.0
36339,7.377759,House,3,12,4.0,Real Bed,strict,False,SF,True,0.94516,0,37.772892,-122.426182,0,0.0,9.0,9.0
53913,4.976734,Apartment,3,3,1.0,Real Bed,moderate,True,NYC,True,0.94516,0,40.719425,-73.9565,8,0.94,1.0,2.0
72747,3.688879,House,2,1,1.0,Real Bed,flexible,False,LA,True,1.0,0,34.189818,-118.565122,0,0.0,1.0,1.0
69751,5.241747,Apartment,3,4,1.0,Real Bed,strict,True,NYC,True,0.7,0,40.721229,-73.949986,18,0.99,1.0,2.0


In [70]:
df_test.isna().sum()

log_price                 0
property_type             0
room_type                 0
accommodates              0
bathrooms                 0
bed_type                  0
cancellation_policy       0
cleaning_fee              0
city                      0
host_identity_verified    0
host_response_rate        0
instant_bookable          0
latitude                  0
longitude                 0
number_of_reviews         0
review_scores_rating      0
bedrooms                  0
beds                      0
dtype: int64

### 2- bed_type

In [71]:
df_test.bed_type.unique()

array(['Real Bed', 'Pull-out Sofa', 'Futon', 'Airbed', 'Couch'],
      dtype=object)

In [72]:
df_test.bed_type.value_counts()

Real Bed         13517
Futon              135
Pull-out Sofa       99
Airbed              94
Couch               51
Name: bed_type, dtype: int64

In [73]:
#df1.bed_type = df.bed_type.map(lambda x: 2 if x=='Real Bed' else 1)

In [74]:
df_test = pd.concat([df_train, pd.get_dummies(df_train['bed_type'], prefix='bed_type')],axis=1)

In [75]:
df_test = df_train.drop(['bed_type'],axis=1)

In [76]:
df_test.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,cancellation_policy,cleaning_fee,city,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
17236,11276421,4.941642,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",3,1.0,strict,True,Boston,...,42.328538,-71.110867,Stylish 1BR with Outdoor Pool & Fitness Center!,Jamaica Plain,0,,https://a0.muscache.com/im/pictures/3aa3bfda-b...,2130,1.0,1.0
58369,1634003,4.382027,Apartment,Private room,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",1,1.0,flexible,True,LA,...,34.033205,-118.447228,California Charisma,West Los Angeles,3,93.0,https://a0.muscache.com/im/pictures/068cc064-b...,90064,1.0,1.0
44240,15220486,5.293305,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",8,1.0,strict,True,Chicago,...,41.947665,-87.6565,3 bed Wrigleyville Flat Sleeps 8!,Wrigleyville,70,95.0,,60613,3.0,4.0
57759,8833630,4.060443,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",4,2.5,strict,True,Chicago,...,41.882181,-87.708795,Fort Washington Garden Room,,94,95.0,https://a0.muscache.com/im/pictures/d0704ecc-b...,60624,1.0,2.0
40254,3571986,4.174387,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",4,1.0,moderate,True,LA,...,33.928853,-118.375224,LAX Casa de Paz - Tranquility Room,Hawthorne,68,91.0,https://a0.muscache.com/im/pictures/bf6a2568-2...,90304,1.0,2.0


### 3- city

In [77]:
df_test['city'].unique()

array(['Boston', 'LA', 'Chicago', 'NYC', 'SF', 'DC'], dtype=object)

In [78]:
df_test['city'].value_counts()

NYC        18296
LA         12554
SF          3590
DC          3176
Chicago     2093
Boston      1978
Name: city, dtype: int64

In [79]:
df_test = pd.concat([df_test, pd.get_dummies(df_test['city'], prefix='city')],axis=1)

In [80]:
df_test = df_test.drop(['city'],axis=1)

In [81]:
df_test.isna().sum()

id                            0
log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                   116
cancellation_policy           0
cleaning_fee                  0
description                   0
first_review               8990
host_has_profile_pic        114
host_identity_verified      114
host_response_rate        10295
host_since                  114
instant_bookable              0
last_review                8970
latitude                      0
longitude                     0
name                          0
neighbourhood              3889
number_of_reviews             0
review_scores_rating       9439
thumbnail_url              4559
zipcode                     527
bedrooms                     43
beds                         67
city_Boston                   0
city_Chicago                  0
city_DC                       0
city_LA                       0
city_NYC

## 4- property_type

In [82]:
df_test.property_type.unique()

array(['Apartment', 'House', 'Condominium', 'Loft', 'Townhouse',
       'Camper/RV', 'Cabin', 'Boutique hotel', 'Bed & Breakfast',
       'Guesthouse', 'Bungalow', 'Other', 'Villa', 'Boat', 'Hostel',
       'Guest suite', 'Timeshare', 'Castle', 'In-law', 'Dorm', 'Yurt',
       'Vacation home', 'Tent', 'Hut', 'Serviced apartment',
       'Earth House', 'Treehouse', 'Lighthouse', 'Train', 'Island',
       'Chalet', 'Cave'], dtype=object)

In [83]:
df_test.property_type.value_counts()

Apartment             27503
House                  9320
Condominium            1519
Townhouse               985
Loft                    673
Other                   360
Guesthouse              279
Bed & Breakfast         270
Bungalow                201
Villa                   100
Dorm                     73
Guest suite              70
Camper/RV                53
Timeshare                44
Boat                     39
Cabin                    37
In-law                   34
Boutique hotel           34
Hostel                   31
Serviced apartment       12
Tent                     10
Castle                    8
Vacation home             6
Yurt                      5
Earth House               4
Hut                       4
Treehouse                 4
Chalet                    3
Train                     2
Cave                      2
Lighthouse                1
Island                    1
Name: property_type, dtype: int64

In [95]:
nbh_counts = df_test.property_type.value_counts()

In [103]:

other_nbhs = list(nbh_counts[nbh_counts <= 600].index)
df_test['property_type'] = df_test['property_type'].replace(other_nbhs,"Other")

In [104]:
df_test['property_type'].unique()

array(['Apartment', 'House', 'Condominium', 'Other'], dtype=object)

In [105]:
df_test['property_type'].value_counts()

Apartment      27503
House           9320
Other           3345
Condominium     1519
Name: property_type, dtype: int64

In [106]:
df_test = pd.concat([df_test, pd.get_dummies(df_test['property_type'], prefix='property_type')],axis=1)

In [107]:
df_test.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,cancellation_policy,cleaning_fee,description,...,city_Boston,city_Chicago,city_DC,city_LA,city_NYC,city_SF,property_type_Apartment,property_type_Condominium,property_type_House,property_type_Other
17236,11276421,4.941642,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",3,1.0,3,1,Enjoy a beautiful contemporary residence with ...,...,1,0,0,0,0,0,1,0,0,0
58369,1634003,4.382027,Apartment,Private room,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",1,1.0,1,1,"My place is close to Don Antonio's, Metro Expo...",...,0,0,0,1,0,0,1,0,0,0
44240,15220486,5.293305,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",8,1.0,3,1,Stay one block from Wrigley Field! This renova...,...,0,1,0,0,0,0,1,0,0,0
57759,8833630,4.060443,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",4,2.5,3,1,This room is in the lower level of Fort Washin...,...,0,1,0,0,0,0,0,0,1,0
40254,3571986,4.174387,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",4,1.0,2,1,Welcome to your home away from home! Cozy priv...,...,0,0,0,1,0,0,0,0,1,0


In [108]:
df_test = df_test.drop(['property_type'],axis=1)

In [None]:

#property_type_dict1 = {'Apartment':['Condominium','Loft','Serviced apartment','Guest suite'],
# 'House':['Vacation home','Villa','Townhouse','In-law','Casa particular'],
#    'Hotel1':['Dorm','Hostel','Guesthouse'],
#     'Hotel2':['Boutique hotel','Bed & Breakfast'],
#     'Timeshare':['Timeshare'],
#    'Other':['Island','Castle','Yurt','Hut','Chalet','Treehouse',
#            'Earth House','Tipi','Cave','Train','Parking Space','Lighthouse',
#          'Tent','Boat','Cabin','Camper/RV','Bungalow']
# }

#property_type_dict2 = {i : k for k, v in property_type_dict1.items() for i in v}

#df1['property_group'] = df1['property_type'].replace(property_type_dict2)

#df1.drop('property_type',axis=1,inplace=True)



In [None]:
#df1['property_group']

In [None]:
#df1 = pd.concat([df1, pd.get_dummies(df1['property_group'], prefix='property_group')],axis=1)

In [None]:
#df1 = df1.drop(['property_group'],axis=1)

In [None]:
#df1

## 5-cancellation_policy

In [86]:
df_test['cancellation_policy'].unique()

array(['strict', 'flexible', 'moderate', 'super_strict_30',
       'super_strict_60'], dtype=object)

In [87]:
df_test['cancellation_policy'].value_counts()

strict             18226
flexible           12748
moderate           10645
super_strict_30       57
super_strict_60       11
Name: cancellation_policy, dtype: int64

In [88]:
def cancellation_policy_cleaning(x):
    if x=='super_strict_60':
        x= 5
    elif x=='super_strict_30':
        x= 4
    elif x=='strict':
        x= 3
    elif x=='moderate':
        x= 2
    elif x=='flexible':
        x= 1     
    return x
df_test['cancellation_policy']=df_test['cancellation_policy'].map(lambda x: cancellation_policy_cleaning(x))

In [89]:
df_test['cancellation_policy'].value_counts()

3    18226
1    12748
2    10645
4       57
5       11
Name: cancellation_policy, dtype: int64

In [90]:
df_test.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,cancellation_policy,cleaning_fee,description,...,thumbnail_url,zipcode,bedrooms,beds,city_Boston,city_Chicago,city_DC,city_LA,city_NYC,city_SF
17236,11276421,4.941642,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",3,1.0,3,True,Enjoy a beautiful contemporary residence with ...,...,https://a0.muscache.com/im/pictures/3aa3bfda-b...,2130,1.0,1.0,1,0,0,0,0,0
58369,1634003,4.382027,Apartment,Private room,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",1,1.0,1,True,"My place is close to Don Antonio's, Metro Expo...",...,https://a0.muscache.com/im/pictures/068cc064-b...,90064,1.0,1.0,0,0,0,1,0,0
44240,15220486,5.293305,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",8,1.0,3,True,Stay one block from Wrigley Field! This renova...,...,,60613,3.0,4.0,0,1,0,0,0,0
57759,8833630,4.060443,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",4,2.5,3,True,This room is in the lower level of Fort Washin...,...,https://a0.muscache.com/im/pictures/d0704ecc-b...,60624,1.0,2.0,0,1,0,0,0,0
40254,3571986,4.174387,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",4,1.0,2,True,Welcome to your home away from home! Cozy priv...,...,https://a0.muscache.com/im/pictures/bf6a2568-2...,90304,1.0,2.0,0,0,0,1,0,0


## 6-cleaning_fee and host_identity_verified

In [91]:
df_test.replace(to_replace = True , value = 1,inplace=True) 
df_test.replace(to_replace = False , value = 0,inplace=True)

In [92]:
df_test.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,cancellation_policy,cleaning_fee,description,...,thumbnail_url,zipcode,bedrooms,beds,city_Boston,city_Chicago,city_DC,city_LA,city_NYC,city_SF
17236,11276421,4.941642,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",3,1.0,3,1,Enjoy a beautiful contemporary residence with ...,...,https://a0.muscache.com/im/pictures/3aa3bfda-b...,2130,1.0,1.0,1,0,0,0,0,0
58369,1634003,4.382027,Apartment,Private room,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",1,1.0,1,1,"My place is close to Don Antonio's, Metro Expo...",...,https://a0.muscache.com/im/pictures/068cc064-b...,90064,1.0,1.0,0,0,0,1,0,0
44240,15220486,5.293305,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",8,1.0,3,1,Stay one block from Wrigley Field! This renova...,...,,60613,3.0,4.0,0,1,0,0,0,0
57759,8833630,4.060443,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",4,2.5,3,1,This room is in the lower level of Fort Washin...,...,https://a0.muscache.com/im/pictures/d0704ecc-b...,60624,1.0,2.0,0,1,0,0,0,0
40254,3571986,4.174387,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",4,1.0,2,1,Welcome to your home away from home! Cozy priv...,...,https://a0.muscache.com/im/pictures/bf6a2568-2...,90304,1.0,2.0,0,0,0,1,0,0


In [93]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41687 entries, 17236 to 68913
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      41687 non-null  int64  
 1   log_price               41687 non-null  float64
 2   property_type           41687 non-null  object 
 3   room_type               41687 non-null  object 
 4   amenities               41687 non-null  object 
 5   accommodates            41687 non-null  int64  
 6   bathrooms               41571 non-null  float64
 7   cancellation_policy     41687 non-null  int64  
 8   cleaning_fee            41687 non-null  int64  
 9   description             41687 non-null  object 
 10  first_review            32697 non-null  object 
 11  host_has_profile_pic    41573 non-null  object 
 12  host_identity_verified  41573 non-null  object 
 13  host_response_rate      31392 non-null  object 
 14  host_since              41573 non-

# Validation Method: Train / Validation / Test

In [None]:
#df_train.head()

In [None]:
#from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
#X, y = df1.drop(['log_price'],axis=1), df1['log_price']

In [None]:
#X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=10)

In [None]:
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)

# Model Buildig 

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso #ordinary linear regression + w/ ridge regularization
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy 
import scipy.stats as stats

In [None]:
X_train, y_train = df_train.drop(['log_price'],axis=1), df_train['log_price']

In [None]:
# fit model 
model = sm.OLS(y_train, X_train, data=df_train)

results = model.fit()

# summarize our model
results.summary()

In [None]:
from sklearn import metrics

def print_evaluate(true,predicted):
    mae = metrics.mean_absolute_error(true,predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true,predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('\nMAE:', mae)
    
    print('\nMSE:', mse)
    print('\nRMSE:', rmse)
    print('\nR2 Square', r2_square)

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression(normalize=True)
lin_reg.fit(X_train,y_train)

In [None]:
print("Error/ Accuracy Analysis :- ")
print_evaluate(y_train,lin_reg.predict(X_train))

In [None]:
#sns.jointplot(x=linear,y=y_train, kind='reg')

In [None]:
# stats.probplot(df1['log_price'], dist="norm", plot=plt)
# plt.title("Normal Q-Q plot")
# plt.show()

In [None]:
#set up the 3 models we're choosing from:

lm = LinearRegression()

#Feature scaling for train, val, and test so that we can run our ridge model on each
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
#X_val_scaled = scaler.transform(X_val.values)

lm_reg = Ridge(alpha=1)

#Feature transforms for train, val, and test so that we can run our poly model on each
poly = PolynomialFeatures(degree=2) 

X_train_poly = poly.fit_transform(X_train.values)
#X_val_poly = poly.transform(X_val.values)


lm_poly = LinearRegression()

In [None]:
# lm.fit(X_train, y_train)
# print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')

# lm_reg.fit(X_train_scaled, y_train)
# print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}')

# lm_poly.fit(X_train_poly, y_train)
# print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val):.3f}')

In [None]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_model.score(X_train,y_train)

In [None]:
lr_model_ridge = Ridge(alpha = 100000000)
lr_model_ridge.fit(X_train, y_train)

list(zip(X_train, lr_model_ridge.coef_))

In [None]:
lm_reg.fit(X_train, y_train)
print(f'Ridge Regression val R^2: {lm_reg.score(X_train, y_train):.3f}')

