In [31]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler,OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

In [2]:
# Read the Chicago Airbnb listings CSV (path is relative to the working directory).
airbnb_df = pd.read_csv(filepath_or_buffer='Chicago_Airbnb_Listing_Dataset.csv')
# Preview the first rows to inspect the columns and raw value formats.
airbnb_df.head(n=5)


Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm
0,2384,Hyde Park,41.7879,-87.5878,Private room,1,1 shared bath,1.0,1.0,$95.00,3,89,3,3,1125,1125,3.0,1125.0
1,7126,West Town,41.90166,-87.68021,Entire home/apt,2,1 bath,1.0,1.0,$65.00,2,60,2,2,1125,1125,2.0,1125.0
2,10945,Lincoln Park,41.91196,-87.63981,Entire home/apt,4,1 bath,2.0,2.0,$127.00,4,180,4,4,180,180,4.0,180.0
3,12140,Lincoln Park,41.92357,-87.64947,Private room,2,1 private bath,1.0,2.0,$329.00,2,7,2,2,7,7,2.0,7.0
4,22362,West Town,41.8973,-87.65889,Entire home/apt,4,2 baths,2.0,2.0,$105.00,121,1125,121,121,1125,1125,121.0,1125.0


In [3]:
# Convert price strings such as "$95.00" into floats.
# regex=False treats '$' and ',' as literal characters: as a regular expression
# '$' is the end-of-string anchor, and pandas releases before 2.0 emit a
# FutureWarning for that ambiguous single-character pattern.
airbnb_df['price'] = (
    airbnb_df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

  """Entry point for launching an IPython kernel.


In [4]:
# Discard listings priced above 1000; rows at or below 1000 are kept.
airbnb_df = airbnb_df.drop(index=airbnb_df.loc[airbnb_df['price'] > 1000].index)

In [5]:
# Re-inspect the first rows: price should now be numeric.
airbnb_df.head(n=5)

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm
0,2384,Hyde Park,41.7879,-87.5878,Private room,1,1 shared bath,1.0,1.0,95.0,3,89,3,3,1125,1125,3.0,1125.0
1,7126,West Town,41.90166,-87.68021,Entire home/apt,2,1 bath,1.0,1.0,65.0,2,60,2,2,1125,1125,2.0,1125.0
2,10945,Lincoln Park,41.91196,-87.63981,Entire home/apt,4,1 bath,2.0,2.0,127.0,4,180,4,4,180,180,4.0,180.0
3,12140,Lincoln Park,41.92357,-87.64947,Private room,2,1 private bath,1.0,2.0,329.0,2,7,2,2,7,7,2.0,7.0
4,22362,West Town,41.8973,-87.65889,Entire home/apt,4,2 baths,2.0,2.0,105.0,121,1125,121,121,1125,1125,121.0,1125.0


In [6]:
# Number of listings per neighbourhood (value_counts orders by descending count).
neighbourhoods = airbnb_df.neighbourhood_cleansed.value_counts()
neighbourhoods

Near North Side    749
West Town          676
Lake View          553
Near West Side     402
Logan Square       391
                  ... 
Edison Park          1
Avalon Park          1
Burnside             1
Gage Park            1
West Elsdon          1
Name: neighbourhood_cleansed, Length: 76, dtype: int64

In [7]:
# Neighbourhood names in the order produced by value_counts, plus one
# consecutive integer code (0, 1, 2, ...) per name.
neighbourhood_list = neighbourhoods.index
neighbourhood_num = np.arange(neighbourhood_list.size)

In [8]:
# Lookup table: neighbourhood name -> integer code.
neighborhood_dict = {
    name: code for name, code in zip(neighbourhood_list, neighbourhood_num)
}
neighborhood_dict

{'Near North Side': 0,
 'West Town': 1,
 'Lake View': 2,
 'Near West Side': 3,
 'Logan Square': 4,
 'Loop': 5,
 'Lincoln Park': 6,
 'Uptown': 7,
 'Near South Side': 8,
 'Lower West Side': 9,
 'Irving Park': 10,
 'Edgewater': 11,
 'Bridgeport': 12,
 'Avondale': 13,
 'Rogers Park': 14,
 'North Center': 15,
 'East Garfield Park': 16,
 'Grand Boulevard': 17,
 'Hyde Park': 18,
 'Lincoln Square': 19,
 'South Shore': 20,
 'Humboldt Park': 21,
 'West Ridge': 22,
 'Woodlawn': 23,
 'Portage Park': 24,
 'Douglas': 25,
 'Albany Park': 26,
 'Kenwood': 27,
 'Armour Square': 28,
 'Austin': 29,
 'Greater Grand Crossing': 30,
 'South Lawndale': 31,
 'Jefferson Park': 32,
 'Washington Park': 33,
 'North Lawndale': 34,
 'Mckinley Park': 35,
 'Norwood Park': 36,
 'Dunning': 37,
 'Calumet Heights': 38,
 'South Chicago': 39,
 'West Garfield Park': 40,
 'Belmont Cragin': 41,
 'Brighton Park': 42,
 'North Park': 43,
 'Hermosa': 44,
 'Englewood': 45,
 'New City': 46,
 'Oakland': 47,
 'Garfield Ridge': 48,
 'Ch

In [9]:
# Replace every neighbourhood name with its integer code from neighborhood_dict.
# NOTE: re-running only this cell raises KeyError, because the column then
# holds codes rather than the names used as dictionary keys.
airbnb_df['neighbourhood_cleansed'] = airbnb_df['neighbourhood_cleansed'].map(
    lambda name: neighborhood_dict[name]
)
airbnb_df.head(n=5)

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm
0,2384,18,41.7879,-87.5878,Private room,1,1 shared bath,1.0,1.0,95.0,3,89,3,3,1125,1125,3.0,1125.0
1,7126,1,41.90166,-87.68021,Entire home/apt,2,1 bath,1.0,1.0,65.0,2,60,2,2,1125,1125,2.0,1125.0
2,10945,6,41.91196,-87.63981,Entire home/apt,4,1 bath,2.0,2.0,127.0,4,180,4,4,180,180,4.0,180.0
3,12140,6,41.92357,-87.64947,Private room,2,1 private bath,1.0,2.0,329.0,2,7,2,2,7,7,2.0,7.0
4,22362,1,41.8973,-87.65889,Entire home/apt,4,2 baths,2.0,2.0,105.0,121,1125,121,121,1125,1125,121.0,1125.0


In [10]:
# List the distinct free-text bathroom descriptions and how often each occurs,
# to see which formats the numeric parsing has to handle.
airbnb_df.bathrooms_text.value_counts()

1 bath               3296
2 baths               968
1 shared bath         738
1 private bath        451
2 shared baths        202
1.5 baths             187
2.5 baths             157
1.5 shared baths      108
3 baths               108
3.5 baths              66
4 baths                36
3 shared baths         33
2.5 shared baths       32
4 shared baths         19
4.5 baths              15
0 shared baths          9
0 baths                 5
6 baths                 5
11 shared baths         4
5 baths                 4
Private half-bath       3
Shared half-bath        3
7 baths                 2
Half-bath               2
11.5 shared baths       1
6.5 baths               1
8 shared baths          1
12.5 baths              1
Name: bathrooms_text, dtype: int64

In [11]:
# Reduce the free-text bathroom description to a numeric string.
# "half-bath" entries contain no digit, so the literal '-bath' is rewritten to
# '0.5' first (e.g. "Shared half-bath" -> "Shared half0.5"); the extraction
# then picks up the first decimal or integer number in the text.
# The pattern is a raw string so that \d and \. are not treated as (invalid)
# Python string escapes.
airbnb_df['bathrooms_text'] = (
    airbnb_df['bathrooms_text']
    .str.replace('-bath', '0.5', regex=False)
    .str.extract(r'(\d*\.\d+|\d+)', expand=False)
)


In [12]:
# Cast the extracted bathroom counts to float, then drop every row that still
# has a missing value in any column.
airbnb_df = airbnb_df.astype({'bathrooms_text': float}).dropna()

In [13]:
# One-hot encode the remaining non-numeric columns; numeric columns pass
# through unchanged.
working_df = pd.get_dummies(data=airbnb_df)
working_df

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,...,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,2384,18,41.787900,-87.587800,1,1.0,1.0,1.0,95.0,3,...,3,3,1125,1125,3.0,1125.0,0,0,1,0
1,7126,1,41.901660,-87.680210,2,1.0,1.0,1.0,65.0,2,...,2,2,1125,1125,2.0,1125.0,1,0,0,0
2,10945,6,41.911960,-87.639810,4,1.0,2.0,2.0,127.0,4,...,4,4,180,180,4.0,180.0,1,0,0,0
3,12140,6,41.923570,-87.649470,2,1.0,1.0,2.0,329.0,2,...,2,2,7,7,2.0,7.0,0,0,1,0
4,22362,1,41.897300,-87.658890,4,2.0,2.0,2.0,105.0,121,...,121,121,1125,1125,121.0,1125.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6539,53812963,1,41.913940,-87.672440,2,1.0,1.0,1.0,53.0,32,...,32,32,1125,1125,32.0,1125.0,1,0,0,0
6540,53816167,1,41.907590,-87.688860,6,1.0,3.0,3.0,105.0,2,...,2,2,365,365,2.0,365.0,1,0,0,0
6541,53820517,2,41.933915,-87.639459,6,2.0,2.0,2.0,150.0,3,...,3,3,1125,1125,3.0,1125.0,1,0,0,0
6542,53820855,2,41.935920,-87.640590,6,2.0,2.0,2.0,150.0,32,...,32,32,1125,1125,32.0,1125.0,1,0,0,0


In [14]:
# Target: the nightly price. Features: every other column.
# NOTE(review): the listing `id` stays among the features; it looks like a pure
# identifier rather than a predictor - confirm whether it should be dropped.
y = working_df['price']
X = working_df.drop(columns='price')
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5739 entries, 0 to 6543
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         5739 non-null   int64  
 1   neighbourhood_cleansed     5739 non-null   int32  
 2   latitude                   5739 non-null   float64
 3   longitude                  5739 non-null   float64
 4   accommodates               5739 non-null   int64  
 5   bathrooms_text             5739 non-null   float64
 6   bedrooms                   5739 non-null   float64
 7   beds                       5739 non-null   float64
 8   minimum_nights             5739 non-null   int64  
 9   maximum_nights             5739 non-null   int64  
 10  minimum_minimum_nights     5739 non-null   int64  
 11  maximum_minimum_nights     5739 non-null   int64  
 12  minimum_maximum_nights     5739 non-null   int64  
 13  maximum_maximum_nights     5739 non-null   int64

In [42]:
# Hold out a test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the standardizer on the training features only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [34]:
# Random forest regressor: 100 trees of depth at most 3, seeded for
# reproducible results.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=3,
    random_state=42,
)

In [36]:
# Fit the random forest on the standardized training features.
model.fit(X_train_scaled, y_train)

# Predict prices for the held-out test split.
y_test_preds = model.predict(X_test_scaled)

# Score the predictions with the coefficient of determination (R^2).
rsquared_score = r2_score(y_test, y_test_preds)
print('R squared = ', rsquared_score)

R sequared =  0.47060512741323735


In [37]:
# Derive the input width from the training features so the network stays in
# sync with the feature columns instead of relying on a hard-coded count.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input dimension).
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer.
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single linear unit. The target is a price in currency units,
# so a sigmoid (bounded to the interval (0, 1)) could never reach it.
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model.
nn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 80)                1680      
                                                                 
 dense_13 (Dense)            (None, 30)                2430      
                                                                 
 dense_14 (Dense)            (None, 1)                 31        
                                                                 
Total params: 4,141
Trainable params: 4,141
Non-trainable params: 0
_________________________________________________________________


In [38]:
# The target (price) is continuous, so optimize mean squared error and report
# mean absolute error; cross-entropy and accuracy only make sense for
# classification labels.
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"])

In [40]:
# Train on the standardized features: the raw columns span very different
# magnitudes (e.g. listing ids vs. bedroom counts), which hampers
# gradient-based training.
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100


Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
