In [1]:
import numpy as np
import pandas as pd
import psycopg2 as pg2

import os
from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVR
from sklearn.metrics import r2_score

## Obtaining the Data From Postgres

In [2]:
# Read the Postgres connection settings from the environment (load_dotenv() pulls in
# a local .env file if one exists) so that no credentials are written in the notebook.
load_dotenv()

# Columns are returned in exactly this order; later cells label them by position.
LISTINGS_QUERY = '''
    SELECT
        neighborhood_group,
        neighborhood,
        room_type,
        minimum_nights,
        bedrooms,
        bathrooms,
        private_bath,
        accommodates,
        price
            FROM nyc_airbnb
'''

conn = pg2.connect(
    database = os.getenv('POSTGRES_DATABASE'),
    user = os.getenv('POSTGRES_USER'),
    password = os.getenv('POSTGRES_PASSWORD')
)

# try/finally guarantees the connection is released even when the query fails;
# the cursor context manager closes the cursor as soon as the rows are fetched.
try:
    with conn.cursor() as cursor:
        cursor.execute(LISTINGS_QUERY)
        result = cursor.fetchall()
finally:
    conn.close()

In [3]:
# Build the frame straight from the fetched rows; the columns are labelled by
# position (0, 1, 2, ...) until they are renamed.
df = pd.DataFrame(result)

In [4]:
# Give the positional columns readable names, in place.
# NOTE(review): the query selects neighborhood_group before neighborhood, while
# positions 0 and 1 are labelled the other way round here. Later cells filter on
# these labels, so confirm the intended naming before changing either side.
column_names = [
    'neighborhood',
    'neighborhood_group',
    'room_type',
    'minimum_nights',
    'bedrooms',
    'bathrooms',
    'private_bath',
    'accommodates',
    'price',
]
df.rename(columns = dict(enumerate(column_names)), inplace = True)

In [5]:
# Preview the first five rows to check the column labels against the values.
df.head(n = 5)

Unnamed: 0,neighborhood,neighborhood_group,room_type,minimum_nights,bedrooms,bathrooms,private_bath,accommodates,price
0,Manhattan,Midtown,Entire home/apt,30,0.0,1.0,1,1,150
1,Brooklyn,Bedford-Stuyvesant,Entire home/apt,1,1.0,1.0,1,3,75
2,Brooklyn,Bedford-Stuyvesant,Private room,30,1.0,,0,2,60
3,Brooklyn,Sunset Park,Entire home/apt,5,2.0,1.5,1,4,275
4,Manhattan,Midtown,Private room,2,1.0,1.0,1,2,61


## Cleaning the Data

In [6]:
# Keep only the Prospect Heights (Brooklyn) listings.
# .copy() makes the subset an independent frame, so modifying it later does not
# trigger pandas' SettingWithCopyWarning.

neighborhood_df = df.loc[
    df['neighborhood'].eq('Brooklyn')
    & df['neighborhood_group'].eq('Prospect Heights')
].copy()

In [7]:
# Count how many listings have a missing (True) versus known (False) bathroom count.
neighborhood_df['bathrooms'].isnull().value_counts()

False    218
Name: bathrooms, dtype: int64

In [8]:
# Assume that every listing has at least one bathroom, so a missing count becomes 1.
# The filled column is assigned back rather than calling fillna(inplace = True) on
# the column selection: that form is chained assignment, and under pandas
# Copy-on-Write it modifies a temporary and leaves the frame untouched.

neighborhood_df['bathrooms'] = neighborhood_df['bathrooms'].fillna(value = 1)

In [9]:
# Confirm that no missing bathroom counts remain after filling.
neighborhood_df['bathrooms'].isnull().value_counts()

False    218
Name: bathrooms, dtype: int64

In [10]:
# Listings with a price of 0 are dropped; only rows with a non-zero price are kept.

neighborhood_df = neighborhood_df.loc[neighborhood_df['price'].ne(0)]

In [11]:
# Tally the listings per room type to see which categories are present.
neighborhood_df.loc[:, 'room_type'].value_counts()

Entire home/apt    139
Private room        79
Name: room_type, dtype: int64

In [12]:
# Hotel rooms are out of scope for this analysis, so any such rows are filtered out.

neighborhood_df = neighborhood_df.loc[neighborhood_df['room_type'].ne('Hotel room')]

## Preparing the Data for ML

In [13]:
# Features (X): room_type, minimum_nights, bedrooms, bathrooms, private_bath, accommodates.
# Target (y): price.
# These are the same columns the positional slice [2:-1] / [-1] picks out of the
# nine-column frame, selected by name instead.

feature_columns = [
    'room_type',
    'minimum_nights',
    'bedrooms',
    'bathrooms',
    'private_bath',
    'accommodates',
]
X = neighborhood_df[feature_columns].values
y = neighborhood_df['price'].values

In [14]:
# Display the feature matrix before encoding; room_type is still a string in column 0.
X

array([['Entire home/apt', 30, 4.0, 2.0, 1, 8],
       ['Private room', 30, 1.0, 1.5, 0, 2],
       ['Entire home/apt', 30, 2.0, 1.0, 1, 4],
       ...,
       ['Entire home/apt', 7, 0.0, 1.0, 1, 2],
       ['Entire home/apt', 7, 0.0, 1.0, 1, 2],
       ['Private room', 90, 1.0, 1.0, 0, 1]], dtype=object)

In [15]:
# OneHotEncoder gives each room_type its own column holding a 1 or a 0.
# remainder = 'passthrough' keeps the other columns, placed after the encoded ones.
# X is overwritten here, so re-running this cell would encode the first column again.

room_type_encoder = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(
    transformers = [room_type_encoder],
    remainder = 'passthrough',
)
X = np.array(ct.fit_transform(X))

In [16]:
# Display the encoded feature matrix; the leading columns are the room_type indicators.
X

array([[1.0, 0.0, 30, ..., 2.0, 1, 8],
       [0.0, 1.0, 30, ..., 1.5, 0, 2],
       [1.0, 0.0, 30, ..., 1.0, 1, 4],
       ...,
       [1.0, 0.0, 7, ..., 1.0, 1, 2],
       [1.0, 0.0, 7, ..., 1.0, 1, 2],
       [0.0, 1.0, 90, ..., 1.0, 0, 1]], dtype=object)

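[1, 0] is 'Entire home/apt'  
[0, 1] is 'Private room'  
Only these two room types occur in this neighborhood (see the `room_type` value counts above), so the encoder produces two indicator columns; there is no third column for 'Shared room'.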

In [17]:
# Hold out 20 percent of the observations as the test set.
# A fixed random_state seeds the shuffle, so the split is the same on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 1,
    test_size = 0.2,
)

In [18]:
# StandardScaler.fit_transform() works on 2D input, so turn y_train into a single column.

y_train = y_train.reshape(-1, 1)

In [19]:
# X and y each get their own StandardScaler: a fitted scaler remembers the mean and
# standard deviation of the data it was fitted on, and the features and the price
# have different ones. sc_y is kept so predictions can be converted back to prices.

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)

sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)

## Running the SVR Model

In [20]:
# A linear kernel is used, following the scikit-learn estimator map:
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
# regressor.fit expects a 1D target, so the scaled column vector is flattened first.

regressor = SVR(kernel = 'linear')
regressor.fit(X_train, y_train.ravel())

SVR(kernel='linear')

## Predicting the Test Set Results

In [21]:
# The model was trained on scaled data, so X_test is scaled with the scaler fitted
# on the training features before predicting.
scaled_pred = regressor.predict(sc_X.transform(X_test))

# predict() returns a 1D array, but StandardScaler.inverse_transform requires 2D
# input: reshape to one column to undo the target scaling, then flatten back so
# y_pred stays 1D like y_test.
y_pred = sc_y.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

# Print predicted and actual prices side by side, rounded to two decimals.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[127.25 104.  ]
 [445.61 550.  ]
 [242.68 200.  ]
 [137.37 180.  ]
 [151.82 185.  ]
 [156.16 143.  ]
 [118.79 129.  ]
 [158.31 170.  ]
 [130.8   75.  ]
 [133.61 175.  ]
 [137.37  67.  ]
 [420.91 458.  ]
 [ 67.7   85.  ]
 [196.25  50.  ]
 [ 92.88 132.  ]
 [320.93 450.  ]
 [118.   120.  ]
 [193.07 165.  ]
 [152.97 241.  ]
 [117.22  96.  ]
 [157.52  95.  ]
 [200.1  225.  ]
 [ 92.3   90.  ]
 [118.   123.  ]
 [117.43 100.  ]
 [118.   125.  ]
 [118.   100.  ]
 [ 56.14  50.  ]
 [118.   105.  ]
 [200.1  150.  ]
 [223.32 200.  ]
 [234.98 149.  ]
 [117.22 129.  ]
 [118.    55.  ]
 [156.73 120.  ]
 [137.37 130.  ]
 [156.73 102.  ]
 [321.72 140.  ]
 [ 71.46  79.  ]
 [130.51 180.  ]
 [199.31 120.  ]
 [118.   195.  ]
 [114.04  60.  ]
 [ 71.46  65.  ]]


## Evaluating the Model Performance

In [22]:
# Coefficient of determination of the predicted prices against the actual test prices.
r2_score(y_true = y_test, y_pred = y_pred)

0.672241052428816