#### Creating a classification model that will predict if an airbnb listing is in manhattan.

In [51]:
## Imports

import pandas as pd
import matplotlib as plt
import numpy as np

In [52]:
# Read from the csv
df = pd.read_csv("Airbnb_NYC.csv")

In [53]:
# Data Preprocessing / Cleaning 
# Removing Long, Lat and review30d features 

df = df.dropna()
df = df.drop(labels=["Latitude", "Longitude", "Reviews30d"], axis=1)
df.head

<bound method NDFrame.head of             Boroughs    Prop_Type  Min_Nights  Host_Listing_Cnt  \
0              Bronx  PrivateRoom        21.0               2.0   
1              Bronx  PrivateRoom         3.0               1.0   
2              Bronx  PrivateRoom         3.0               1.0   
3              Bronx  PrivateRoom        21.0               2.0   
4              Bronx   EntireHome         2.0               3.0   
...              ...          ...         ...               ...   
38728  Staten Island   EntireHome         2.0               1.0   
38729  Staten Island  PrivateRoom        20.0               1.0   
38730  Staten Island  PrivateRoom        30.0               4.0   
38731  Staten Island   EntireHome         3.0               1.0   
38732  Staten Island   EntireHome         1.0               1.0   

       Days_Available  Review_Cnt  Price  
0                 291          19   60.0  
1                   0          59   49.0  
2                 354           3   

#### Encoding and Preparing Training and Test set

In [54]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Encoding the dependent variable
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
y = [ 1 if i == 'Manhattan' else 0 for i in y ]

# Encoding categorical data feature Prop_Type
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

# Splitting the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
sc = StandardScaler()
X[:, 2:] = sc.fit_transform(X[:, 2:])

print(X_train[0])
print(X_test[0])

[0.0 1.0 -0.15025461655108763 -0.41308268085320043 0.24229453689419658
 1.5742793452533472 -0.5675442324963159 0.20513977488941948]
[0.0 1.0 -0.15025461655108763 -0.41308268085320043 -0.1169414982727913
 -0.038501486613267896 -0.38267764368046975 -0.31310398031794584]


In [55]:
# cross val helper from helpers.py

from helpers import check_accuracy_of_model
    

#### Logistic Regression Model

In [56]:
# Fitting using logistic regression and doing cross val 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

reg = LogisticRegression()
check_accuracy_of_model(reg, X, y)


Accuracy: 64.51 %


#### Random Forest Regression Model

In [57]:
from sklearn.ensemble import RandomForestClassifier

reg = RandomForestClassifier()
check_accuracy_of_model(reg, X, y)

Accuracy: 66.02 %


#### K-means Model

In [58]:
from sklearn.neighbors import KNeighborsClassifier
reg = KNeighborsClassifier(n_neighbors=2)
check_accuracy_of_model(reg, X_train, y_train)

Accuracy: 64.20 %


#### XGBoost Model

In [59]:
from xgboost import XGBClassifier
reg = XGBClassifier()
check_accuracy_of_model(reg, X, y)





































Accuracy: 68.83 %


#### Summarizing the results of the different models

Logistic and K-means classification had the lowest accuracies. Random Forest did better and XGBoost did the best in accuracy since it is an improvement of decision trees it was expected.


#### TODO: Improve the XGBoost model using gridsearch to find the optimal hyperparameters