# Modeling

## Imports

In [63]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# Regressors used in the modeling cells below; without this import a fresh
# kernel fails with NameError on LinearRegression / Lasso / Ridge.
from sklearn.linear_model import Lasso, LinearRegression, Ridge
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this
# import fails on newer versions (ConfusionMatrixDisplay is its replacement).
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

from sklearn.compose import ColumnTransformer

In [64]:
# Read in the cleaned listings again and discard the 'Unnamed: 0' column.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk. The recorded run of this cell echoed
# the read_csv line as a warning, presumably the mixed-type DtypeWarning that
# chunked inference produces -- confirm against the notebook's stderr.
listings_df = pd.read_csv(
    '../data/cleaned_listings.csv',
    low_memory=False,
).drop(columns='Unnamed: 0')

  listings_df = pd.read_csv('../data/cleaned_listings.csv').drop(columns = 'Unnamed: 0')


In [65]:
# Preview the first row to sanity-check the columns that were loaded.
listings_df.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2595.0,https://www.airbnb.com/rooms/2595,20221204162430.0,2022-12-05,city scrape,Skylit Midtown Castle,"Beautiful, spacious skylit studio in the heart...",Centrally located in the heart of Manhattan ju...,https://a0.muscache.com/pictures/f0813a11-40b2...,2845,...,4.63,4.77,4.8,4.81,4.4,3.0,3.0,0.0,0.0,0.31


In [105]:
# Check the dtype of the price column.
# NOTE(review): this cell's execution count (105) is later than the numeric
# conversion cell (88), so the recorded float64 reflects the already-converted
# column. On a top-to-bottom run the prices are still '$'-prefixed strings at
# this point (see the value_counts output that follows).
listings_df['price'].dtypes

dtype('float64')

In [66]:
# (rows, columns) of the loaded frame.
listings_df.shape

(41535, 75)

In [72]:
# Frequency of each raw price value. The output shows the '$' prefix and the
# ',' thousands separators that the following cells strip.
listings_df['price'].value_counts()

$150.00      1228
$100.00      1077
$200.00       883
$120.00       743
$80.00        734
             ... 
$588.00         1
$1,092.00       1
$4,311.00       1
$522.00         1
$2,830.00       1
Name: price, Length: 1289, dtype: int64

In [73]:
# Strip the dollar sign from every price string.
# The pattern is a raw string: '\$' in a normal literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning from Python 3.12), while r'\$'
# passes the same regex -- a literal '$' -- to pandas.
listings_df['price'] = listings_df['price'].replace({r'\$': ''}, regex=True)

In [74]:
# Confirm the '$' prefix is gone; the values are still strings (dtype object).
listings_df['price'].head()

0    175.00
1     75.00
2    275.00
3     60.00
4     68.00
Name: price, dtype: object

In [76]:
# Remove thousands separators (e.g. '1,092.00' -> '1092.00') so the strings
# can later be parsed as numbers.
listings_df['price'] = listings_df['price'].replace(to_replace=',', value='', regex=True)

In [77]:
# Re-inspect the first prices after removing the commas.
listings_df['price'].head()

0    175.00
1     75.00
2    275.00
3     60.00
4     68.00
Name: price, dtype: object

In [79]:
# Keep only the rows whose price is not a single blank space.
listings_df = listings_df.loc[listings_df['price'].ne(' ')]

In [81]:
# Inspect the row at integer position 7551; the double brackets return a
# one-row DataFrame rather than a Series.
listings_df.iloc[[7551]]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
7552,,,,f,https://a0.muscache.com/im/pictures/user/19857...,https://a0.muscache.com/im/pictures/user/19857...,Williamsburg,1.0,4.0,"['email', 'phone']",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
# Drop the record inspected in the previous cell. Its recorded output shows
# that position 7551 holds index label 7552 and that the row looks malformed
# (missing id, URLs in unexpected columns) -- confirm against the raw file.
# Dropping by label rather than by position keeps this cell idempotent: the
# positional form removed whichever row happened to sit at position 7551 each
# time the cell was re-run. errors='ignore' makes a repeat run a no-op.
MALFORMED_ROW_LABEL = 7552
listings_df = listings_df.drop(index=MALFORMED_ROW_LABEL, errors='ignore')

In [87]:
# Filter out rows whose price field holds the string '2022-12-05'
# (presumably a misplaced scrape date -- verify against the raw data).
listings_df = listings_df.loc[listings_df['price'].ne('2022-12-05')]

In [88]:
# Convert the cleaned price strings to numbers.
# assign() builds a new frame instead of writing a column into the
# boolean-filtered frame produced by the preceding cells, which is the pattern
# that triggers pandas' SettingWithCopyWarning. The column keeps its name and
# position, and re-running the cell is harmless.
listings_df = listings_df.assign(price=pd.to_numeric(listings_df['price']))

In [98]:
# Collect the labels of every float-typed column as candidate features.
float_columns = listings_df.select_dtypes(include=[float]).columns

In [100]:
# Materialise the column labels as a plain Python list so entries can be removed.
float_columns = [*float_columns]

In [101]:
# Exclude the target from the feature list.
# A filtering comprehension replaces list.remove() so the cell can be re-run
# safely: remove() raises ValueError once 'price' has already been taken out.
float_columns = [column for column in float_columns if column != 'price']

In [102]:
# Feature matrix: every float column except the target.
# NOTE(review): identifier-like values such as id and scrape_id are rendered
# as floats in the head() preview, so they presumably land in X too -- confirm
# that this is intended.
X = listings_df.loc[:, float_columns]
# Target vector: the numeric price column.
y = listings_df.loc[:, 'price']

In [103]:
# Hold out a test set (no test_size is passed, so scikit-learn's default split
# proportion applies); the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [104]:
# Baseline model: linear regression on the unscaled float features.
lr = LinearRegression()
# Left as the final expression so the fitted estimator is displayed.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [106]:
# Score of the baseline on the training split.
# NOTE(review): the recorded value (~0.04) is far below the test score in the
# next cell (~0.22). That ordering is unusual and worth investigating (e.g.
# extreme prices landing in the training split) -- cause unconfirmed.
lr.score(X_train, y_train)

0.0401065767163582

In [107]:
# Score of the baseline on the held-out test split.
lr.score(X_test, y_test)

0.2182936456491481

In [108]:
# Lasso
# Create the scaler used to standardise the features before the Lasso fit;
# it is fitted in the next cell.
sc = StandardScaler()

In [110]:
# Fit the scaler on the training split only, then transform that split.
X_scaled = sc.fit_transform(X_train)

In [111]:
# Transform the test split with the scaler fitted on the training split (no refit).
X_test_scaled = sc.transform(X_test)

In [112]:
# Lasso regressor with its default constructor arguments (nothing is tuned here).
lasso = Lasso()

In [113]:
# Fit Lasso on the standardised training features.
lasso.fit(X_scaled, y_train)

Lasso()

In [114]:
# Lasso score on the standardised training split.
lasso.score(X_scaled, y_train)

0.03951061723132976

In [115]:
# Lasso score on the standardised test split.
lasso.score(X_test_scaled, y_test)

0.21964124883466007

In [116]:
# Ridge
# NOTE(review): this creates a second StandardScaler under the same name used
# in the Lasso section; it is refitted on the same X_train in the next cell.
sc = StandardScaler()

In [117]:
# Fit the scaler on the training split and transform it. This overwrites the
# X_scaled produced in the Lasso section with a result from the same call.
X_scaled = sc.fit_transform(X_train)

# Transform the test split with the training-fitted scaler (no refit).
X_test_scaled = sc.transform(X_test)

In [118]:
# Ridge regressor with its default constructor arguments (nothing is tuned here).
ridge = Ridge()

In [119]:
# Fit Ridge on the standardised training features.
ridge.fit(X_scaled, y_train)

Ridge()

In [120]:
# Ridge score on the standardised training split.
ridge.score(X_scaled, y_train)

0.04010298154761838

In [121]:
# Ridge score on the standardised test split.
ridge.score(X_test_scaled, y_test)

0.21818360479499188