In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import codecs

import time
from datetime import date
import matplotlib.pyplot as plt


%matplotlib inline


In [3]:
data = r'../FinalProjectEDA/AB_Clean.csv'
pd.set_option('display.max_columns', 500)
AB_Clean = pd.read_csv(data)
AB_Clean.tail(1)

Unnamed: 0.1,Unnamed: 0,last_scraped,name,summary,space,description,neighborhood_overview,transit,access,house_rules,host_since,host_about,host_total_listings_count,neighbourhood,zipcode,room_type,accommodates,bathrooms,beds,bed_type,amenities,price,availability_90,calendar_last_scraped,first_review,last_review,calculated_host_listings_count,reviews_per_month,price_int,price_bed,Host_Age,RPM
2828,3583,2016-09-07,Great Location; Train and Restaurants,"My place is close to Taco Loco Mexican Grill, ...",,"My place is close to Taco Loco Mexican Grill, ...",,,,,2016-05-27,"Hi, I am very friendly, helpful, positive and ...",4,Somerville,2145,Private room,2,1.0,1.0,Real Bed,"{Kitchen,Gym,""Family/Kid Friendly"",Washer,Drye...",$65.00,5,2016-09-06,2016-08-27,2016-09-04,1,2.0,65.0,65.0,103,1


In [131]:
data1 = r'../FinalProjectEDA/AB_Clean.csv'
pd.set_option('display.max_columns', 500)
AB_Clean_text = pd.read_csv(data1)
AB_Clean.tail(1)

Unnamed: 0.1,Unnamed: 0,last_scraped,name,summary,space,description,neighborhood_overview,transit,access,house_rules,host_since,host_about,host_total_listings_count,neighbourhood,zipcode,room_type,accommodates,bathrooms,beds,bed_type,amenities,price,availability_90,calendar_last_scraped,first_review,last_review,calculated_host_listings_count,reviews_per_month,price_int,price_bed,Host_Age,RPM
2828,3583,2016-09-07,Great Location; Train and Restaurants,"My place is close to Taco Loco Mexican Grill, ...",,"My place is close to Taco Loco Mexican Grill, ...",,,,,2016-05-27,"Hi, I am very friendly, helpful, positive and ...",4,Somerville,2145,Private room,2,1.0,1.0,Real Bed,"{Kitchen,Gym,""Family/Kid Friendly"",Washer,Drye...",$65.00,5,2016-09-06,2016-08-27,2016-09-04,1,2.0,65.0,65.0,103,1


#### Linear regression with quantitative variables

The list of my quantitative variables is below

- host_total_listings_count
- accomodates
- availability 90
- price_bed
- Host_age


In [132]:
# Instantatiate a logistic regression and fit model with all the quant vars. 
# Fit a logistic regression model and store the class predictions.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

feature_cols = ['host_total_listings_count','accommodates','availability_90','price_bed','Host_Age']
X = AB_Clean[feature_cols]
y = AB_Clean.RPM

logreg.fit(X,y)
pred = logreg.predict(X)

In [133]:
#score of model
logreg.score(X,y)

0.607281724991163

In [136]:
# show coefficients, accommodates is the most influential
coef = logreg.coef_[0]
coef

array([-0.00478119,  0.05927462,  0.0054951 , -0.00240966, -0.00037408])

In [137]:
# Repeat logistic regression with train, test, split 
# using train test split to cross val
logreg2 = LogisticRegression()

feature_cols = ['host_total_listings_count','accommodates','availability_90','price_bed','Host_Age']
X = AB_Clean[feature_cols]
y = AB_Clean.RPM

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state =42)


logreg2.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

The log reg score increases by ~4% when incorporating train, test, split. 

In [138]:
y_pred = logreg2.predict(x_test)

In [139]:
metrics.accuracy_score(y_test,y_pred)

0.6483050847457628

In [64]:
coef2 = logreg2.coef_[0]
coef2


array([-0.00456227,  0.04635654,  0.00569705, -0.00209617, -0.00033959])

In [81]:
metrics.confusion_matrix(y_test,y_pred)

array([[128, 186],
       [ 63, 331]], dtype=int64)

In [74]:
metrics.confusion_matrix(y_test,y_pred)

array([[  0, 314],
       [  0, 394]], dtype=int64)

Since accommodates is the highest rated variable I will do one additional logreg using this as the predictor. 

In [65]:
# Repeat logistic regression with train, test, split 
# using train test split to cross val
logreg3 = LogisticRegression()

feature_cols = ['accommodates']
X = AB_Clean[feature_cols]
y = AB_Clean.RPM

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state =42)


logreg3.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

The score goes down when accommodates is the only feature.

In [66]:
y_pred = logreg3.predict(x_test)

In [67]:
metrics.accuracy_score(y_test,y_pred)

0.556497175141243

#### NLP with text columns

The list of my text variables is below

- Name
- host_about
- description

I will test my variables using count vectorizer and DF/IDF, to find the best. 


In [68]:
AB_Clean.columns

Index(['Unnamed: 0', 'last_scraped', 'name', 'summary', 'space', 'description',
       'neighborhood_overview', 'transit', 'access', 'house_rules',
       'host_since', 'host_about', 'host_total_listings_count',
       'neighbourhood', 'zipcode', 'room_type', 'accommodates', 'bathrooms',
       'beds', 'bed_type', 'amenities', 'price', 'availability_90',
       'calendar_last_scraped', 'first_review', 'last_review',
       'calculated_host_listings_count', 'reviews_per_month', 'price_int',
       'price_bed', 'Host_Age', 'RPM'],
      dtype='object')

In [148]:
# create new table with just the text columns and RPM

AB_Clean_text = AB_Clean[['host_about','name','description','RPM']]
AB_Clean_text['host_about']=AB_Clean_text['host_about'].astype(str)
AB_Clean_text.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,host_about,name,description,RPM
0,I live in Boston and I like to travel and have...,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,1
1,"I am a middle-aged, single male with a wide ra...",Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...",0


In [127]:
# Define X and y.
X = AB_Clean_text.description
y = AB_Clean_text.RPM

# Split the new DataFrame into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

NLP with name field

When using description and adjusting a couple of parameters I get a score of ~63.8%

In [106]:


# use default options for CountVectorizer
vect = CountVectorizer(stop_words='english',ngram_range=(1,2),max_features=10000)

# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use Naive Bayes  to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy
print(metrics.accuracy_score(y_test, y_pred_class))
print(y_test.value_counts(normalize=True))

0.6384180790960452
1    0.556497
0    0.443503
Name: RPM, dtype: float64


When description is the text field the score goes up to ~68.22

In [129]:

# use default options for CountVectorizer
vect = CountVectorizer(stop_words='english',ngram_range=(1,2),max_features=10000)

# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use Naive Bayes  to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy
print(metrics.accuracy_score(y_test, y_pred_class))
print(y_test.value_counts(normalize=True))

0.6822033898305084
1    0.556497
0    0.443503
Name: RPM, dtype: float64


In [130]:
vect.vocabulary_

{'right': 7660,
 'marathon': 5928,
 'route': 7860,
 'contact': 2710,
 'availability': 852,
 'centrally': 2139,
 'located': 5612,
 'universities': 9415,
 'hospitals': 4732,
 'tourist': 9098,
 'sites': 8179,
 'walk': 9617,
 'subway': 8814,
 'bus': 1927,
 'gourmet': 4246,
 'kitchen': 5150,
 'hotel': 4755,
 'style': 8801,
 'bathroom': 1069,
 'new': 6404,
 'bed': 1245,
 'furniture': 4116,
 'comfortable': 2491,
 'sleeps': 8249,
 'deal': 2953,
 'monday': 6227,
 'night': 6473,
 'minimum': 6127,
 'apt': 710,
 'gorgeous': 4239,
 'bath': 1045,
 'prime': 7182,
 'boston': 1584,
 'location': 5676,
 'near': 6282,
 'college': 2442,
 'harvard': 4489,
 'bu': 1864,
 'brand': 1792,
 'queen': 7319,
 'stainless': 8478,
 'steel': 8601,
 'high': 4573,
 'end': 3436,
 'breakfast': 1797,
 'bar': 995,
 'pull': 7304,
 'temperpedic': 8970,
 'sofa': 8300,
 '52': 214,
 'inch': 4885,
 'flat': 3851,
 'screen': 7934,
 'tv': 9279,
 'wall': 9733,
 'built': 1910,
 'june': 5080,
 '2012': 113,
 'converted': 2766,
 'basement'

Using the host about column we get the lowest score of ~61.8%

In [126]:
# use default options for CountVectorizer
vect = CountVectorizer(stop_words='english',ngram_range=(1,2),max_features=10000)

# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use Naive Bayes  to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy
print(metrics.accuracy_score(y_test, y_pred_class))
print(y_test.value_counts(normalize=True))

0.6186440677966102
1    0.556497
0    0.443503
Name: RPM, dtype: float64


Using the description column and a TF/IDF vectorizer I am able to increase the accuracy to ~68.8%!

This is 13% above the training value. 

In [151]:
# Define X and y.
X = AB_Clean_text.description
y = AB_Clean_text.RPM

# Split the new DataFrame into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [152]:
# use default options for CountVectorizer
vect = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_features=10000)

# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use Naive Bayes  to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy
print(metrics.accuracy_score(y_test, y_pred_class))
print(y_test.value_counts(normalize=True))

0.6878531073446328
1    0.556497
0    0.443503
Name: RPM, dtype: float64
