## In-Depth Analysis and Machine Learning

Insert description here.

In [1]:
# remove warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# import relevant packages
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
# preprocessing and tuning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE

# naive bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# random forests and boosting
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# pipeline
from sklearn.pipeline import Pipeline

# set seaborn theme
sns.set()

# set seed
SEED = 11

%matplotlib inline

In [3]:
# load data
listing = pd.read_csv('/Users/limesncoconuts2/datasets/capstone_one/los_angeles/los-angeles_listings.csv')
review = pd.read_csv('/Users/limesncoconuts2/datasets/capstone_one/los_angeles/los-angeles_reviews.csv')

### Naive Bayes on Text Data
Create a score for review data based on which words in reviews are most important to price

#### add text data to main df

In [4]:
review = review[['listing_id','comments']]
review.drop_duplicates(inplace=True)

In [5]:
review_combined = pd.DataFrame(columns=['id','text'])

for row in review[:100].iterrows():
    rev = row[1]
    if rev.listing_id not in review_combined: # if listing is not in the dataframe
        # get all reviews from the same listing
        subset = review[review.listing_id == rev.listing_id]

        # combine text in the subset
        combo = ''
        for com in subset.iterrows():
            combo += com[1].comments

        # add text to dataframe
        add = pd.DataFrame([[rev.listing_id, combo]], columns=['id', 'text'])
        review_combined = review_combined.append(add)
review_combined.id = review_combined.id.astype('int64')
review_combined.drop_duplicates(inplace=True)

In [6]:
df = listing.merge(review_combined, on='id', how='left')

In [7]:
df.text.fillna('', inplace=True)

#### Naive Bayes

In [8]:
# categorize top 25% prices as high, rest as low
df.price_percent = pd.qcut(df.price_USD, 3, labels=['low', 'low1', 'high'])

  


In [9]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.text)
X = X.tocsc()
y = (df.price_percent == 'high').values.astype('int64')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [10]:
# bootstrap resampling to have more 'high' rows
sm = SMOTE(sampling_strategy=1.0 ,random_state=SEED)
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

In [11]:
#### NEED TO FIND BEST PARAMETERS
clf = MultinomialNB()
clf.fit(X_train_sm, y_train_sm)
probs = clf.predict_proba(X_test)
predictions = clf.predict(X_test)

# accuracies
training_accuracy = clf.score(X_train_sm, y_train_sm)
test_accuracy = clf.score(X_test, y_test)

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))

Accuracy on training data: 0.500039
Accuracy on test data:     0.671518


In [12]:
words = np.array(vectorizer.get_feature_names())
words

array(['10', '100', '10min', ..., 'young', 'your', 'zero'], dtype='<U16')

In [13]:
probs

array([[0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       ...,
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5]])

In [14]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
x = np.eye(X_test.shape[1])
probs_log = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs_log)

expensive_words = words[ind[:10]]
cheap_words = words[ind[-10:]]

expensive_prob = probs[ind[:10]]
cheap_prob = probs[ind[-10:]]

print("Expensive words\t     P(expensive | word)")
for w, p in zip(expensive_words, expensive_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))
    
print("Cheap words\t     P(expensive | word)")
for w, p in zip(cheap_words, cheap_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

In [None]:
# add review feature to df


In [None]:
# remove text data from df
listing.info()
listing.drop([], inplace=True)

### Gradient Boosting / Random Forests

In [None]:
# new training and test set
X_reg = listing.drop(['price_USD'])
y_reg = listing.price_USD
X_tr, X_tt, y_tr, y_tt = train_test_split(X_reg, y_reg, test_size=0.2, random_state=SEED)

In [None]:
# run algorithm
# 1000 trees
# max depth of 7
# min samples leaf of 1

rf = RandomForestRegressor(n_estimators=1000, max_depth=7)
rf.fit(X_tr, y_tr)

In [None]:
rf.predict(X_tt)
train_score = rf.score(X_tr, y_tr)
test_score = rf.score(X_tt, y_tt)
print('Train score:', train_score)
print('Test score:', test_score)

In [None]:
# evaluate using ROC / AUC