# Join dataframes (business + reviews)
## Yelp data
* Joined the business and reviews data in dataframe and stored it in "joined_3mReviews_df.p"
* Created the column of ['latest_3m_reviews'] which stores the latest 3 months reviews from the time we want to do prediction.
* Updated columns ['review_count', 'latest_3m_reviews_count']

1\. Import reviews dictionary and yelp_business dataframe

In [1]:
import json
import pandas as pd
import pickle

In [2]:
BUS_PATH = "data/processed/processed_bus_df.p"
bus_df = pd.read_pickle(BUS_PATH)

In [3]:
bus_df.columns

Index(['business_id', 'name', 'neighborhood', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
       'is_open', 'BikeParking', 'BusinessAcceptsBitcoin',
       'BusinessAcceptsCreditCards', 'BusinessParking_garage',
       'BusinessParking_street', 'BusinessParking_validated',
       'BusinessParking_lot', 'BusinessParking_valet', 'DogsAllowed',
       'RestaurantsPriceRange2', 'WheelchairAccessible', 'categories', 'hours',
       'type', 'Alcohol', 'Ambience_romantic', 'Ambience_intimate',
       'Ambience_classy', 'Ambience_hipster', 'Ambience_touristy',
       'Ambience_trendy', 'Ambience_upscale', 'Ambience_casual', 'Caters',
       'GoodForKids', 'GoodForMeal_dessert', 'GoodForMeal_latenight',
       'GoodForMeal_lunch', 'GoodForMeal_dinner', 'GoodForMeal_breakfast',
       'GoodForMeal_brunch', 'HasTV', 'NoiseLevel', 'OutdoorSeating',
       'RestaurantsAttire', 'RestaurantsDelivery', 'RestaurantsGoodForGroups',
       'Restaura

Note: using workaround for issue 24658

In [4]:
file_path = "/Users/xinpeilin/Documents/CS_Learning/Data_Mining/Project/review_cleaned_d.p"

class MacOSFile(object):

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        # print("reading total_bytes=%s" % n, flush=True)
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                batch_size = min(n - idx, 1 << 31 - 1)
                # print("reading bytes [%s,%s)..." % (idx, idx + batch_size), end="", flush=True)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                # print("done.", flush=True)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, 1 << 31 - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size

def pickle_dump(obj, file_path):
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)
    
def pickle_load(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))

In [5]:
import pickle
review_d = pickle_load(file_path)

2.Create column ['reviews'] in bus_df, having only the latest 3 months reviews. Update bus_df['review_count']

In [6]:
import time
import datetime

ThreeMonths = 7776000 # 90 days in second
FourMonths = 120*24*60*60

def get_time(date_str):
    return time.mktime(datetime.datetime.strptime(date_str, "%Y-%m-%d").timetuple())

#input dictionary and a string of bus_id, output a date string of the last review 
def last_review_date(review_dict, bus_id):
    MIN_TIME = "2000-01-01"
    last_date = MIN_TIME
    if bus_id not in review_dict:
        return "error: not found"
    for key, value in review_dict[bus_id].items():
        if get_time(last_date) < get_time(key):
            last_date = key
    return last_date

In [9]:
import numpy as np
#instances_num = 48493
bus_df['latest_3m_reviews'] = ""
bus_df['latest_3m_reviews_count'] = 0
for index, row in bus_df.iterrows():
    if row['business_id'] in review_d:
        text = ""
        review_num = 0
        review_num_3m = 0
        last_date = last_review_date(review_d, row['business_id'])
        for key, value in review_d[row['business_id']].items():
            for review in value:
                review_num += 1
                if get_time(key) > (get_time(last_date) - ThreeMonths):
                    text = text + review + " "
                    review_num_3m += 1
        bus_df.loc[bus_df.business_id == row['business_id'], 'review_count'] = review_num
        bus_df.loc[bus_df.business_id == row['business_id'], 'reviews'] = text
        bus_df.loc[bus_df.business_id == row['business_id'], 'latest_3m_reviews_count'] = review_num_3m
    else:
        print("error: business_id mismatch.")
        
bus_df['review_count'].head()

# Check:
# --6MefnULPED_I942VcFNA      24
# --9e1ONYQuAa-CB_Rrw7Tw    1311
# --DaPTJW3-tB1vP-PfdTEg      32
# --FBCX-N37CMYDfs790Bnw      88

--6MefnULPED_I942VcFNA      22
--9e1ONYQuAa-CB_Rrw7Tw    1280
--DaPTJW3-tB1vP-PfdTEg      29
--FBCX-N37CMYDfs790Bnw      84
--GM_ORV2cYS-h38DSaCLw       5
Name: review_count, dtype: int64

Testing

In [80]:
# pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_rows', None)
# #print(str(bus_df.loc[bus_df.business_id == '--6MefnULPED_I942VcFNA', 'reviews']))
# bus_df['reviews'].head()

In [16]:
bus_df['latest_3m_reviews_count'].head()

--6MefnULPED_I942VcFNA     1
--9e1ONYQuAa-CB_Rrw7Tw    43
--DaPTJW3-tB1vP-PfdTEg     2
--FBCX-N37CMYDfs790Bnw     4
--GM_ORV2cYS-h38DSaCLw     1
Name: latest_3m_reviews_count, dtype: int64

In [18]:
bus_df = bus_df.drop('latest_3m_reviews', axis=1)
bus_df=bus_df.rename(columns = {'reviews':'latest_3m_reviews'})

In [19]:
newfile_path = "/Users/xinpeilin/Documents/CS_Learning/Data_Mining/Project/joined_3mReviews_df.p"

In [20]:
pickle_dump(bus_df, newfile_path)

writing total_bytes=161549122...
writing bytes [0, 161549122)... done.


In [21]:
df = pickle_load(newfile_path)

In [30]:
df[['review_count','latest_3m_reviews_count','latest_3m_reviews']].head()

Unnamed: 0,review_count,latest_3m_reviews_count,latest_3m_reviews
--6MefnULPED_I942VcFNA,22,1,They have the best Chinese BBQ Pork (Char Siu)...
--9e1ONYQuAa-CB_Rrw7Tw,1280,43,Exceptional...exceptional steakhouse!! Ordered...
--DaPTJW3-tB1vP-PfdTEg,29,2,Sunnyside grill is sort of american diner sort...
--FBCX-N37CMYDfs790Bnw,84,4,Moved to a neighborhood right by this restaura...
--GM_ORV2cYS-h38DSaCLw,5,1,Screwed up my order. Had to wait for a second ...


In [31]:
reviews_df = df[['latest_3m_reviews', 'is_open']].copy()
reviews_df.head()


Unnamed: 0,latest_3m_reviews,is_open
--6MefnULPED_I942VcFNA,They have the best Chinese BBQ Pork (Char Siu)...,1
--9e1ONYQuAa-CB_Rrw7Tw,Exceptional...exceptional steakhouse!! Ordered...,1
--DaPTJW3-tB1vP-PfdTEg,Sunnyside grill is sort of american diner sort...,1
--FBCX-N37CMYDfs790Bnw,Moved to a neighborhood right by this restaura...,1
--GM_ORV2cYS-h38DSaCLw,Screwed up my order. Had to wait for a second ...,1


In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


* tfidf unigram

In [33]:
X= reviews_df['latest_3m_reviews']
Y= np.asarray(reviews_df['is_open'], dtype='bool')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.80)

vectorizer = TfidfVectorizer(sublinear_tf = True)
vectorizer.fit(X_train)
feature_names = vectorizer.get_feature_names()
# vocabulary_list = list(zip( vectorizer.vocabulary_.keys(), binary_vectorizer.vocabulary_.values()) )
# vocabulary_list[0:10]


In [44]:
len(feature_names) #109107
feature_names[5000:5010]

['aligot',
 'alihan',
 'aliiiiive',
 'aliitle',
 'alike',
 'alikes',
 'alil',
 'alimentaire',
 'alimentary',
 'alimentation']

In [45]:
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [49]:
X_train_tfidf

<38794x109107 sparse matrix of type '<class 'numpy.float64'>'
	with 7686915 stored elements in Compressed Sparse Row format>

In [73]:
model = LogisticRegression()
model.fit(X_train_tfidf, Y_train)
LR_accuracy = metrics.accuracy_score(model.predict(X_test_tfidf), Y_test)
print("LR Accuracy = %.3f" % LG_accuracy)
print("Area under the ROC curve on test data (LogisticRegression) = %.3f" % metrics.roc_auc_score(model.predict(X_test_tfidf), Y_test))

LR Accuracy = 0.788
Area under the ROC curve on test data (LogisticRegression) = 0.727


In [64]:
from sklearn.svm import LinearSVC
lsvc_model = LinearSVC()
lsvc_model.fit(X_train_tfidf, Y_train)
SVM_accuracy = metrics.accuracy_score(lsvc_model.predict(X_test_tfidf), Y_test)
print("SVM Accuracy = %.3f" % SVM_accuracy)
print("Area under the ROC curve on test data (SVM) = %.3f" % metrics.roc_auc_score(lsvc_model.predict(X_test_tfidf), Y_test))

SVM Accuracy = 0.774
Area under the ROC curve on test data (SVM) = 0.670


* tfidf trigram

In [63]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), sublinear_tf = True)
vectorizer.fit(X_train)
feature_names = vectorizer.get_feature_names()

In [71]:
len(feature_names) #10,543,718
feature_names[50000:50010]

['23 in',
 '23 in the',
 '23 including',
 '23 including the',
 '23 ish',
 '23 ish 25',
 '23 ish dinner',
 '23 it',
 '23 it an',
 '23 it included']

In [77]:
X_train_tfidf_trigram = vectorizer.transform(X_train)
X_test_tfidf_trigram = vectorizer.transform(X_test)
model.fit(X_train_tfidf_trigram, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
LR_accuracy_trigram = metrics.accuracy_score(model.predict(X_test_tfidf_trigram), Y_test)
print("LR Accuracy = %.3f" % LR_accuracy_trigram)
print ("Area under the ROC curve on test data (LogisticRegression, trigram) = %.3f" % metrics.roc_auc_score(model.predict(X_test_tfidf_trigram), Y_test))

LR Accuracy = 0.773
Area under the ROC curve on test data (LogisticRegression, trigram) = 0.807


In [79]:
# https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d
from sklearn.feature_extraction.text import CountVectorizer
top_positive_feature_list = []
top_negative_feature_list = []

def get_coefficeints(classifier, feature_names, top_features):
    coef = classifier.coef_.ravel()
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    add_count_for_features(top_positive_coefficients, top_negative_coefficients, feature_names)
    # top_positive_feature_list = []
    # top_negative_feature_list = []
    for i in top_positive_coefficients[::-1]:
        top_positive_feature_list.append(feature_names[i]) 
    for i in top_negative_coefficients[::-1]:
        top_negative_feature_list.append(feature_names[i])

    # print ("top_positive_feature_list:", top_positive_feature_list)
    # print ("top_negative_feature_list:", top_negative_feature_list)
    return coef, top_coefficients

    
def plot_coefficients(classifier, feature_names, top_features=20):
    # create plot
    coef, top_coefficients = get_coefficeints(classifier, feature_names, top_features)
    plt.figure(figsize=(20, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()
    
    
def add_count_for_features(top_positive_coefficients, top_negative_coefficients, feature_names):
    for i in top_positive_coefficients[::-1]:
        try: 
            total_positive_feature_count[feature_names[i]] += 1
        except:
            total_positive_feature_count[feature_names[i]] = 1
            
    for i in top_negative_coefficients[::-1]:
        try: 
            total_negative_feature_count[feature_names[i]] += 1
        except:
            total_negative_feature_count[feature_names[i]] = 1
            

for i in range(len(cluster_eps01_models)):
    X, Y = feature_target_seperator(cluster_eps01_dfs[i])
    cv = CountVectorizer()
    cv.fit(X)
    plot_coefficients(cluster_eps01_models[i], cv.get_feature_names(), top_features=20)

# print (total_positive_feature_count)
# print (total_negative_feature_count)
# plot_coefficients(cluster0_LinearSVC_model, cv.get_feature_names())

[[ 0.03259681 -0.05354299  0.0013548  ..., -0.02637974 -0.02637974
  -0.02637974]]


4.Create column ['reviews'] in bus_df, having the latest 4 months reviews. Update bus_df['review_count']

In [8]:
import numpy as np
#instances_num = 48493
bus_df['latest_4m_reviews'] = ""
bus_df['latest_4m_reviews_count'] = 0
for index, row in bus_df.iterrows():
    if row['business_id'] in review_d:
        text = ""
        review_num = 0
        review_num_4m = 0
        last_date = last_review_date(review_d, row['business_id'])
        for key, value in review_d[row['business_id']].items():
            for review in value:
                review_num += 1
                if get_time(key) > (get_time(last_date) - FourMonths):
                    text = text + review + " "
                    review_num_4m += 1
        bus_df.loc[bus_df.business_id == row['business_id'], 'review_count'] = review_num
        bus_df.loc[bus_df.business_id == row['business_id'], 'latest_4m_reviews'] = text
        bus_df.loc[bus_df.business_id == row['business_id'], 'latest_4m_reviews_count'] = review_num_4m
    else:
        print("error: business_id mismatch.")
        
bus_df['review_count'].head()

# Check:
# --6MefnULPED_I942VcFNA      24
# --9e1ONYQuAa-CB_Rrw7Tw    1311
# --DaPTJW3-tB1vP-PfdTEg      32
# --FBCX-N37CMYDfs790Bnw      88

--6MefnULPED_I942VcFNA      22
--9e1ONYQuAa-CB_Rrw7Tw    1280
--DaPTJW3-tB1vP-PfdTEg      29
--FBCX-N37CMYDfs790Bnw      84
--GM_ORV2cYS-h38DSaCLw       5
Name: review_count, dtype: int64

In [9]:
bus_df[['review_count', 'latest_4m_reviews_count', 'latest_4m_reviews']].head()

Unnamed: 0,review_count,latest_4m_reviews_count,latest_4m_reviews
--6MefnULPED_I942VcFNA,22,2,The incredibly rude woman behind the cashier w...
--9e1ONYQuAa-CB_Rrw7Tw,1280,60,Exceptional...exceptional steakhouse!! Ordered...
--DaPTJW3-tB1vP-PfdTEg,29,2,Sunnyside grill is sort of american diner sort...
--FBCX-N37CMYDfs790Bnw,84,5,"Happy Hour has to many rules, certain drinks a..."
--GM_ORV2cYS-h38DSaCLw,5,1,Screwed up my order. Had to wait for a second ...


In [10]:
pickle_dump(bus_df, "/Users/xinpeilin/Documents/CS_Learning/Data_Mining/Project/joined_4mReviews_df.p")

writing total_bytes=196054111...
writing bytes [0, 196054111)... done.


In [11]:
df = pickle_load("/Users/xinpeilin/Documents/CS_Learning/Data_Mining/Project/joined_4mReviews_df.p")
reviews_df = df[['latest_4m_reviews', 'is_open']].copy()
reviews_df.head()

Unnamed: 0,latest_4m_reviews,is_open
--6MefnULPED_I942VcFNA,The incredibly rude woman behind the cashier w...,1
--9e1ONYQuAa-CB_Rrw7Tw,Exceptional...exceptional steakhouse!! Ordered...,1
--DaPTJW3-tB1vP-PfdTEg,Sunnyside grill is sort of american diner sort...,1
--FBCX-N37CMYDfs790Bnw,"Happy Hour has to many rules, certain drinks a...",1
--GM_ORV2cYS-h38DSaCLw,Screwed up my order. Had to wait for a second ...,1


In [13]:
X= reviews_df['latest_4m_reviews']
Y= np.asarray(reviews_df['is_open'], dtype='bool')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.80)

vectorizer = TfidfVectorizer(sublinear_tf = True)
vectorizer.fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [31]:
LR_model = LogisticRegression()
LR_model.fit(X_train_tfidf, Y_train)
LR_accuracy = metrics.accuracy_score(LR_model.predict(X_test_tfidf), Y_test)
print("<Features: 120 days reviews, Model: Logistic Regression>")
print("Accuracy = %.3f" % LR_accuracy)
print("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(LR_model.predict(X_test_tfidf), Y_test))

<Features: 120 days reviews, Model: Logistic Regression>
Accuracy = 0.794
Area under the ROC curve on test data = 0.730


In [19]:
X_tfidf = vectorizer.transform(X)

In [25]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LR_model, X_tfidf, Y, cv=10, scoring='roc_auc')
print("Cross Validation AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#4 months, LR

Cross Validation AUC: 0.76 (+/- 0.01)


In [40]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [48]:
X_train_wcount = count_vectorizer.transform(X_train)
X_test_wcount = count_vectorizer.transform(X_test)

In [49]:
print(X_train_wcount[20])

  (0, 6642)	3
  (0, 10772)	1
  (0, 16358)	1
  (0, 17974)	1
  (0, 25937)	1
  (0, 30275)	1
  (0, 33620)	1
  (0, 37106)	1
  (0, 37942)	1
  (0, 41061)	1
  (0, 41230)	2
  (0, 41693)	1
  (0, 50086)	1
  (0, 50138)	1
  (0, 51477)	1
  (0, 53652)	1
  (0, 55111)	2
  (0, 55178)	1
  (0, 55226)	1
  (0, 55317)	1
  (0, 60331)	2
  (0, 61684)	1
  (0, 62114)	1
  (0, 64507)	1
  (0, 66373)	1
  :	:
  (0, 91246)	1
  (0, 92716)	1
  (0, 93263)	1
  (0, 98881)	1
  (0, 99677)	1
  (0, 100457)	1
  (0, 100520)	1
  (0, 100539)	1
  (0, 102403)	1
  (0, 104183)	3
  (0, 104203)	6
  (0, 104410)	1
  (0, 104515)	3
  (0, 104539)	1
  (0, 105351)	2
  (0, 107240)	1
  (0, 107770)	1
  (0, 107809)	1
  (0, 108045)	1
  (0, 110002)	1
  (0, 113254)	1
  (0, 114761)	1
  (0, 115020)	1
  (0, 115496)	1
  (0, 116740)	1


In [50]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
NB_model = MultinomialNB()
NB_model.fit(X_train_wcount, Y_train)
NB_accuracy = metrics.accuracy_score(NB_model.predict(X_test_wcount), Y_test)
print("<Features: 120 days reviews, Model: MultinomialNB>")
print("Accuracy = %.3f" % NB_accuracy)
print("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(NB_model.predict(X_test_wcount), Y_test))

<Features: 120 days reviews, Model: MultinomialNB>
Accuracy = 0.778
Area under the ROC curve on test data = 0.673


In [51]:
from sklearn.naive_bayes import BernoulliNB
NB_model = BernoulliNB()
NB_model.fit(X_train_wcount, Y_train)
NB_accuracy = metrics.accuracy_score(NB_model.predict(X_test_wcount), Y_test)
print("<Features: 120 days reviews, Model: BernoulliNB>")
print("Accuracy = %.3f" % NB_accuracy)
print("Area under the ROC curve on test data = %.3f" % metrics.roc_auc_score(NB_model.predict(X_test_wcount), Y_test))

<Features: 120 days reviews, Model: BernoulliNB>
Accuracy = 0.517
Area under the ROC curve on test data = 0.537
