In [36]:
import json
import pandas as pd
import re
import random
from ast import literal_eval as make_tuple
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn_deltatfidf import DeltaTfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import sys
sys.path.append('../machine_learning')
import yelp_ml as yml
reload(yml)
from gensim import corpora, models, similarities, matutils
import tqdm

In [27]:
#Import scrapped reviews
dc_reviews = json.load(open("../Yelp_web_scrapper/dc_reviews.json"))
newyork_reviews = json.load(open("../Yelp_web_scrapper/newyork_reviews.json"))

In [28]:
#Import word dictionaries
lh_neg = open('../input/negative-words.txt', 'r').read()
lh_neg = lh_neg.split('\n')
lh_pos = open('../input/positive-words.txt', 'r').read()
lh_pos = lh_pos.split('\n')
users = json.load(open("cleaned_large_user_dictionary.json"))
word_list = list(set(lh_pos + lh_neg))

In [74]:
#####Pull restaurant data for a given user
ip = '54.175.170.119'
conn = MongoClient(ip, 27017)
conn.database_names()
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')
string_keys_dict = {}
for j in tqdm.tqdm(range(155, 157)):
    test_results = {}
    user_df = yml.make_user_df(users[users.keys()[j]])
    
    if len([x for x in user_df['rating'] if x < 4]) < 20:
        string_keys_dict[str(users.keys()[j])] = test_results
        continue
    else:
        business_ids = list(set(user_df['biz_id']))
        restreview = {}
        
        #Create a training and test sample from the user reviewed restaurants
        split_samp = .25
        len_random = int(len(business_ids) * split_samp)
        test_set = random.sample(business_ids, len_random)
        training_set = [x for x in business_ids if x not in test_set]
        sub_train_reviews, train_labels, train_reviews, train_ratings = [], [], [], []

        #Create a list of training reviews and training ratings
        for rest_id in training_set:
            train_reviews.append((user_df[user_df['biz_id'] == rest_id]['review_text'].iloc[0],
                                     user_df[user_df['biz_id'] == rest_id]['rating'].iloc[0]))

        #Create an even sample s.t. len(positive_reviews) = len(negative_reviews)
        sample_size = min(len([x[1] for x in train_reviews if x[1] < 4]),
                              len([x[1] for x in train_reviews if x[1] >= 4]))

        bad_reviews = [x for x in train_reviews if x[1] < 4]
        good_reviews = [x for x in train_reviews if x[1] >= 4]

        for L in range(0, int(float(sample_size)/float(2))):
            sub_train_reviews.append(bad_reviews[L][0])
            sub_train_reviews.append(good_reviews[L][0])
            train_labels.append(bad_reviews[L][1])
            train_labels.append(good_reviews[L][1])

        #Make the train labels binary
        train_labels = [1 if x >=4 else 0 for x in train_labels]
        
        if not sub_train_reviews:
            string_keys_dict[str(users.keys()[j])] = test_results
            continue
        else:
            for i in range(0, len(business_ids)):
                rlist = []
                for obj in reviews.find({'business_id':business_ids[i]}):
                    rlist.append(obj)
                restreview[business_ids[i]] = rlist

            restaurant_df = yml.make_biz_df(users.keys()[j], restreview)

            #Make a FeatureUnion object with the desired features then fit to train reviews
            feature_selection = {"sent_tf":(True, True, False), 
                                 "sent": (True,False,False),
                                 "tf_lda": (False,True,True), 
                                 "all": (True, True, True)}

            for feature in feature_selection.keys():
                #Make a FeatureUnion object with the desired features then fit to train reviews
                comb_features = yml.make_featureunion(sent_percent=feature_selection[feature][0], 
                                                      tf = feature_selection[feature][1], 
                                                      lda = feature_selection[feature][2])

                delta_vect = None
                comb_features.fit(sub_train_reviews)
                train_features = comb_features.transform(sub_train_reviews)

                #Fit LSI model and return number of LSI topics
                lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)
                train_lsi = yml.get_lsi_features(sub_train_reviews, lsi, topics, dictionary)

                #Stack the LSI and combined features together
                train_features = sparse.hstack((train_features, train_lsi))
                train_features = train_features.todense()

                #fit each model in turn 
                model_runs = {"svm": (True, False, False),
                              "rf": (False, True, False), 
                              "naive_bayes": (False, False, True)}

                for model_run in model_runs.keys():
                    clf = yml.fit_model(train_features, train_labels, svm_clf = model_runs[model_run][0], 
                                    RandomForest = model_runs[model_run][1], 
                                        nb = model_runs[model_run][2])
                    threshold = 0.7
                    error = yml.test_user_set(test_set, clf, restaurant_df, user_df, comb_features, 
                                              threshold, lsi, topics, dictionary, delta_vect)
                    test_results[str((feature, model_run))] = (yml.get_log_loss(error), 
                                                    yml.get_accuracy_score(error), 
                                                    yml.get_precision_score(error))
                
    string_keys_dict[str(users.keys()[j])] = test_results
            
with open('test_results.json', 'wb') as fp:
    json.dump(string_keys_dict, fp)

100%|██████████| 2/2 [58:57<00:00, 1978.89s/it]


In [81]:
test_results = string_keys_dict['n86B7IkbU20AkxlFX_5aew']

In [None]:
#########################
#Make a Recommendation
#########################
top_results = []
#Get feature and model combination that yields the highest precision
for key in test_results.keys():
    feat_model = make_tuple(key)
    if not top_results:
        top_results = [(feat_model,test_results[key][2])]
    else:
        if test_results[key][2] > top_results[0][1]:
            top_results.pop()
            top_results = [(feat_model, test_results[key][2])]
feat_result = top_results[0][0][0]
model_result = top_results[0][0][1]

for j in tqdm.tqdm(range(155, 157)):
    user_df = yml.make_user_df(users[users.keys()[j]])
    business_ids = list(set(user_df['biz_id']))

    #Create a list of training reviews and training ratings
    for rest_id in business_ids:
        train_reviews.append((user_df[user_df['biz_id'] == rest_id]['review_text'].iloc[0],
                                 user_df[user_df['biz_id'] == rest_id]['rating'].iloc[0]))

    #Create an even sample s.t. len(positive_reviews) = len(negative_reviews)
    sample_size = min(len([x[1] for x in train_reviews if x[1] < 4]),
                          len([x[1] for x in train_reviews if x[1] >= 4]))
    
    bad_reviews = [x for x in train_reviews if x[1] < 4]
    good_reviews = [x for x in train_reviews if x[1] >= 4]
    
    train_labels = []
    sub_train_reviews = []
    for L in range(0, int(float(sample_size)/float(2))):
        sub_train_reviews.append(bad_reviews[L][0])
        sub_train_reviews.append(good_reviews[L][0])
        train_labels.append(bad_reviews[L][1])
        train_labels.append(good_reviews[L][1])
        
    #Make the train labels binary
    train_labels = [1 if x >=4 else 0 for x in train_labels]
    
    #Fit LSI model and return number of LSI topics
    lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)

    #Make a FeatureUnion object with the desired features then fit to train reviews
    feature_selection = {"sent_tf":(True, True, False), 
                         "sent": (True,False,False),
                         "tf_lda": (False,True,True), 
                         "all": (True, True, True)}
    top_feature = feature_selection['all']
    
    comb_features = yml.make_featureunion(sent_percent=top_feature[0], 
                                          tf = top_feature[1], 
                                          lda = top_feature[2])
        
    comb_features.fit(sub_train_reviews)
    train_features = comb_features.transform(sub_train_reviews)
    train_lsi = yml.get_lsi_features(sub_train_reviews, lsi, topics, dictionary)
    train_features = sparse.hstack((train_features, train_lsi))
    train_features = train_features.todense()

    #Fit LSI model and return number of LSI topics
    lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)
        
    #Get the top performing model and fit using that model
    model_runs = {"svm": (True, False, False),
                  "rf": (False, True, False), 
                  "naive_bayes": (False, False, True)}
    top_model = model_runs['svm']
    clf = yml.fit_model(train_features, train_labels, svm_clf = top_model[0], 
                RandomForest = top_model[1], 
                    nb = top_model[2])

    threshold = 0.7
    user_results = yml.make_rec(dc_reviews, clf, threshold, comb_features, 
                                lsi, topics, dictionary)



In [85]:
################################################################
#Collect the results into a list of tuples, then select the top
#5 most confident good recs and top 5 most confident bad recs
################################################################
tuple_results = []
for result in user_results:
    tuple_results.append((result[1], result[2], result[3]))
    
#Sort the list of tuples by predicition confidence
tuple_results = sorted(tuple_results, key=lambda tup: tup[1])
top_5 = tuple_results[-5:]
bottom_5 = tuple_results[0:5]

In [86]:
#Show the top 5 restaurants
top_5

[(u'/biz/bar-a-vin-washington', 0.94999999999999996, 1),
 (u'/biz/ristorante-la-perla-of-washington-washington-3',
  0.94999999999999996,
  1),
 (u'/biz/ruths-chris-steak-house-washington', 1.0, 1),
 (u'/biz/la-jambe-washington', 1.0, 1),
 (u'/biz/ghibellina-washington', 1.0, 1)]

In [87]:
#Show the bottom 5 restaurants
bottom_5

[(u'/biz/daikaya-ramen-shop-washington', 0.0, 0),
 (u'/biz/moxies-washington', 0.14999999999999999, 0),
 (u'/biz/cornerstone-cafe-washington', 0.14999999999999999, 0),
 (u'/biz/sakuramen-washington', 0.20000000000000001, 0),
 (u'/biz/subbs-washington', 0.25, 0)]

In [94]:
#user_df contains all of the user's reviews, with one column of restaurants,
#one column of the user's ratings, and one column with the user's reviews
user_df = yml.make_user_df(users[users.keys()[j]])
user_df.head()

TypeError: string indices must be integers, not str

In [89]:
#View the top words in the LDA representation
no_top_words = 10
tf_feature_names = vectorizer.get_feature_names()
display_topics(lda_fit, tf_feature_names, no_top_words)

NameError: name 'vectorizer' is not defined