In [36]:
import json
import pandas as pd
import re
import random
from ast import literal_eval as make_tuple
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn_deltatfidf import DeltaTfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import sys
sys.path.append('../machine_learning')
import yelp_ml as yml
reload(yml)
from gensim import corpora, models, similarities, matutils
import tqdm

In [27]:
#Import scrapped reviews
dc_reviews = json.load(open("../Yelp_web_scrapper/dc_reviews.json"))
newyork_reviews = json.load(open("../Yelp_web_scrapper/newyork_reviews.json"))

In [28]:
#Import word dictionaries
lh_neg = open('../input/negative-words.txt', 'r').read()
lh_neg = lh_neg.split('\n')
lh_pos = open('../input/positive-words.txt', 'r').read()
lh_pos = lh_pos.split('\n')
users = json.load(open("cleaned_large_user_dictionary.json"))
word_list = list(set(lh_pos + lh_neg))

In [72]:
#####Pull restaurant data for a given user
ip = '54.175.170.119'
conn = MongoClient(ip, 27017)
conn.database_names()
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')
string_keys_dict = {}
for j in tqdm.tqdm(range(16, 17)):
    test_results = {}
    user_df = yml.make_user_df(users[users.keys()[j]])
    
    if len([x for x in user_df['rating'] if x < 4]) < 20:
        string_keys_dict[str(users.keys()[j])] = test_results
        continue
    else:
        business_ids = list(set(user_df['biz_id']))
        restreview = {}
        
        #Create a training and test sample from the user reviewed restaurants
        split_samp = .25
        len_random = int(len(business_ids) * split_samp)
        test_set = random.sample(business_ids, len_random)
        training_set = [x for x in business_ids if x not in test_set]
        sub_train_reviews, train_labels, train_reviews, train_ratings = [], [], [], []

        #Create a list of training reviews and training ratings
        for rest_id in training_set:
            train_reviews.append((user_df[user_df['biz_id'] == rest_id]['review_text'].iloc[0],
                                     user_df[user_df['biz_id'] == rest_id]['rating'].iloc[0]))

        #Create an even sample s.t. len(positive_reviews) = len(negative_reviews)
        sample_size = min(len([x[1] for x in train_reviews if x[1] < 4]),
                              len([x[1] for x in train_reviews if x[1] >= 4]))

        bad_reviews = [x for x in train_reviews if x[1] < 4]
        good_reviews = [x for x in train_reviews if x[1] >= 4]

        for L in range(0, int(float(sample_size)/float(2))):
            sub_train_reviews.append(bad_reviews[L][0])
            sub_train_reviews.append(good_reviews[L][0])
            train_labels.append(bad_reviews[L][1])
            train_labels.append(good_reviews[L][1])

        #Make the train labels binary
        train_labels = [1 if x >=4 else 0 for x in train_labels]
        
        if not sub_train_reviews:
            string_keys_dict[str(users.keys()[j])] = test_results
            continue
        else:
        for i in range(0, len(business_ids)):
            rlist = []
            for obj in reviews.find({'business_id':business_ids[i]}):
                rlist.append(obj)
            restreview[business_ids[i]] = rlist

        restaurant_df = yml.make_biz_df(users.keys()[j], restreview)

        #Make a FeatureUnion object with the desired features then fit to train reviews
        feature_selection = {"sent_tf":(True, True, False), 
                             "sent": (True,False,False),
                             "tf_lda": (False,True,True), 
                             "all": (True, True, True)}

        for feature in feature_selection.keys():
            #Make a FeatureUnion object with the desired features then fit to train reviews
            comb_features = yml.make_featureunion(sent_percent=feature_selection[feature][0], 
                                                  tf = feature_selection[feature][1], 
                                                  lda = feature_selection[feature][2])

            delta_vect = None
            comb_features.fit(sub_train_reviews)
            train_features = comb_features.transform(sub_train_reviews)

            #Fit LSI model and return number of LSI topics
            lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)
            train_lsi = yml.get_lsi_features(sub_train_reviews, lsi, topics, dictionary)

            #Stack the LSI and combined features together
            train_features = sparse.hstack((train_features, train_lsi))
            train_features = train_features.todense()

            #fit each model in turn 
            model_runs = {"svm": (True, False, False),
                          "rf": (False, True, False), 
                          "naive_bayes": (False, False, True)}

            for model_run in model_runs.keys():
                clf = yml.fit_model(train_features, train_labels, svm_clf = model_runs[model_run][0], 
                                RandomForest = model_runs[model_run][1], 
                                    nb = model_runs[model_run][2])
                threshold = 0.7
                error = yml.test_user_set(test_set, clf, restaurant_df, user_df, comb_features, 
                                          threshold, lsi, topics, dictionary, delta_vect)
                test_results[str((feature, model_run))] = (yml.get_log_loss(error), 
                                                yml.get_accuracy_score(error), 
                                                yml.get_precision_score(error))
                
    string_keys_dict[str(users.keys()[j])] = test_results
            
with open('test_results.json', 'wb') as fp:
    json.dump(string_keys_dict, fp)

IndentationError: expected an indented block (<ipython-input-72-fed646b0b0ef>, line 47)

In [31]:
#########################
#Make a Recommendation
#########################
top_results = []
#Get feature and model combination that yields the highest precision
for key in test_results.keys():
    feat_model = make_tuple(key)
    if not top_results:
        top_results = [(feat_model,test_results[key][2])]
    else:
        if test_results[key][2] > top_results[0][1]:
            top_results.pop()
            top_results = [(feat_model, test_results[key][2])]
feat_result = top_results[0][0][0]
model_result = top_results[0][0][1]

for j in tqdm.tqdm(range(large_user_number, large_user_number+1)):
    user_df = yml.make_user_df(users[users.keys()[j]])
    business_ids = list(set(user_df['biz_id']))

    #Create a list of training reviews and training ratings
    for rest_id in business_ids:
        train_reviews.append((user_df[user_df['biz_id'] == rest_id]['review_text'].iloc[0],
                                 user_df[user_df['biz_id'] == rest_id]['rating'].iloc[0]))

    #Create an even sample s.t. len(positive_reviews) = len(negative_reviews)
    sample_size = min(len([x[1] for x in train_reviews if x[1] < 4]),
                          len([x[1] for x in train_reviews if x[1] >= 4]))
    
    bad_reviews = [x for x in train_reviews if x[1] < 4]
    good_reviews = [x for x in train_reviews if x[1] >= 4]
    
    train_labels = []
    sub_train_reviews = []
    for L in range(0, int(float(sample_size)/float(2))):
        sub_train_reviews.append(bad_reviews[L][0])
        sub_train_reviews.append(good_reviews[L][0])
        train_labels.append(bad_reviews[L][1])
        train_labels.append(good_reviews[L][1])
        
    #Make the train labels binary
    train_labels = [1 if x >=4 else 0 for x in train_labels]
    
    #Fit LSI model and return number of LSI topics
    lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)

    #Make a FeatureUnion object with the desired features then fit to train reviews
    feature_selection = {"sent_tf":(True, True, False), 
                         "sent": (True,False,False),
                         "tf_lda": (False,True,True), 
                         "all": (True, True, True)}
    top_feature = feature_selection['all']
    
    comb_features = yml.make_featureunion(sent_percent=top_feature[0], 
                                          tf = top_feature[1], 
                                          lda = top_feature[2])
        
    comb_features.fit(sub_train_reviews)
    train_features = comb_features.transform(sub_train_reviews)
    train_lsi = yml.get_lsi_features(sub_train_reviews, lsi, topics, dictionary)
    train_features = sparse.hstack((train_features, train_lsi))
    train_features = train_features.todense()

    #Fit LSI model and return number of LSI topics
    lsi, topics, dictionary = yml.fit_lsi(sub_train_reviews)
        
    #Get the top performing model and fit using that model
    model_runs = {"svm": (True, False, False),
                  "rf": (False, True, False), 
                  "naive_bayes": (False, False, True)}
    top_model = model_runs['svm']
    clf = yml.fit_model(train_features, train_labels, svm_clf = top_model[0], 
                RandomForest = top_model[1], 
                    nb = top_model[2])

    threshold = 0.7
    user_results = yml.make_rec(dc_reviews, clf, threshold, comb_features, 
                                lsi, topics, dictionary)

  feature_idx = vocabulary[feature]
100%|██████████| 1/1 [00:34<00:00, 34.10s/it]


In [32]:
################################################################
#Collect the results into a list of tuples, then select the top
#5 most confident good recs and top 5 most confident bad recs
################################################################

tuple_results = []
for result in user_results:
    tuple_results.append((result[1], result[2], result[3]))
    
#Sort the list of tuples by predicition confidence
tuple_results = sorted(tuple_results, key=lambda tup: tup[1])
top_5 = tuple_results[-5:]
bottom_5 = tuple_results[0:5]

In [33]:
#Show the top 5 restaurants
top_5

[(u'/biz/kingbird-washington', 1.0, 1),
 (u'/biz/jojo-restaurant-and-bar-washington', 1.0, 1),
 (u'/biz/al-volo-washington', 1.0, 1),
 (u'/biz/timber-pizza-company-washington', 1.0, 1),
 (u'/biz/little-sesame-washington', 1.0, 1)]

In [34]:
#Show the bottom 5 restaurants
bottom_5

[(u'/biz/pier-2934-cajun-seafood-washington', 0.5, 0),
 (u'/biz/momofuku-ccdc-washington', 0.55000000000000004, 0),
 (u'/biz/laliguras-indian-and-nepali-bistro-washington-2',
  0.55000000000000004,
  0),
 (u'/biz/ben-tre-washington', 0.55000000000000004, 0),
 (u'/biz/dumplings-and-beyond-washington', 0.55000000000000004, 0)]

In [35]:
#user_df contains all of the user's reviews, with one column of restaurants,
#one column of the user's ratings, and one column with the user's reviews
user_df.head()

Unnamed: 0,biz_id,rating,review_text
0,7ReFxabYRuDBmx9Bdx7VMA,4,Ok I know it was a fast food but they have exc...
1,430DW6yItFj3iB710i1a8A,5,YUM YUM YUM YUM Birthday cake is my favorite ...
2,7cIsnVpbiIpVkXZCMJyAtg,4,Fancy inside lounge area with a tv kids area ...
3,L9boPSsWE93vkRHR4ti2Fw,4,My brother suggested for us to try to this pla...
4,HH26SnOm2Ab7UfkNhQU66A,2,I swear the pizza was better before It was...


In [45]:
#View the top words in the LDA representation
no_top_words = 10
tf_feature_names = vectorizer.get_feature_names()
display_topics(lda_fit, tf_feature_names, no_top_words)

NameError: name 'vectorizer' is not defined

# Collapsed Gibbs Sampling

1. Go through each document, and randomly assign each word in the document to one of the K topics.
2. For each document d, go through each word w in d, and for each topic t:
    1. Compute P(topic t | document d) = the proportion of words in document d that are currently assigned to topic t, and 2) p(word w | topic t) = the proportion of assignments to topic t over all documents that come from this word w. Reassign w a new topic, where you choose topic t with probability p(topic t | document d) * p(word w | topic t) (according to our generative model, this is essentially the probability that topic t generated word w, so it makes sense that we resample the current word's topic with this probability). (Also, I'm glossing over a couple of things here, such as the use of priors/pseudocounts in these probabilities.)
........In other words, in this step, we're assuming that all topic assignments except for the current word in question are correct, and then updating the assignment of the current word using our model of how documents are generated.
After repeating the previous step a large number of times, you'll eventually reach a roughly steady state where your assignments are pretty good. So use these assignments to estimate the topic mixtures of each document (by counting the proportion of words assigned to each topic within that document) and the words associated to each topic (by counting the proportion of words assigned to each topic overall).