## Problem Statement
Understanding the category of a business listed on Yelp considering the review text

In [1]:
import pandas as pd
import numpy as np
import graphlab as gl
import string
from datetime import datetime
from operator import itemgetter
from collections import Counter
from functools import partial
import ast

import gensim
from gensim import corpora

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()



In [2]:
main_path = "D:\Grad Notes\Search\Project\Data\yelp_dataset\dataset\\"

In [3]:
## Stopword list 1
stop = stopwords.words('english')
len(stop)

## Stop list 2
f = open(main_path+'stopwords.txt','r')
stop2 = f.readlines()[0].split("\r")

stop_all = stop + stop2
stop_all = set(stop_all)

In [4]:
## Read business dataset

df_bus = pd.read_csv(main_path+"business.csv")
df_bus.head()

Unnamed: 0,attributes.Ambience.divey,attributes.RestaurantsDelivery,attributes.DogsAllowed,postal_code,hours.Thursday,attributes.HairSpecializesIn.coloring,attributes.BestNights.sunday,attributes.BYOB,attributes.AgesAllowed,attributes.Music.video,...,attributes.Caters,attributes.RestaurantsReservations,attributes.DietaryRestrictions.dairy-free,attributes.DietaryRestrictions.vegan,attributes.Ambience.romantic,attributes.Music.jukebox,attributes.Ambience.upscale,attributes.RestaurantsTakeOut,attributes.BikeParking,attributes.OutdoorSeating
0,,,,44143,10:00-21:00,,,,,,...,,,,,,,,,True,
1,False,True,,28215,10:00-22:00,,,,,,...,,False,,,False,,False,True,,False
2,,,,M4K 1N7,10:00-19:00,,,,,,...,,,,,,,,,True,False
3,,,,85258,9:00-17:00,,,,,,...,,,,,,,,,,
4,False,False,,85016,,,,,,,...,True,False,,,False,,False,True,True,False


## Preparing the data

Since data is concentrated around few geographical regions, we decided to pick one city Las Vegas, Nevada as the dataset for our analysis

In [6]:
# ## Get location info
# df_bus_loc = df_bus[['business_id','postal_code','latitude','longitude']]
# df_bus_loc.to_csv(main_path+'business_loc.csv')

In [7]:
#For Las Vegas

df_bus_las = df_bus[(df_bus['latitude'] > 36.133583) & (df_bus['latitude'] < 36.383558) & \
                    (df_bus['longitude'] > -115.418743) & (df_bus['longitude'] < -115.059459)]
df_bus_las.shape

df_las_sub = df_bus_las[['business_id','categories']]
df_las_sub.drop_duplicates(inplace=True)
df_las_sub.size

las_id = df_bus_las['business_id'].tolist()
print len(las_id)

11956


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)


## Working with the Las Vegas dataset

In [8]:
## Filtering businesses with the most common category tags

df_las_sub = df_las_sub[df_las_sub['categories']!='[]']

cat_count = Counter(df_las_sub['categories'])
cat_count.most_common(10)

[("[u'Restaurants', u'Mexican']", 99),
 ("[u'Mexican', u'Restaurants']", 84),
 ("[u'Beauty & Spas', u'Nail Salons']", 58),
 ("[u'Pizza', u'Restaurants']", 58),
 ("[u'Restaurants', u'Pizza']", 57),
 ("[u'Nail Salons', u'Beauty & Spas']", 55),
 ("[u'Automotive', u'Auto Repair']", 47),
 ("[u'Financial Services', u'Banks & Credit Unions']", 44),
 ("[u'Food', u'Coffee & Tea']", 43),
 ("[u'Hair Salons', u'Beauty & Spas']", 43)]

In [9]:
## Obtain the top 100 categories
cat_list = cat_count.most_common(100)
cat_list = [i[0] for i in cat_list]
cat_list[:5]

["[u'Restaurants', u'Mexican']",
 "[u'Mexican', u'Restaurants']",
 "[u'Beauty & Spas', u'Nail Salons']",
 "[u'Pizza', u'Restaurants']",
 "[u'Restaurants', u'Pizza']"]

In [10]:
## Get the user reviews dataset

df_rev = pd.read_csv(main_path+"review.csv")
df_rev.shape

(4736897, 9)

In [12]:
df_rev.head(3)

Unnamed: 0,funny,user_id,review_id,text,business_id,stars,date,useful,cool
0,0,cjpdDjZyprfyDG3RlkVG3w,VfBHSwC5Vz_pbFluy07i9Q,My girlfriend and I stayed here for 3 nights a...,uYHaNptLzDLoV_JZ_MuzUA,5,2016-07-12,0,0
1,0,bjTcT8Ty4cJZhEOEo01FGA,3zRpneRKDsOPq92tq7ybAA,If you need an inexpensive place to stay for a...,uYHaNptLzDLoV_JZ_MuzUA,3,2016-10-02,0,0
2,0,AXgRULmWcME7J6Ix3I--ww,ne5WhI1jUFOcRn-b-gAzHA,Mittlerweile gibt es in Edinburgh zwei Ableger...,uYHaNptLzDLoV_JZ_MuzUA,3,2015-09-17,0,0


In [13]:
## Get reviews specific to Las Vegas

df_rev_las = df_rev[df_rev['business_id'].isin(las_id)]
df_rev_las.head(3)

Unnamed: 0,funny,user_id,review_id,text,business_id,stars,date,useful,cool
482,0,jgzD7eBwZrasqy6wUy122w,BLIJFaJZ-_fOcBs16fL_6g,"Loved, Loved, Loved. It is a simple place, but...",OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-04-01,0,0
483,0,2v_meK453YAWXz4NjJ9abA,VuKbGklNbOESJSx76_EjyA,Is a small restaurant food is good! Also the o...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2016-11-05,0,0
484,0,cdFWtOgA1PAkNYkiwzUJbQ,HBaAmcS9zp5rY1qiMuWygA,Best Mexican restaurant in Vegas. Meat is supe...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-06-01,1,0


In [14]:
df_rev_las.shape

(458310, 9)

In [15]:
df_rev_las = df_rev_las.merge(df_las_sub, on='business_id', how = 'inner')
df_rev_las.shape

(457998, 10)

In [16]:
## Filter reviews belonging to 100 most frequent categories

df_rev_new = df_rev_las[df_rev_las['categories'].isin(cat_list)]
df_rev_new.shape

(57189, 10)

## Feature generation from text data

In [17]:
## Function to preprocess review text

def clean_text(rtext):
    stop_free = " ".join([i for i in rtext.lower().split() if i not in stop_all])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    return punc_free

In [19]:
df_rev_new['cleaned_text'] = df_rev_new['text'].apply(clean_text)
df_rev_new.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,funny,user_id,review_id,text,business_id,stars,date,useful,cool,categories,cleaned_text
0,0,jgzD7eBwZrasqy6wUy122w,BLIJFaJZ-_fOcBs16fL_6g,"Loved, Loved, Loved. It is a simple place, but...",OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-04-01,0,0,"[u'Restaurants', u'Mexican']",loved loved loved simple place dont place deco...
1,0,2v_meK453YAWXz4NjJ9abA,VuKbGklNbOESJSx76_EjyA,Is a small restaurant food is good! Also the o...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2016-11-05,0,0,"[u'Restaurants', u'Mexican']",small restaurant food good owners friendly mak...
2,0,cdFWtOgA1PAkNYkiwzUJbQ,HBaAmcS9zp5rY1qiMuWygA,Best Mexican restaurant in Vegas. Meat is supe...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-06-01,1,0,"[u'Restaurants', u'Mexican']",mexican restaurant vegas meat super soft tasty...


## Noun Tags
Restricting to Nouns to generate the topics in the text

In [20]:
## Extracting noun tags using graphlab package

n3 = datetime.now()

sf_rev = gl.SFrame(df_rev_new[['review_id','cleaned_text']])

sf_pos = gl.SFrame()
sf_pos['nouns_tags'] = gl.text_analytics.extract_parts_of_speech(sf_rev['cleaned_text'],chosen_pos=[gl.text_analytics.PartOfSpeech.NOUN])

print "POS Tagging runtime: ",datetime.now()-n3
sf_pos

This non-commercial license of GraphLab Create for academic use is assigned to bchalasa@iu.edu and will expire on November 26, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\BHARGA~1\AppData\Local\Temp\graphlab_server_1513109858.log.0


POS Tagging runtime:  0:04:12.357000


nouns_tags
"{'NOUN': {'customer': 1L, 'rating': 1L, 'decor': ..."
"{'NOUN': {'food': 1L, 'owners': 1L, ..."
"{'NOUN': {'yummy': 1L, 'meat': 1L, 'service': ..."
"{'NOUN': {'picture': 1L, 'burrito': 1L, ..."
"{'NOUN': {'shop': 1L, 'cheese': 2L, 'figure': ..."
"{'NOUN': {'service': 1L, 'business': 1L, ..."
"{'NOUN': {'cheese': 1L, 'terms': 1L, 'lot': 1L, ..."
"{'NOUN': {'shop': 2L, 'pueblo': 1L, 'line': ..."
"{'NOUN': {'daughter': 1L, 'service': 1L, 'food': ..."
"{'NOUN': {'week': 1L, 'asada': 1L, 'tacos': ..."


In [21]:
df_rev_new['noun_tags'] = list(sf_pos['nouns_tags'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [22]:
def lemmatize_word(word):
    return lemma.lemmatize(word.decode('utf-8'))
    
def format_tags(tags):
    nlist = tags['NOUN'].keys()
    lem_list = list(set([i for i in nlist]))
    
    return lem_list

In [23]:
df_rev_new['noun_tags'] = df_rev_new['noun_tags'].apply(format_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [24]:
df_rev_new.head(3)

Unnamed: 0,funny,user_id,review_id,text,business_id,stars,date,useful,cool,categories,cleaned_text,noun_tags
0,0,jgzD7eBwZrasqy6wUy122w,BLIJFaJZ-_fOcBs16fL_6g,"Loved, Loved, Loved. It is a simple place, but...",OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-04-01,0,0,"[u'Restaurants', u'Mexican']",loved loved loved simple place dont place deco...,"[customer, rating, decor, dicount, food, note,..."
1,0,2v_meK453YAWXz4NjJ9abA,VuKbGklNbOESJSx76_EjyA,Is a small restaurant food is good! Also the o...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2016-11-05,0,0,"[u'Restaurants', u'Mexican']",small restaurant food good owners friendly mak...,"[food, owners, restaurant]"
2,0,cdFWtOgA1PAkNYkiwzUJbQ,HBaAmcS9zp5rY1qiMuWygA,Best Mexican restaurant in Vegas. Meat is supe...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-06-01,1,0,"[u'Restaurants', u'Mexican']",mexican restaurant vegas meat super soft tasty...,"[yummy, meat, service, restaurant, fish, vegas..."


In [91]:
##Store results
df_rev_new.to_csv(main_path+'las_test_pos.csv')

In [26]:
## List of all noun tags in reviews as input to LDA

tagged_revs = df_rev_new['noun_tags'].tolist()
tagged_revs[:3]

[['customer',
  'rating',
  'decor',
  'dicount',
  'food',
  'note',
  'compensation',
  'place'],
 ['food', 'owners', 'restaurant'],
 ['yummy', 'meat', 'service', 'restaurant', 'fish', 'vegas', 'salsa']]

## Train LDA Model

- For topic modeling we used the LDA algorithm available in gensim
- The following parameters were tuned: num_topics, chunksize, update_every, passes

In [27]:
## Run topic modeling over all the reviews
n3 = datetime.now()

tagged_revs = df_rev_new['noun_tags'].tolist()

 # Creating the term dictionary 
dictionary = corpora.Dictionary(tagged_revs)
dictionary.filter_extremes(keep_n=10000)
dictionary.compactify()
len(dictionary)

# Converting into Document Term Matrix
doc_term_matrix = [dictionary.doc2bow(rtxt) for rtxt in tagged_revs]

# LDA model object 
Lda = gensim.models.ldamodel.LdaModel

# Training the LDA model
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary, chunksize=3000, update_every=2, passes=2)

print "LDA Runtime: ",datetime.now() - n3

LDA Runtime:  0:06:20.630000


In [29]:
lda_topics = ldamodel.print_topics(num_topics=5, num_words=10)
lda_topics

[(21,
  u'0.029*"apartment" + 0.025*"management" + 0.024*"office" + 0.020*"maintenance" + 0.016*"place" + 0.016*"rent" + 0.016*"months" + 0.015*"apartments" + 0.015*"staff" + 0.015*"lease"'),
 (33,
  u'0.025*"pizza" + 0.024*"place" + 0.023*"service" + 0.022*"pie" + 0.022*"control" + 0.018*"notch" + 0.017*"pest" + 0.017*"fruit" + 0.016*"dough" + 0.016*"time"'),
 (35,
  u'0.026*"cafe" + 0.025*"deals" + 0.024*"rio" + 0.017*"effort" + 0.016*"traffic" + 0.012*"miss" + 0.012*"spanish" + 0.011*"barbacoa" + 0.011*"section" + 0.010*"preparation"'),
 (23,
  u'0.042*"store" + 0.032*"location" + 0.028*"staff" + 0.023*"service" + 0.020*"area" + 0.013*"place" + 0.013*"stores" + 0.013*"seating" + 0.012*"customer" + 0.011*"convenience"'),
 (20,
  u'0.050*"tacos" + 0.045*"food" + 0.034*"place" + 0.030*"taco" + 0.021*"salsa" + 0.020*"beans" + 0.019*"chips" + 0.019*"meat" + 0.019*"burrito" + 0.016*"rice"')]

In [60]:
## Save the LDA model
ldamodel.save(main_path+'LdaModels\las_top50')

In [28]:
## Load previously generated model

Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda.load(main_path+'LdaModels\las_top50')

In [30]:
## Infer a topic name for thr LDA topics

topic_map =[(0,'service;steak'),(1, 'fast food;mexican'),(2, 'ambience'),(3, 'location'),(4, 'italian'),(5, 'gas station;fast food'),\
            (6, 'arts;entertainment'),(7, 'home services'),(8, 'ice cream'),(9, 'management;service'),(10, 'fast food'),(11, 'arts;entertainment'),\
            (12, 'breakfast;restaurant'),(13, 'ice cream'),(14, 'pizza'), (15, 'service;experience'), (16, 'none'), (17, 'bar;nightlife'), (18, 'hair salon;beauty'), \
            (19, 'ice cream'), (20, 'mexican'), (21, 'apartment;home services'), (22, 'parks'),(23, 'location;service'), (24, 'vet;health;medical'),\
            (25, 'Asian;thai;chinese'),(26, 'car dealer;auto service;car;auto'),(27, 'banks;financial'), (28, 'pizza;italian'),\
            (29, 'none'),(30, 'apartment;management'),(31, 'service;apartments'), (32, 'apartments'),(33, 'pizza;italian'), \
            (34, 'cafe;coffee'),(35, 'cafe;coffee'),(36, 'apartment'),(37, 'service;customer'),(38, 'dentist;health'),(39, 'none'),\
            (40, 'ambience'),(41, 'church'),(42, 'italian'),(43, 'fast food;american'), (44, 'nail salon;beauty'),(45, 'hair salon;beauty'),\
            (46, 'weddings;jewelery;shopping'),(47, 'dinner;food;restaurant'),(48, 'auto service;car;auto'), (49, 'optometrist;health')]

In [31]:
## Create a topic_name - topic_id mapping
tlist = [i[1] for i in topic_map]
lda_topic_map = dict(zip(range(51),tlist))

## METHOD I - Predicting topics from all the concatenated topics of a business

In [33]:
## Predicting topics concatenating all the reviews of a business
def predict_topics(bus_id):
    
    df_sub = df_rev_new[(df_rev_new['business_id'] == bus_id)]
    
    review_list = df_sub['noun_tags'].tolist()
    noun_tags = list(set([j for i in review_list for j in i]))

    new_review_bow = dictionary.doc2bow(noun_tags)
    new_review_lda = ldamodel[new_review_bow]

    return sorted(new_review_lda,key=itemgetter(1), reverse=True)[:5]


In [34]:
## Predict topics for cancatenated nouns of all business reviews

tagged_revs = df_rev_new['noun_tags'].tolist()

 # Creating the term dictionary 
dictionary = corpora.Dictionary(tagged_revs)
dictionary.filter_extremes(keep_n=10000)
dictionary.compactify()

df_topics = df_rev_new[['business_id','categories']]
df_topics.drop_duplicates(inplace=True)
df_topics['predicted_topics'] = df_topics['business_id'].apply(predict_topics)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [35]:
df_topics.head()

Unnamed: 0,business_id,categories,predicted_topics
0,OQcvO5P3gH0cuJ-bPXwfQQ,"[u'Restaurants', u'Mexican']","[(20, 0.20127907292), (8, 0.111738134059), (0,..."
614,VkIKXVbSfcbxybbxbEt98w,"[u'Automotive', u'Car Dealers']","[(7, 0.0841880402612), (26, 0.0797780933151), ..."
820,ckc3l8cSuggloG4NlquEFQ,"[u'Fast Food', u'Restaurants']","[(38, 0.292302679118), (43, 0.145408656484), (..."
856,Du8gPSC88a9OuY1Vk0b52g,"[u'Ice Cream & Frozen Yogurt', u'Food']","[(8, 0.139359371047), (28, 0.125755298859), (2..."
880,y7yo-LAjJnNoYwWX6ZYbqA,"[u'General Dentistry', u'Dentists', u'Health &...","[(24, 0.222691194754), (47, 0.0915215120559), ..."


In [36]:
##df_topics.to_csv(main_path+'topics_top50.csv')

In [37]:
## Assign inferred names to topics

def rename_topics(topic_list):
    return [lda_topic_map[i[0]] for i in topic_list]

In [39]:
df_topics['topic_names'] = df_topics['predicted_topics'].apply(rename_topics)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [40]:
df_topics.head(3)

Unnamed: 0,business_id,categories,predicted_topics,topic_names
0,OQcvO5P3gH0cuJ-bPXwfQQ,"[u'Restaurants', u'Mexican']","[(20, 0.20127907292), (8, 0.111738134059), (0,...","[mexican, ice cream, service;steak, location, ..."
614,VkIKXVbSfcbxybbxbEt98w,"[u'Automotive', u'Car Dealers']","[(7, 0.0841880402612), (26, 0.0797780933151), ...","[home services, car dealer;auto service;car;au..."
820,ckc3l8cSuggloG4NlquEFQ,"[u'Fast Food', u'Restaurants']","[(38, 0.292302679118), (43, 0.145408656484), (...","[dentist;health, fast food;american, service;c..."


In [42]:
## Evaluation
def check_predictions(actual_cats, pred_topics):
    actual_cats = [j.lower() for j in ast.literal_eval(actual_cats)]
    for i in pred_topics[:3]:
        for j in i.split(";"):
            if j.lower() in actual_cats:
                return 'Yes'
        
    return 'No'

In [43]:
## Calculate prediction accuracy

predict_checks = []
actual_categories = df_topics['categories'].tolist()
predicted_topics = df_topics['topic_names'].tolist()

for i in range(df_topics.shape[0]):
    predict_checks.append(check_predictions(actual_categories[i], predicted_topics[i]))

df_topics['pred_check'] = predict_checks

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [44]:
## Accuracy - METHOD I
acc_1 = float(df_topics[df_topics['pred_check']=='Yes'].shape[0])/df_topics.shape[0]
print "Evaluating performance considering concatenated reviews of a business.."
print "Method I accuracy: ",acc_1*100,"%"

Evaluating performance considering concatenated reviews of a business..
Method I accuracy:  33.2897603486 %


## METHOD II - Major topics among all reviews of a business


In [46]:
## Predicting major topics from individual reviews of a business
def predict_review_topics(noun_tags):
    
    new_review_bow = dictionary.doc2bow(noun_tags)
    new_review_lda = ldamodel[new_review_bow]

    return sorted(new_review_lda,key=itemgetter(1), reverse=True)[:5]

In [47]:
df_topics2 = df_rev_new[['business_id','review_id','noun_tags','categories']]
df_topics2['predicted_topics'] = df_topics2['noun_tags'].apply(predict_review_topics)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [48]:
df_topics2.head(3)

Unnamed: 0,business_id,review_id,noun_tags,categories,predicted_topics
0,OQcvO5P3gH0cuJ-bPXwfQQ,BLIJFaJZ-_fOcBs16fL_6g,"[customer, rating, decor, dicount, food, note,...","[u'Restaurants', u'Mexican']","[(37, 0.738488634313), (40, 0.141511365687)]"
1,OQcvO5P3gH0cuJ-bPXwfQQ,VuKbGklNbOESJSx76_EjyA,"[food, owners, restaurant]","[u'Restaurants', u'Mexican']","[(47, 0.484422367176), (42, 0.275577632824)]"
2,OQcvO5P3gH0cuJ-bPXwfQQ,HBaAmcS9zp5rY1qiMuWygA,"[yummy, meat, service, restaurant, fish, vegas...","[u'Restaurants', u'Mexican']","[(20, 0.8775)]"


In [49]:
## Get the major topics per business
def get_major_topics(bus_id):
    
    df_sub = df_topics2[(df_topics2['business_id'] == bus_id)]
    #print(df_sub).shape[0]
    
    get_topic_list = [j[0] for i in df_sub['predicted_topics'].tolist() for j in i]
    topic_frequency = Counter(get_topic_list)
    
    top_5 = topic_frequency.most_common(5)
    
    return top_5

In [50]:
n4 = datetime.now()

df_topics2_sub = df_topics2[['business_id','categories']]
df_topics2_sub['topic_list'] = df_topics2_sub['business_id'].apply(get_major_topics)
df_topics2_sub.head()

print "Runtime: ",datetime.now()-n4

Runtime:  0:08:25.590000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [51]:
df_topics2_sub['topic_names'] = df_topics2_sub['topic_list'].apply(rename_topics)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [52]:
## Predict the topics per each review of the business

predict_checks2 = []
actual_categories2 = df_topics2_sub['categories'].tolist()
predicted_topics2 = df_topics2_sub['topic_names'].tolist()

for i in range(df_topics2_sub.shape[0]):
    predict_checks2.append(check_predictions(actual_categories2[i], predicted_topics2[i]))

df_topics2_sub['pred_check'] = predict_checks2

In [54]:
## Accuracy - Method II
acc_2 = float(df_topics2_sub[df_topics2_sub['pred_check']=='Yes'].shape[0])/df_topics2_sub.shape[0]
print "Evaluating performance considering the major topics of a business reviews"
print "Method II accuracy: ",acc_2*100,"%"

Evaluating performance considering the major topics of a business reviews
Method II accuracy:  51.3892531781 %


## Method III Supervised technique

In [55]:
## Perform 80 20 test split grouped by category
df_rsub = df_rev_new[['business_id','categories']]
df_rsub.drop_duplicates(inplace=True)

In [56]:
df_rsub.head(3)

Unnamed: 0,business_id,categories
0,OQcvO5P3gH0cuJ-bPXwfQQ,"[u'Restaurants', u'Mexican']"
614,VkIKXVbSfcbxybbxbEt98w,"[u'Automotive', u'Car Dealers']"
820,ckc3l8cSuggloG4NlquEFQ,"[u'Fast Food', u'Restaurants']"


In [57]:
df_rsub.shape

(2295, 2)

In [58]:
test_bus_id = []
train_bus_id = []
cat_grps = set(df_rsub['categories'].tolist())

for i in cat_grps:
    dfs1 = df_rsub[df_rsub['categories'] == i]
    #print '-->',dfs1.shape[0]
    blist = dfs1['business_id'].tolist()
    idx = int(len(blist)*0.8)
    
    train_bus_id.append(blist[:idx])
    test_bus_id.append(blist[idx:])

In [59]:
train_bus_id2 = [j for i in train_bus_id for j in i]
len(train_bus_id2)

test_bus_id2 = [j for i in test_bus_id for j in i]
len(test_bus_id2)

503

## Training

In [60]:
df_train = df_rev_new[df_rev_new['business_id'].isin(train_bus_id2)]
df_train.shape

(39433, 12)

In [61]:
df_train.head(3)

Unnamed: 0,funny,user_id,review_id,text,business_id,stars,date,useful,cool,categories,cleaned_text,noun_tags
0,0,jgzD7eBwZrasqy6wUy122w,BLIJFaJZ-_fOcBs16fL_6g,"Loved, Loved, Loved. It is a simple place, but...",OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-04-01,0,0,"[u'Restaurants', u'Mexican']",loved loved loved simple place dont place deco...,"[customer, rating, decor, dicount, food, note,..."
1,0,2v_meK453YAWXz4NjJ9abA,VuKbGklNbOESJSx76_EjyA,Is a small restaurant food is good! Also the o...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2016-11-05,0,0,"[u'Restaurants', u'Mexican']",small restaurant food good owners friendly mak...,"[food, owners, restaurant]"
2,0,cdFWtOgA1PAkNYkiwzUJbQ,HBaAmcS9zp5rY1qiMuWygA,Best Mexican restaurant in Vegas. Meat is supe...,OQcvO5P3gH0cuJ-bPXwfQQ,5,2017-06-01,1,0,"[u'Restaurants', u'Mexican']",mexican restaurant vegas meat super soft tasty...,"[yummy, meat, service, restaurant, fish, vegas..."


In [62]:
df_test = df_rev_new[df_rev_new['business_id'].isin(test_bus_id2)]
df_test.shape

(17756, 12)

In [63]:
## Concatenate reviews of a business to perform Tf-Idf
def concat_revs(data_frame, bus_id):
    df_sub = data_frame[data_frame['business_id']==bus_id]
    cstring = ''
    for i in df_sub['cleaned_text'].tolist():
        cstring = cstring + str(i)
        
    return cstring

In [64]:
#Create a corpus train and test dataset
corpus_train = [concat_revs(df_train, bid) for bid in train_bus_id2]
corpus_test = [concat_revs(df_test, bid2) for bid2 in test_bus_id2]

In [65]:
dfs2 = df_train[['business_id','categories']]
dfs2.drop_duplicates(inplace=True)
trainlabel_dict = dict(zip(dfs2['business_id'].tolist(),dfs2['categories'].tolist()))

dfs3 = df_test[['business_id','categories']]
dfs3.drop_duplicates(inplace=True)
testlabel_dict = dict(zip(dfs3['business_id'].tolist(),dfs3['categories'].tolist()))

In [66]:
dfs2.head(3)

Unnamed: 0,business_id,categories
0,OQcvO5P3gH0cuJ-bPXwfQQ,"[u'Restaurants', u'Mexican']"
614,VkIKXVbSfcbxybbxbEt98w,"[u'Automotive', u'Car Dealers']"
820,ckc3l8cSuggloG4NlquEFQ,"[u'Fast Food', u'Restaurants']"


In [67]:
dfs3.head(3)

Unnamed: 0,business_id,categories
120322,Y92uJXOUOZn8JvZAffV5mA,"[u'American (New)', u'Restaurants']"
163005,WBDT1L6Pc3TlTNB03o_zog,"[u'Dry Cleaning & Laundry', u'Laundry Services..."
163670,jWJQNxyidwtxE9pGMkcPVQ,"[u'Performing Arts', u'Arts & Entertainment']"


In [68]:
labels_train = [ast.literal_eval(trainlabel_dict[bid]) for bid in train_bus_id2]
labels_test = [ast.literal_eval(testlabel_dict[bid2]) for bid2 in test_bus_id2]

print labels_train[:2]
print labels_test[:2]

[[u'Appliances & Repair', u'Local Services'], [u'Appliances & Repair', u'Local Services']]
[[u'Appliances & Repair', u'Local Services'], [u'Appliances & Repair', u'Local Services']]


In [69]:
## Converting labels to sparse arrays to perform multi-class classification

unique_labels = list(set([j for i in labels_train for j in i]))

from sklearn.preprocessing import MultiLabelBinarizer
mlb =  MultiLabelBinarizer(classes = unique_labels)

labels_train = mlb.fit_transform(labels_train)
labels_test = mlb.fit_transform(labels_test)

In [70]:
## Performing grid search to obtain the best parameters for classifiers
def performGridCV(X_train, y_train, clf_current, params):
    model_to_set = OneVsRestClassifier(clf_current)
    grid_search = GridSearchCV(model_to_set, param_grid=params, scoring="f1_weighted")

    print('~' * 100)
    print("Performing grid search on " + str(clf_current).split('(')[0])
    print("Specified parameters:")
    print(params)
    grid_search.fit(X_train, y_train)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters after tuning:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print('~' * 100)

    gs = grid_search.grid_scores_
    ret = [(i[0], i[1]) for i in gs]
    return best_parameters, ret

In [72]:
import warnings

warnings.filterwarnings("ignore")

vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf=True, use_idf=True,stop_words='english')
train_corpus_tf_idf = vectorizer.fit_transform(corpus_train) 
test_corpus_tf_idf = vectorizer.transform(corpus_test)

classlist = [
        (SGDClassifier(),
         {'estimator__penalty': ['l1', 'elasticnet'], "estimator__alpha": [.0001, .001], 'estimator__n_iter': [20, 50]}),
        (LinearSVC(), {'estimator__penalty': ['l1', 'l2'], 'estimator__loss': ['l2'], 'estimator__dual': [False],
                       'estimator__tol': [1e-2, 1e-3]}),
        (MultinomialNB(), {"estimator__alpha": [.01, .1], "estimator__fit_prior": [True, False]})
    ]

for classifier, params_to_optimize in classlist:
    best_params, gs = performGridCV(train_corpus_tf_idf, labels_train, classifier, params_to_optimize)
    print best_params['estimator']
    

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Performing grid search on SGDClassifier
Specified parameters:
{'estimator__alpha': [0.0001, 0.001], 'estimator__penalty': ['l1', 'elasticnet'], 'estimator__n_iter': [20, 50]}
()
Best score: 0.816
Best parameters after tuning:
	estimator__alpha: 0.0001
	estimator__n_iter: 50
	estimator__penalty: 'l1'
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Performing grid search on LinearSVC
Specified parameters:
{'estimator__penalty': ['l1', 'l2'], 'est

In [85]:
model_sgd = OneVsRestClassifier(SGDClassifier(alpha=0.0001, n_iter=50, penalty='l1'))
model_svm = OneVsRestClassifier(LinearSVC(loss='l2',penalty='l1',dual=False,tol=0.001))
model_nb = OneVsRestClassifier(MultinomialNB(alpha=0.01, fit_prior=True))  

model_sgd.fit(train_corpus_tf_idf,labels_train)
model_svm.fit(train_corpus_tf_idf,labels_train)
model_nb.fit(train_corpus_tf_idf,labels_train)

result_sgd = model_sgd.predict(test_corpus_tf_idf)
result_svm = model_svm.predict(test_corpus_tf_idf)
result_nb = model_nb.predict(test_corpus_tf_idf)

## Snapshot of predictions

In [86]:
def format_predictions(result_arr):
    result_arr = result_arr.astype(bool)
    return np.array(unique_labels)[result_arr].tolist()

In [87]:
df_test_sub = pd.DataFrame(test_bus_id2, columns=['business_id'])
df_test_sub.head()

Unnamed: 0,business_id
0,ZmMCgM4RCqCXJ0Lswu6yxw
1,YxHIBPLb9SpDHjweRnUPgQ
2,uRMTX1jiNvKRDO45ULW3jQ
3,yZIA98HUgPYx4BY8iZr3IA
4,jAgNdPqhWBXuz-QOqHEVfQ


In [88]:
## Append predictions to business
format_preds_sgd = [format_predictions(i) for i in result_sgd]
format_preds_svm = [format_predictions(i) for i in result_svm]
format_preds_nb = [format_predictions(i) for i in result_nb]
df_test_sub['pred_sgd'] = format_preds_sgd
df_test_sub['pred_svm'] = format_preds_svm
df_test_sub['pred_nb'] = format_preds_nb

df_test_sub2 = df_test_sub.merge(dfs3, on='business_id', how='inner')
df_test_sub2.shape

(503, 5)

In [89]:
## A snapshot of predictions
df_test_sub2.head()

Unnamed: 0,business_id,pred_sgd,pred_svm,pred_nb,categories
0,ZmMCgM4RCqCXJ0Lswu6yxw,"[Local Services, Appliances & Repair]","[Local Services, Appliances & Repair]","[Home Services, Local Services, Appliances & R...","[u'Appliances & Repair', u'Local Services']"
1,YxHIBPLb9SpDHjweRnUPgQ,[],[],[Automotive],"[u'Appliances & Repair', u'Local Services']"
2,uRMTX1jiNvKRDO45ULW3jQ,"[Local Services, Appliances & Repair]","[Local Services, Appliances & Repair]","[Local Services, Appliances & Repair]","[u'Appliances & Repair', u'Local Services']"
3,yZIA98HUgPYx4BY8iZr3IA,"[Fast Food, Restaurants]","[Fast Food, Restaurants]","[Fast Food, Restaurants, Burgers]","[u'Fast Food', u'Restaurants']"
4,jAgNdPqhWBXuz-QOqHEVfQ,"[Fast Food, Restaurants]","[Fast Food, Restaurants]","[Fast Food, Restaurants]","[u'Fast Food', u'Restaurants']"


## Evaluation

In [90]:
## Accuracy

def get_acc(pred_list):
    correct_val = len([i for i in range(len(labels_test)) if np.all(labels_test[i]==pred_list[i])])
    return float(correct_val)/len(labels_test)*100
    

acc_svm = get_acc(result_svm)
acc_nb = get_acc(result_nb)
acc_sgd = get_acc(result_sgd)

print acc_sgd, acc_svm, acc_nb

83.6978131213 76.9383697813 68.986083499


In [91]:
## F1-Score

f1_sgd = f1_score(labels_test, result_sgd, average='weighted')
f1_svm = f1_score(labels_test, result_svm, average='weighted')
f1_nb = f1_score(labels_test, result_nb, average='weighted')

print f1_sgd, f1_svm, f1_nb

0.936514005011 0.894121442558 0.875760454193


In [92]:
## Average precision

ap_sgd = average_precision_score(labels_test, result_sgd, average='weighted')
ap_svm = average_precision_score(labels_test, result_svm, average='weighted')
ap_nb = average_precision_score(labels_test, result_nb, average='weighted')

print ap_sgd, ap_svm, ap_nb

0.946943771187 0.920008593946 0.896585582224
