In [None]:
"""features: 
- average sentence length (in words)
- average review length (in words)
- average review length (in sentences)
- paragraph rate
- bulleted or numbered list rate
- all caps, bad punctuation, run on sentences?
- bag of words: common words in elite vs. not elite; fp, fn, etc. 
"""

In [296]:
import string, re
import math
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from spacy.en import English, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_pandas

In [2]:
nlp = English()

In [767]:
d = nltk.Text(b)

In [467]:
reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [87]:
stop = STOPWORDS
punct = {p for p in string.punctuation}

### get descriptive features of review text

In [258]:
def get_num_words(text):
    """Get number of words per review."""
    return float(len(text.split()))

In [257]:
# no spacy
def get_num_sents(text):
    """Get number of sentences per review."""
    # add 1 at the end for last punctuation 
    return text.count('. ') + text.count('! ') + text.count('? ') + text.count(') ') + \
            text.count('.\n') + text.count('!\n') + text.count('?\n') + text.count(')\n') + 1.0

In [259]:
def get_num_para(text):
    """Get number of paragraphs per review."""
    return text.count('\n\n') + 1.0

In [929]:
def mentions_price(text):
    """Check if review mentions price ($). Return 1 if yes, 0 if no."""
    return 1 if '$' in text else 0

In [977]:
def get_allcaps(text):
    """Get number of all uppercase words in review."""
    text = re.sub("[^a-zA-Z]", " ", text)
    return len([word for word in text.split() if word.isupper() and len(word) > 2])

In [941]:
def get_exclamations(text):
    """Get number of exclamation marks in review."""
    return text.count('!')

In [153]:
def get_num_chars(text):
    return float(len([char for char in text if char != ' ' and char not in punct]))

In [81]:
reviews.columns

Index([         u'user_id',        u'review_id',             u'text',
             u'votes.cool',      u'business_id',      u'votes.funny',
                  u'stars',             u'date',             u'type',
           u'votes.useful',  u'review_len_wrds',  u'review_len_sent',
       u'avg_wrds_in_sent',         u'num_para',   u'mentions_price',
            u'num_allcaps', u'num_exclamations',           u'tokens',
               u'is_elite'],
      dtype='object')

In [79]:
t = reviews.loc[0, 'text']

In [80]:
t

u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [None]:
# bulleted or numbered list 

In [742]:
def decode(text):
    try:
        return text.decode('utf8')
    except:
        return text

In [866]:
# with spacy
def get_num_sents_spacy(text):
    try:
        return len([sent for sent in nlp(text).sents])
    except:
        return

### get tokens -- *not currently implemented*

In [476]:
def get_clean_tokens(text):
    #letters_only = re.sub("[^a-zA-Z]", " ", text) 
    union = punct.union(stop)
    #spacing = {'', ' ', '\n', '\n\n'}
    tokens = [token.lemma_ for token in nlp(text.decode('utf8'))]
    filtered = [token for token in tokens if token not in union]
    while "" in filtered:
        filtered.remove("")
    while " " in filtered:
        filtered.remove(" ")
    while "\n" in filtered:
        filtered.remove("\n")
    while "\n\n" in filtered:
        filtered.remove("\n\n")
    return ' '.join(filtered)

In [613]:
# version without utf decoding
# def get_clean_tokens2(text):  
#     letters_only = re.sub("[^a-zA-Z]", " ", text) 
#     words = ' '.join(letters_only.lower().split())
#     tokens = [token.lemma_ for token in nlp(words)]
#     filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
#     return ' '.join(filtered)

In [1014]:
# uses utf decoding
def get_clean_tokens2(text):  
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    words = ' '.join(letters_only.lower().split())
    tokens = [token.lemma_ for token in nlp(words)]
    filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
    return ' '.join(filtered)

In [1032]:
def tokenize2(df):
    return map(get_clean_tokens2, df.text.values)  

In [1067]:
def tokenize4(df):
    tokens = []
    for i in tqdm(range(len(df.text.values))):
        tokens.append(get_clean_tokens2(df.text.values[i]))
    return tokens

In [1071]:
clntkns = tokenize4(reviews)



In [1075]:
with open('pickled/tokens.pkl', 'w') as picklefile:
    pickle.dump(clntkns, picklefile)

### parse reviews df

In [279]:
### OLD
def get_features_old(df):
    # get number of words in single review
    df.loc[:,'review_len_wrds'] = df.loc[:,'text'].apply(lambda x: len(cleantext(x)))
    
    # get number of sentences in single review
    df.loc[:,'review_len_sent'] = df.loc[:,'text'].apply(
        lambda x: len([sent for sent in nlp(x.decode('utf8')).sents])) # better way?
    
    # get average number of words per sentence 
    df.loc[:,'avg_wrds_in_sent'] = df.loc[:,'review_len_wrds'] / df.loc[:,'review_len_sent']
    
    # get cleaned tokens for bag of words
    df.loc[:,'clean_tkns'] = df.loc[:, 'text'].apply(lambda x: get_clean_tokens(x))

In [969]:
def get_features(df):
#     decode
    df.loc[:, 'text'] = df.loc[:, 'text'].apply(decode)

    # get number of words in single review
    df.loc[:,'review_len_wrds'] = df.loc[:,'text'].apply(get_num_words)

    #get number of sentences in single review
    df.loc[:,'review_len_sent'] = df.loc[:,'text'].apply(get_num_sents)

    # get average number of words per sentence 
    df.loc[:,'avg_wrds_in_sent'] = df.loc[:,'review_len_wrds'] / df.loc[:,'review_len_sent']
    
    # get number of paragraphs
    df.loc[:, 'num_para'] = df.loc[:, 'text'].apply(get_num_para)
    
    # check if price is mentioned
    df.loc[:, 'mentions_price'] = df.loc[:, 'text'].apply(mentions_price)
    
    # get number of all caps words
    df.loc[:, 'num_allcaps'] = df.loc[:, 'text'].apply(get_allcaps)
    
    # get number of exclamation marks
    df.loc[:, 'num_exclamations'] = df.loc[:, 'text'].apply(get_exclamations)

#     get cleaned tokens for bag of words
#     %time df.loc[:,'clean_tkns'] = df.loc[:, 'text'].apply(lambda x: get_clean_tokens2(x))

In [190]:
# additional stuff - add to previous cell when cleaning
def get_more_features(df):
    df.loc[:,'num_chars'] = df.loc[:,'text'].apply(get_num_chars)
    df.loc[:,'ari_score'] = df.apply(
        lambda row: 4.71 * (row.num_chars/float(row.review_len_wrds)) \
        + 0.5 * (row.review_len_wrds/float(row.review_len_sent)) - 21.43, 
        axis = 1)
    
"""
ARI (automatic readability index) score:
where characters is the number of letters and numbers, words is the number of spaces, 
and sentences is the number of sentences. 
"""

'\nARI (automatic readability index) score:\nwhere characters is the number of letters and numbers, words is the number of spaces, \nand sentences is the number of sentences. \n'

In [255]:
reviews.columns

Index([         u'user_id',        u'review_id',             u'text',
             u'votes.cool',      u'business_id',      u'votes.funny',
                  u'stars',             u'date',             u'type',
           u'votes.useful',  u'review_len_wrds',  u'review_len_sent',
       u'avg_wrds_in_sent',         u'num_para',   u'mentions_price',
            u'num_allcaps', u'num_exclamations',           u'tokens',
               u'is_elite',        u'num_chars',        u'ari_score'],
      dtype='object')

In [282]:
def get_chars_per_word(df):
    df['avg_chars_per_word'] = df.loc[:,'num_chars'] / df.loc[:,'review_len_wrds']

In [263]:
get_chars_per_word(reviews)
get_features(reviews)
get_more_features(reviews)

In [288]:
byuser = reviews.groupby('user_id')

In [210]:
user_ari = byuser.mean().loc[:,'ari_score']

In [289]:
user_wrd_length = byuser.mean().loc[:,'avg_chars_per_word']

In [1003]:
user_avgs = byuser.mean().loc[:, 'review_len_wrds':]

In [1009]:
# pickle user avgs data
with open('pickled/user_avgs.pkl', 'w') as picklefile:
    pickle.dump(user_avgs, picklefile)

In [214]:
# pickle user avg ari data
with open('pickled/user_ari.pkl', 'w') as picklefile:
    pickle.dump(user_ari, picklefile)

In [292]:
# pickle user avg word length data
with open('pickled/user_wrdlength.pkl', 'w') as picklefile:
    pickle.dump(user_wrd_length, picklefile)

In [1077]:
reviews['tokens'] = clntkns

In [293]:
with open('pickled/reviewsdf2.pkl', 'w') as picklefile:
    pickle.dump(reviews, picklefile)

In [3]:
with open('pickled/reviewsdf2.pkl', 'r') as picklefile:
    reviews = pickle.load(picklefile)

### bag of words, etc. workspace 

In [21]:
with open('pickled/users_elite.pkl', 'r') as picklefile:
    userids = pickle.load(picklefile)
    
userids.set_index('user_id', inplace = True)
reviews['is_elite'] = reviews.user_id.apply(lambda x: userids.loc[x, 'is_elite'])

split into train and test

In [294]:
train, test = train_test_split(reviews, test_size = .25)

In [66]:
elite_reviews = train[train.is_elite == 1]
nonelite_reviews = train[train.is_elite == 0]

define word vector parameters

In [None]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None, \
                             max_features = 500) 

get most popular tokens for elite users

In [67]:
elite_features = vectorizer.fit_transform(elite_reviews.tokens)
elite_words = vectorizer.get_feature_names()
elite_features = elite_features.toarray()
elite_dist = np.sum(elite_features, axis = 0)

In [68]:
elite_sorted = sorted(zip(elite_words, elite_dist), key = lambda x: x[1], reverse = True)
elite_wrds_dict = dict(elite_sorted)

In [None]:
onlyelite_words = {
    word : elite_wrds_dict[word] 
    for word in elite_wrds_dict 
    if word not in nonelite_wrds_dict}

In [73]:
# top 20 words used by elites but not non-elites
sorted(onlyelite_words.items(), key = lambda x: x[1], reverse = True)[:50]

[(u'locate', 19308),
 (u'space', 18632),
 (u'crispy', 16606),
 (u'butter', 15524),
 (u'le', 15204),
 (u'pepper', 14970),
 (u'grab', 14605),
 (u'dance', 14596),
 (u'mushroom', 14517),
 (u'brunch', 14501),
 (u'sound', 14203),
 (u'tender', 13972),
 (u'standard', 13696),
 (u'toast', 13672),
 (u'mall', 13671),
 (u'note', 13661),
 (u'soft', 13526),
 (u'dip', 13510),
 (u'black', 13384),
 (u'center', 13354),
 (u'et', 13208),
 (u'event', 13173),
 (u'ton', 13114),
 (u'salmon', 12883),
 (u'seafood', 12819),
 (u'cute', 12809),
 (u'cafe', 12762),
 (u'interesting', 12732),
 (u'flavorful', 12620),
 (u'section', 12471),
 (u'sausage', 12351),
 (u'fruit', 12172),
 (u'spice', 12105),
 (u'idea', 12065),
 (u'corn', 11999),
 (u'sort', 11893),
 (u'hang', 11877),
 (u'wrap', 11833),
 (u'pour', 11769),
 (u'rock', 11742),
 (u'mac', 11733),
 (u'middle', 11701),
 (u'station', 11647),
 (u'unique', 11622),
 (u'compare', 11583),
 (u'crust', 11514),
 (u'joint', 11456),
 (u'sample', 11443),
 (u'ticket', 11428),
 (u'pie

get popular tokens for non-elite users

In [70]:
nonelite_features = vectorizer.fit_transform(nonelite_reviews.tokens)
nonelite_words = vectorizer.get_feature_names()
nonelite_features = nonelite_features.toarray()
nonelite_dist = np.sum(nonelite_features, axis = 0)
nonelite_sorted = sorted(zip(nonelite_words, nonelite_dist), key = lambda x: x[1], reverse = True)

In [71]:
nonelite_wrds_dict = dict(nonelite_sorted)

In [None]:
onlynonelite_words = {
    word : nonelite_wrds_dict[word] 
    for word in nonelite_wrds_dict 
    if word not in elite_wrds_dict}

In [74]:
# top 20 words used by elites but not non-elites
sorted(onlynonelite_words.items(), key = lambda x: x[1], reverse = True)[:50]

[(u'manager', 66142),
 (u'hair', 48765),
 (u'phone', 42729),
 (u'company', 41832),
 (u'receive', 39751),
 (u'nail', 39224),
 (u'thank', 39026),
 (u'rude', 36485),
 (u'fix', 36223),
 (u'professional', 35974),
 (u'horrible', 33552),
 (u'appointment', 32534),
 (u'office', 31827),
 (u'dr', 31712),
 (u'min', 31441),
 (u'speak', 30116),
 (u'explain', 29773),
 (u'thanks', 29761),
 (u'twice', 28396),
 (u'terrible', 28270),
 (u'send', 27595),
 (u'question', 27520),
 (u'understand', 27486),
 (u'purchase', 27432),
 (u'salon', 26985),
 (u'disappointed', 26105),
 (u'completely', 26035),
 (u'die', 25317),
 (u'massage', 25159),
 (u'provide', 24999),
 (u'greet', 24300),
 (u'break', 24279),
 (u'hope', 24220),
 (u'save', 23840),
 (u'complaint', 23475),
 (u'product', 23364),
 (u'boyfriend', 23210),
 (u'desk', 23078),
 (u'welcome', 22699),
 (u'deliver', 22603),
 (u'state', 22580),
 (u'dollar', 22557),
 (u'woman', 22533),
 (u'answer', 22099),
 (u'guest', 22045),
 (u'poor', 21945),
 (u'number', 21740),
 (u'

#### some other stuff

In [295]:
# test model?
features = vectorizer.fit_transform(train.tokens)
words = vectorizer.get_feature_names()
features = features.toarray()

In [None]:
reviews.head()

In [7]:
# get counts for each word in corpus 
dist = np.sum(features, axis = 0)
sorted(zip(words, dist), key = lambda x: x[1], reverse = True)[:100]

[(u'place', 1347563),
 (u'good', 1237040),
 (u'food', 1206517),
 (u'great', 1039474),
 (u'time', 987742),
 (u'like', 977408),
 (u'come', 834873),
 (u'service', 815935),
 (u'order', 767823),
 (u'try', 597247),
 (u'love', 531152),
 (u've', 472226),
 (u'nice', 469437),
 (u'look', 456345),
 (u'want', 445065),
 (u'restaurant', 430650),
 (u'price', 426258),
 (u'eat', 421034),
 (u'best', 420215),
 (u'know', 397865),
 (u'think', 397360),
 (u'wait', 392371),
 (u'little', 368174),
 (u'drink', 354747),
 (u'staff', 342886),
 (u'people', 341297),
 (u'day', 340781),
 (u'ask', 331337),
 (u'thing', 330179),
 (u'friendly', 329725),
 (u'pretty', 315407),
 (u'room', 313836),
 (u'menu', 311748),
 (u'experience', 311092),
 (u'need', 308685),
 (u'tell', 308509),
 (u'chicken', 308354),
 (u'work', 308325),
 (u'night', 298637),
 (u'bar', 297164),
 (u'table', 296750),
 (u'definitely', 296024),
 (u'way', 289305),
 (u'better', 285811),
 (u'feel', 285268),
 (u'bad', 284377),
 (u'delicious', 273730),
 (u'star', 269

In [297]:
model = GaussianNB().fit(features, train.is_elite)

In [302]:
test.tokens

1683421    use noble cleaner rental clean carpet home ext...
110536     phoenix night business recently hilton mesa gr...
1020063    people ask like wendy burger mcdonalds instead...
930773     service happy hour bar area service actual res...
1201759    fresh produce rarely fresh bough box triple wa...
1365051    authentic chai edinburgh little place gorgeous...
2071758    impressed selection ve stay store hour flavor ...
2095507    look high quality abd healthy food protein sou...
692971     come branch longer veggie burger people u thei...
1865827    suppose sexy cirque du soleil bare boob buff d...
2169360    absolutely delicious food quick friendly servi...
1468432    try place fish disappoint people hostess stand...
644146     twice sunday brunch wedding anniversary time e...
909549     start pride merlot best merlot drink progress ...
460045     attorney isn t cheap definitely best result mo...
658829     picture coca cola polar bear receive coupon fr...
1235821    yesterday tim

In [301]:
test_features = vectorizer.fit_transform(test.tokens)

AttributeError: 'numpy.float64' object has no attribute 'lower'

In [None]:
test_features = test_features.toarray()
test_pred = model.predict(test_features)
print accuracy_score(train.is_elite, pred)
print precision_score(train.is_elite, pred)
print recall_score(train.is_elite, pred)

In [298]:
pred = model.predict(features)

In [300]:
print accuracy_score(train.is_elite, pred)
print precision_score(train.is_elite, pred)
print recall_score(train.is_elite, pred)

0.727847081121
0.448259404194
0.485176924527


In [252]:
pred = model.predict(features)
print 'acc:', accuracy_score(test.is_elite, pred)
print 'acc:', precision_score(test.is_elite, pred, average = None)
print 'acc:', recall_score(test.is_elite, pred, average = None)
print 'acc:', f1_score(test.is_elite, pred, average = None)

acc: 1.0
acc: [ 1.  1.]
acc: [ 1.  1.]
acc: [ 1.  1.]
