In [None]:
"""features: 
- average sentence length (in words)
- average review length (in words)
- average review length (in sentences)
- paragraph rate
- bulleted or numbered list rate
- all caps, bad punctuation, run-on sentences?
- bag of words: common words in elite vs. not elite; fp, fn, etc. 
"""

In [2]:
import string, re
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from spacy.en import English, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_pandas

In [2]:
# Build the spaCy English pipeline once; the sentence and token helpers in
# this notebook call it through the module-level name `nlp`.
nlp = English()

In [767]:
# NOTE(review): `b` is not defined in any visible cell and `d` is not used by
# the visible cells -- looks like leftover scratch work; confirm before
# relying on it.
d = nltk.Text(b)

In [467]:
# Load the review table from CSV; the path is relative to the working directory.
reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [378]:
# Vocabulary discarded during token cleaning: every character of
# string.punctuation (as a set) and spaCy's STOPWORDS collection.
punct = set(string.punctuation)
stop = STOPWORDS

### get descriptive features of review text

In [948]:
def get_num_words(text):
    """Count the whitespace-delimited words in a single review."""
    words = text.split()
    return len(words)

In [949]:
# no spacy
def get_num_sents(text):
    """Estimate how many sentences a review contains.

    Every '.', '!', '?' or ')' that is immediately followed by a space or a
    newline counts as a sentence boundary; one more is added for the final
    sentence, whose closing punctuation has nothing after it.
    """
    boundaries = 0
    for follower in (' ', '\n'):
        for closer in ('.', '!', '?', ')'):
            boundaries += text.count(closer + follower)
    return boundaries + 1

In [904]:
def get_num_para(text):
    """Count paragraphs in a review, splitting on each double newline."""
    chunks = text.split('\n\n')
    return len(chunks)

In [929]:
def mentions_price(text):
    """Flag reviews containing a dollar sign: 1 when '$' appears, otherwise 0."""
    if '$' in text:
        return 1
    return 0

In [977]:
def get_allcaps(text):
    """Count the all-uppercase words in a review.

    Every character that is not an ASCII letter is turned into a space first,
    so a "word" is a run of letters; it is counted when it is entirely
    uppercase and longer than two letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    shouted = 0
    for candidate in letters_only.split():
        if len(candidate) > 2 and candidate.isupper():
            shouted += 1
    return shouted

In [941]:
def get_exclamations(text):
    """Count every '!' character that occurs in a review."""
    mark = '!'
    total = text.count(mark)
    return total

In [None]:
# bulleted or numbered list 

In [742]:
def decode(text):
    """Return `text` decoded as UTF-8, or unchanged when it cannot be decoded.

    Values that have no usable `.decode` method and byte strings that are not
    valid UTF-8 are handed back as-is, so the function can be applied to a
    whole column without raising.
    """
    try:
        return text.decode('utf8')
    except (AttributeError, UnicodeError):
        # AttributeError: the value has no .decode method.
        # UnicodeError: the value could not be decoded as UTF-8.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return text

In [866]:
# with spacy
def get_num_sents_spacy(text):
    """Count the sentences spaCy finds in a review.

    Runs `text` through the module-level `nlp` pipeline and counts the items
    of its `.sents`. Returns None when that raises, so a single unparseable
    review does not abort a column-wide apply.
    """
    try:
        return sum(1 for _ in nlp(text).sents)
    except Exception:
        # Best effort by design, but unlike a bare `except:` this no longer
        # swallows KeyboardInterrupt / SystemExit during a long run.
        return None

### get tokens -- *not currently implemented*

In [476]:
def get_clean_tokens(text):
    """Lemmatize a UTF-8 encoded review and drop noise tokens.

    `text` is decoded from UTF-8 and run through `nlp`; each token's lemma is
    kept unless it is in `punct`, in `stop`, or is one of the blank strings
    (empty, single space, newline, double newline). The surviving lemmas are
    returned joined by single spaces.
    """
    # One combined set, checked once per lemma. This replaces the repeated
    # `while x in filtered: filtered.remove(x)` passes, each of which
    # rescanned the whole list for every removal.
    unwanted = punct.union(stop).union({"", " ", "\n", "\n\n"})
    lemmas = (token.lemma_ for token in nlp(text.decode('utf8')))
    return ' '.join(lemma for lemma in lemmas if lemma not in unwanted)

In [613]:
# version without utf decoding
# def get_clean_tokens2(text):  
#     letters_only = re.sub("[^a-zA-Z]", " ", text) 
#     words = ' '.join(letters_only.lower().split())
#     tokens = [token.lemma_ for token in nlp(words)]
#     filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
#     return ' '.join(filtered)

In [1014]:
# regex-based variant: keeps only ASCII letters and lowercases before lemmatizing (does no utf decoding)
def get_clean_tokens2(text):
    """Reduce a review to a space-joined string of lemmas.

    Everything except ASCII letters is replaced by spaces, the text is
    lowercased and its whitespace collapsed, and the result is run through
    `nlp`. Lemmas found in `stop` and blank lemmas are discarded.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    normalized = ' '.join(letters_only.lower().split())
    blanks = ('', ' ', '\n', '\n\n')
    kept = []
    for token in nlp(normalized):
        lemma = token.lemma_
        if lemma in stop or lemma in blanks:
            continue
        kept.append(lemma)
    return ' '.join(kept)

In [1015]:
def tokenize(df):
    """Add a 'clean_tkns' column to `df` in place (returns None).

    Each value is the get_clean_tokens2 result for that row's 'text'.
    """
    cleaned = df.loc[:, 'text'].apply(get_clean_tokens2)
    df.loc[:, 'clean_tkns'] = cleaned

In [1032]:
def tokenize2(df):
    """Apply get_clean_tokens2 to every value of df.text via the builtin map."""
    texts = df.text.values
    return map(get_clean_tokens2, texts)

In [1038]:
def tokenize3(df):
    """Apply get_clean_tokens2 to every value of df.text with np.vectorize.

    Returns an object-dtype array holding one cleaned token string per row.
    """
    # Without otypes, np.vectorize calls the function an extra time on the
    # first element to infer the output dtype, and for string results copies
    # everything into a fixed-width string array. Declaring object output
    # keeps each result as the Python string the function returned.
    clean_all = np.vectorize(get_clean_tokens2, otypes=[object])
    return clean_all(df.text.values)

In [1067]:
def tokenize4(df):
    """Clean every review text one at a time, showing a tqdm progress bar.

    Returns a list with the get_clean_tokens2 result for each row of
    df.text, in row order.
    """
    n_rows = len(df.text.values)
    return [get_clean_tokens2(df.text.values[idx]) for idx in tqdm(range(n_rows))]

In [1071]:
# One cleaned token string per review, in the row order of `reviews`.
clntkns = tokenize4(reviews)



In [1074]:
# Sanity check: tokenize4 appends one entry per row, so this should equal len(reviews).
len(clntkns)

2225213

In [1075]:
# Cache the cleaned tokens on disk. Pickle data is a byte stream, so the file
# is opened in binary mode; text mode can corrupt it on platforms that
# translate newlines.
with open('pickled/tokens.pkl', 'wb') as picklefile:
    pickle.dump(clntkns, picklefile)

### parse reviews df

In [279]:
### OLD
def get_features_old(df):
    """Superseded feature builder; adds four columns to `df` in place.

    Columns written, in order: 'review_len_wrds', 'review_len_sent',
    'avg_wrds_in_sent' and 'clean_tkns'. Returns None.

    NOTE(review): this calls `cleantext`, which is not defined in the visible
    cells -- confirm it exists before reviving this function.
    """
    def _word_count(raw):
        # length of whatever cleantext returns (presumably a word list -- verify)
        return len(cleantext(raw))

    def _sentence_count(raw):
        # spaCy sentence segmentation on the UTF-8 decoded text
        return len(list(nlp(raw.decode('utf8')).sents))

    # number of words in a single review
    df.loc[:, 'review_len_wrds'] = df.loc[:, 'text'].apply(_word_count)

    # number of sentences in a single review
    df.loc[:, 'review_len_sent'] = df.loc[:, 'text'].apply(_sentence_count)

    # average number of words per sentence
    words_per_review = df.loc[:, 'review_len_wrds']
    sents_per_review = df.loc[:, 'review_len_sent']
    df.loc[:, 'avg_wrds_in_sent'] = words_per_review / sents_per_review

    # cleaned tokens for bag of words
    df.loc[:, 'clean_tkns'] = df.loc[:, 'text'].apply(get_clean_tokens)

In [969]:
def get_features(df):
    """Add the descriptive text features to `df` in place (returns None).

    The 'text' column is first passed through decode. These columns are then
    written, in this order: 'review_len_wrds', 'review_len_sent',
    'avg_wrds_in_sent', 'num_para', 'mentions_price', 'num_allcaps',
    'num_exclamations'.
    """
    # decode the raw text once so every extractor sees the same values
    df.loc[:, 'text'] = df.loc[:, 'text'].apply(decode)

    def _add(column, extractor):
        # write one per-review feature computed from the text column
        df.loc[:, column] = df.loc[:, 'text'].apply(extractor)

    _add('review_len_wrds', get_num_words)   # words per review
    _add('review_len_sent', get_num_sents)   # sentences per review

    # average number of words per sentence
    df.loc[:, 'avg_wrds_in_sent'] = df.loc[:, 'review_len_wrds'] / df.loc[:, 'review_len_sent']

    _add('num_para', get_num_para)                  # paragraphs
    _add('mentions_price', mentions_price)          # 1 if '$' appears, else 0
    _add('num_allcaps', get_allcaps)                # all-caps words
    _add('num_exclamations', get_exclamations)      # exclamation marks

#     cleaned tokens for bag of words are currently not computed here:
#     %time df.loc[:,'clean_tkns'] = df.loc[:, 'text'].apply(lambda x: get_clean_tokens2(x))

In [986]:
# Adds the feature columns to `reviews` in place; the call itself returns None.
get_features(reviews)

In [993]:
# Group the reviews by author so per-review values can be aggregated per user.
byuser = reviews.groupby('user_id')

In [1003]:
# Per-user means, keeping the columns from 'review_len_wrds' through the last
# one (label-based column slice).
user_avgs = byuser.mean().loc[:, 'review_len_wrds':]

In [1009]:
# pickle user avgs data
# (binary mode: pickle data is a byte stream and text mode can corrupt it on
# platforms that translate newlines)
with open('pickled/user_avgs.pkl', 'wb') as picklefile:
    pickle.dump(user_avgs, picklefile)

In [1077]:
# Attach the cleaned token strings as a column; this relies on clntkns being
# in the same row order as `reviews`.
reviews['tokens'] = clntkns

In [None]:
# Cache the review frame on disk. Binary mode is required for pickle data;
# text mode can corrupt it on platforms that translate newlines.
with open('pickled/reviewsdf2.pkl', 'wb') as picklefile:
    pickle.dump(reviews, picklefile)

In [3]:
# Reload the cached review frame. Pickle data is a byte stream, so the file
# is read in binary mode.
# Only unpickle files you created yourself: loading a pickle can run arbitrary code.
with open('pickled/reviewsdf2.pkl', 'rb') as picklefile:
    reviews = pickle.load(picklefile)

### bag of words, etc. workspace -- *not implemented*

In [14]:
# Show which columns the reloaded frame carries.
reviews.columns

Index([         u'user_id',        u'review_id',             u'text',
             u'votes.cool',      u'business_id',      u'votes.funny',
                  u'stars',             u'date',             u'type',
           u'votes.useful',  u'review_len_wrds',  u'review_len_sent',
       u'avg_wrds_in_sent',         u'num_para',   u'mentions_price',
            u'num_allcaps', u'num_exclamations',           u'tokens'],
      dtype='object')

In [21]:
# Load the per-user elite table. Pickle data is a byte stream, so the file is
# read in binary mode.
# Only unpickle files you created yourself: loading a pickle can run arbitrary code.
with open('pickled/users_elite.pkl', 'rb') as picklefile:
    userids = pickle.load(picklefile)

In [24]:
# Index the elite table by user_id so flags can be looked up by id.
# Guarded so that re-running the cell is a no-op instead of a KeyError:
# after the first run 'user_id' is the index and no longer a column.
if 'user_id' in userids.columns:
    userids.set_index('user_id', inplace = True)

In [26]:
# NOTE(review): `t` is not used by any other visible cell -- looks like a
# scratch sample. `.loc` slices by label and includes the end label.
t = reviews.loc[:5]

In [30]:
# Tag every review with its author's elite flag using one vectorised map
# instead of a Python-level `.loc` lookup per review.
unknown_users = ~reviews.user_id.isin(userids.index)
if unknown_users.any():
    # The per-row `.loc` lookup raised KeyError for ids missing from
    # `userids`; keep failing loudly rather than silently producing NaN flags.
    raise KeyError('%d review(s) have a user_id missing from userids' % unknown_users.sum())
reviews['is_elite'] = reviews.user_id.map(userids['is_elite'])

In [34]:
# Split the reviews by author status: flag 1 -> elite, flag 0 -> non-elite.
written_by_elite = reviews.is_elite == 1
elite_reviews = reviews[written_by_elite]
nonelite_reviews = reviews[reviews.is_elite == 0]

get most popular tokens for elite users

In [35]:
# NOTE(review): `vectorizer` is created in a cell that appears further down
# this notebook, so on a fresh top-to-bottom run this cell raises NameError;
# run that cell first (or move it above this one).
elite_features = vectorizer.fit_transform(elite_reviews.tokens)
words = vectorizer.get_feature_names()
# Sum the sparse count matrix directly. Converting it to a dense array first
# allocates rows x vocabulary integers only to add them up; the sparse sum
# gives the same per-word totals as a flat array.
dist = np.asarray(elite_features.sum(axis = 0)).ravel()
sorted(zip(words, dist), key = lambda x: x[1], reverse = True)[:100]

[(u'place', 422491),
 (u'good', 414171),
 (u'like', 373306),
 (u'food', 337150),
 (u'time', 295422),
 (u'come', 277728),
 (u'great', 265779),
 (u'order', 255788),
 (u'try', 199987),
 (u'service', 191908),
 (u'love', 165209),
 (u'nice', 163142),
 (u've', 162112),
 (u'think', 156943),
 (u'look', 152108),
 (u'want', 148604),
 (u'little', 146494),
 (u'eat', 143299),
 (u'restaurant', 141329),
 (u'pretty', 140637),
 (u'know', 137719),
 (u'drink', 134979),
 (u'price', 128315),
 (u'thing', 125599),
 (u'bar', 120683),
 (u'menu', 119607),
 (u'people', 115308),
 (u'wait', 114556),
 (u'night', 112406),
 (u'best', 111804),
 (u'table', 110382),
 (u'day', 109613),
 (u'room', 109387),
 (u'chicken', 108951),
 (u'lot', 104989),
 (u'way', 101484),
 (u'friend', 99267),
 (u'definitely', 97970),
 (u'feel', 97363),
 (u'vega', 96392),
 (u'cheese', 96357),
 (u'sauce', 95872),
 (u'taste', 95082),
 (u'right', 93498),
 (u'need', 93335),
 (u'better', 92773),
 (u'star', 92115),
 (u'delicious', 90004),
 (u'bit', 898

In [5]:
# test model?
# Bag-of-words counts over the cleaned token strings, capped at 500 vocabulary
# terms; no extra tokenizer, preprocessor or stop-word list is supplied.
vectorizer = CountVectorizer(
    analyzer = "word",
    tokenizer = None,
    preprocessor = None,
    stop_words = None,
    max_features = 500
)
# dense document-term matrix and the vocabulary word for each column
features = vectorizer.fit_transform(reviews.tokens).toarray()
words = vectorizer.get_feature_names()

In [None]:
# Peek at the first rows of the review frame.
reviews.head()

In [7]:
# total occurrences of each vocabulary word over all reviews,
# shown as the 100 most frequent (word, count) pairs
dist = features.sum(axis = 0)
sorted(zip(words, dist), key = lambda pair: pair[1], reverse = True)[:100]

[(u'place', 1347563),
 (u'good', 1237040),
 (u'food', 1206517),
 (u'great', 1039474),
 (u'time', 987742),
 (u'like', 977408),
 (u'come', 834873),
 (u'service', 815935),
 (u'order', 767823),
 (u'try', 597247),
 (u'love', 531152),
 (u've', 472226),
 (u'nice', 469437),
 (u'look', 456345),
 (u'want', 445065),
 (u'restaurant', 430650),
 (u'price', 426258),
 (u'eat', 421034),
 (u'best', 420215),
 (u'know', 397865),
 (u'think', 397360),
 (u'wait', 392371),
 (u'little', 368174),
 (u'drink', 354747),
 (u'staff', 342886),
 (u'people', 341297),
 (u'day', 340781),
 (u'ask', 331337),
 (u'thing', 330179),
 (u'friendly', 329725),
 (u'pretty', 315407),
 (u'room', 313836),
 (u'menu', 311748),
 (u'experience', 311092),
 (u'need', 308685),
 (u'tell', 308509),
 (u'chicken', 308354),
 (u'work', 308325),
 (u'night', 298637),
 (u'bar', 297164),
 (u'table', 296750),
 (u'definitely', 296024),
 (u'way', 289305),
 (u'better', 285811),
 (u'feel', 285268),
 (u'bad', 284377),
 (u'delicious', 273730),
 (u'star', 269

In [249]:
# NOTE(review): `test` is not defined in the visible cells; this assumes its
# rows line up one-to-one with the rows of `features` -- confirm.
# random_state pins the forest so re-running the cell rebuilds the same model.
model = RandomForestClassifier(n_estimators = 50, random_state = 42).fit(features, test.is_elite)

In [252]:
# NOTE: predictions are made on the same `features` the model was fitted on,
# so these scores describe fit to the training data, not generalisation.
pred = model.predict(features)
# Each metric is printed under its own label (previously all four were
# labelled 'acc:'). Single-argument print(...) prints the same text whether
# print is a statement or a function.
print('acc: %s' % (accuracy_score(test.is_elite, pred),))
print('precision: %s' % (precision_score(test.is_elite, pred, average = None),))
print('recall: %s' % (recall_score(test.is_elite, pred, average = None),))
print('f1: %s' % (f1_score(test.is_elite, pred, average = None),))

acc: 1.0
acc: [ 1.  1.]
acc: [ 1.  1.]
acc: [ 1.  1.]
