In [7]:
"""features: 
- average sentence length (in words)
- average review length (in words)
- average review length (in sentences)
- paragraph rate
- bulleted or numbered list rate
- all caps, bad punctuation, run on sentences?
- bag of words: common words in elite vs. not elite; fp, fn, etc. 
"""

'features: \n- average sentence length (in words)\n- average review length (in words)\n- average review length (in sentences)\n- paragraph rate\n- bulleted or numbered list rate\n- all caps, bad punctuation, run on sentences?\n- bag of words: common words in elite vs. not elite; fp, fn, etc. \n'

In [8]:
import string, re
import math
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from spacy.en import English, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_pandas

In [2]:
nlp = English()

In [467]:
reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [87]:
stop = STOPWORDS
punct = {p for p in string.punctuation}

### Define functions to decode review and get descriptive features and tokens
Features include:
* number of words
* number of sentences
* number of paragraphs
* number of letters
* if review mentions price
* number of words in all caps
* number of exclamation marks

In [53]:
def decode(text):
    """Decode text."""
    try:
        return text.decode('utf8')
    except:
        return text

In [258]:
def get_num_words(text):
    """Get number of words per review."""
    return float(len(text.split()))

In [257]:
def get_num_sents(text):
    """Get number of sentences per review."""
    # add 1 at the end for last punctuation 
    return text.count('. ') + text.count('! ') + text.count('? ') + text.count(') ') + \
            text.count('.\n') + text.count('!\n') + text.count('?\n') + text.count(')\n') + 1.0

In [259]:
def get_num_para(text):
    """Get number of paragraphs per review."""
    return text.count('\n\n') + 1.0

In [929]:
def mentions_price(text):
    """Check if review mentions price ($). Return 1 if yes, 0 if no."""
    return 1 if '$' in text else 0

In [977]:
def get_allcaps(text):
    """Get number of all uppercase words in review."""
    text = re.sub("[^a-zA-Z]", " ", text)
    return len([word for word in text.split() if word.isupper() and len(word) > 2])

In [941]:
def get_exclamations(text):
    """Get number of exclamation marks in review."""
    return text.count('!')

In [52]:
def get_num_chars(text):
    """Get number of characters in review (excluding punctuation and spaces)."""
    return float(len([char for char in text if char != ' ' and char not in punct]))

In [54]:
def get_clean_tokens(text):  
    """Return tokens for each review; exclude stop words and lemmatize."""
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    words = ' '.join(letters_only.lower().split())
    tokens = [token.lemma_ for token in nlp(words)]
    filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
    return ' '.join(filtered)

In [55]:
def tokenize(df):
    """Get tokens for each review in df. Implemented with tqdm to show process bar."""
    tokens = []
    for i in tqdm(range(len(df.text.values))):
        tokens.append(get_clean_tokens(df.text.values[i]))
    return tokens

In [1071]:
clntkns = tokenize(reviews)



In [1075]:
# with open('pickled/tokens.pkl', 'w') as picklefile:
#     pickle.dump(clntkns, picklefile)

### parse reviews df

In [969]:
def get_features(df):
    # decode
    df.loc[:, 'text'] = df.loc[:, 'text'].apply(decode)

    # get number of words in single review
    df.loc[:,'review_len_wrds'] = df.loc[:,'text'].apply(get_num_words)

    #get number of sentences in single review
    df.loc[:,'review_len_sent'] = df.loc[:,'text'].apply(get_num_sents)

    # get average number of words per sentence 
    df.loc[:,'avg_wrds_in_sent'] = df.loc[:,'review_len_wrds'] / df.loc[:,'review_len_sent']
    
    # get number of paragraphs
    df.loc[:, 'num_para'] = df.loc[:, 'text'].apply(get_num_para)
    
    # check if price is mentioned
    df.loc[:, 'mentions_price'] = df.loc[:, 'text'].apply(mentions_price)
    
    # get number of all caps words
    df.loc[:, 'num_allcaps'] = df.loc[:, 'text'].apply(get_allcaps)
    
    # get number of exclamation marks
    df.loc[:, 'num_exclamations'] = df.loc[:, 'text'].apply(get_exclamations)
    
    # get number of characters
    df.loc[:,'num_chars'] = df.loc[:,'text'].apply(get_num_chars)
    
    # calculate ARI score (automatic readability index) for each review 
    df.loc[:,'ari_score'] = df.apply(
        lambda row: 4.71 * (row.num_chars/float(row.review_len_wrds)) \
        + 0.5 * (row.review_len_wrds/float(row.review_len_sent)) - 21.43, 
        axis = 1)
    
    # get characters per word
    df['avg_chars_per_word'] = df.loc[:,'num_chars'] / df.loc[:,'review_len_wrds']

### add content counts - move this stuff to after text processing?

In [378]:
# add elite/nonelite words

def get_elite_words(tokens):
    try:
        tokens = tokens.split()
        return len(elite_words.intersection(tokens))
    except:
        return 0

def get_nonelite_words(tokens):
    try:
        tokens = tokens.split()
        return len(ne_words.intersection(tokens))
    except:
        return 0

In [382]:
def add_content_counts(df):
    df['num_elite_words'] = df.text.apply(get_elite_words)
    df['num_ne_words'] = df.text.apply(get_nonelite_words)

In [383]:
add_content_counts(reviews)

### /end add content counts

In [263]:
get_chars_per_word(reviews)
get_features(reviews)
get_more_features(reviews)

In [388]:
byuser = reviews.groupby('user_id')

In [1003]:
user_avgs = byuser.mean().loc[:, 'review_len_wrds':]

In [390]:
# can delete, captured above 
# user_content = byuser.mean().loc[:, 'num_elite_words':'num_ne_words']
# user_ari = byuser.mean().loc[:,'ari_score']
# user_wrd_length = byuser.mean().loc[:,'avg_chars_per_word']

In [392]:
# pickle user words/content data
with open('pickled/user_content.pkl', 'w') as picklefile:
    pickle.dump(user_content, picklefile)

In [1009]:
# pickle user avgs data
with open('pickled/user_avgs.pkl', 'w') as picklefile:
    pickle.dump(user_avgs, picklefile)

In [214]:
# pickle user avg ari data
with open('pickled/user_ari.pkl', 'w') as picklefile:
    pickle.dump(user_ari, picklefile)

In [292]:
# pickle user avg word length data
with open('pickled/user_wrdlength.pkl', 'w') as picklefile:
    pickle.dump(user_wrd_length, picklefile)

In [1077]:
reviews['tokens'] = clntkns

#### workspace for pickling and loading entire df

In [293]:
with open('pickled/reviewsdf2.pkl', 'w') as picklefile:
    pickle.dump(reviews, picklefile)

In [9]:
with open('pickled/reviewsdf2.pkl', 'r') as picklefile:
    reviews = pickle.load(picklefile)

### Get most common words for elite vs. non-elite users

In [21]:
with open('pickled/users_elite.pkl', 'r') as picklefile:
    userids = pickle.load(picklefile)
    
userids.set_index('user_id', inplace = True)
reviews['is_elite'] = reviews.user_id.apply(lambda x: userids.loc[x, 'is_elite'])

In [11]:
elite_reviews = train[train.is_elite == 1]
nonelite_reviews = train[train.is_elite == 0]

Prepare train/test sets and vectorizer.

In [10]:
train, test = train_test_split(reviews, test_size = .25)
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None, \
                             max_features = 500) 

Get most popular tokens for elite and non-elite users. 

In [13]:
elite_features = vectorizer.fit_transform(elite_reviews.tokens)
elite_words = vectorizer.get_feature_names()
elite_features = elite_features.toarray()
elite_dist = np.sum(elite_features, axis = 0)
elite_sorted = sorted(zip(elite_words, elite_dist), key = lambda x: x[1], reverse = True)
elite_wrds_dict = dict(elite_sorted)

In [16]:
nonelite_features = vectorizer.fit_transform(nonelite_reviews.tokens)
nonelite_words = vectorizer.get_feature_names()
nonelite_features = nonelite_features.toarray()
nonelite_dist = np.sum(nonelite_features, axis = 0)
nonelite_sorted = sorted(zip(nonelite_words, nonelite_dist), key = lambda x: x[1], reverse = True)
nonelite_wrds_dict = dict(nonelite_sorted)

In [20]:
# get unique words (not present in intersection)
onlyelite_words = {
    word : elite_wrds_dict[word] 
    for word in elite_wrds_dict 
    if word not in nonelite_wrds_dict}

onlynonelite_words = {
    word : nonelite_wrds_dict[word] 
    for word in nonelite_wrds_dict 
    if word not in elite_wrds_dict}

In [22]:
elite_top50 = sorted(onlyelite_words.items(), key = lambda x: x[1], reverse = True)[:50]
ne_top50 = sorted(onlynonelite_words.items(), key = lambda x: x[1], reverse = True)[:50]

**Top 50 words used by elites but not non-elites**

In [57]:
elite_top50

[(u'locate', 19447),
 (u'space', 18615),
 (u'crispy', 16474),
 (u'butter', 15533),
 (u'le', 15208),
 (u'pepper', 15005),
 (u'dance', 14583),
 (u'grab', 14577),
 (u'mushroom', 14510),
 (u'brunch', 14379),
 (u'sound', 14018),
 (u'tender', 14004),
 (u'toast', 13732),
 (u'standard', 13707),
 (u'mall', 13668),
 (u'note', 13633),
 (u'dip', 13542),
 (u'soft', 13531),
 (u'black', 13376),
 (u'center', 13310),
 (u'et', 13263),
 (u'event', 13218),
 (u'ton', 13133),
 (u'salmon', 13075),
 (u'cute', 12819),
 (u'lobster', 12763),
 (u'cafe', 12677),
 (u'seafood', 12664),
 (u'interesting', 12542),
 (u'flavorful', 12533),
 (u'section', 12463),
 (u'sausage', 12433),
 (u'idea', 12094),
 (u'spice', 12042),
 (u'fruit', 12033),
 (u'corn', 12012),
 (u'sort', 11937),
 (u'rock', 11905),
 (u'wrap', 11834),
 (u'hang', 11803),
 (u'pour', 11765),
 (u'middle', 11738),
 (u'station', 11706),
 (u'unique', 11694),
 (u'crust', 11625),
 (u'joint', 11568),
 (u'sample', 11538),
 (u'bag', 11497),
 (u'compare', 11392),
 (u'ma

**Top 50 words used by non-elites but not elites**

In [58]:
ne_top50

[(u'manager', 66115),
 (u'hair', 48884),
 (u'phone', 42378),
 (u'company', 41741),
 (u'receive', 39615),
 (u'nail', 39438),
 (u'thank', 38951),
 (u'rude', 36427),
 (u'fix', 36228),
 (u'professional', 35954),
 (u'horrible', 33384),
 (u'appointment', 32467),
 (u'office', 31867),
 (u'dr', 31805),
 (u'min', 31301),
 (u'speak', 30233),
 (u'thanks', 29895),
 (u'explain', 29689),
 (u'twice', 28382),
 (u'terrible', 28275),
 (u'send', 27792),
 (u'question', 27773),
 (u'understand', 27664),
 (u'purchase', 27605),
 (u'salon', 27032),
 (u'completely', 26334),
 (u'disappointed', 25992),
 (u'massage', 25212),
 (u'die', 25132),
 (u'provide', 24828),
 (u'break', 24359),
 (u'hope', 24247),
 (u'greet', 24100),
 (u'save', 23744),
 (u'product', 23469),
 (u'boyfriend', 23468),
 (u'class', 23456),
 (u'complaint', 23384),
 (u'desk', 23061),
 (u'welcome', 22702),
 (u'state', 22597),
 (u'dollar', 22573),
 (u'woman', 22500),
 (u'deliver', 22477),
 (u'answer', 22129),
 (u'poor', 21996),
 (u'guest', 21820),
 (u'n

### Get string of words with frequency repetitions for word cloud visualization

In [39]:
# elite
tempdict = {key : np.ceil(onlyelite_words[key]/11388.0*10) for key in onlyelite_words}
elitewc = ' '.join([(key + ' ') * tempdict[key] for key in tempdict])
elite_words = {item[0] for item in elite_top50}

In [50]:
# not elite 
ne_tempdict = {key : np.ceil(onlynonelite_words[key]/21435.0*5) for key in onlynonelite_words}
nonelitewc = ' '.join([(key + ' ') * ne_tempdict[key] for key in ne_tempdict])
ne_words = {item[0] for item in ne_top50}

  from ipykernel import kernelapp as app


### Run bag of words naive bayes model to predict elite/non-elite status of reviewer by review

In [68]:
train2 = train.dropna(subset=['tokens']) 

In [69]:
features = vectorizer.fit_transform(train2.tokens)
words = vectorizer.get_feature_names()
features = features.toarray()

In [70]:
model = GaussianNB().fit(features, train2.is_elite)

In [71]:
test_features = vectorizer.fit_transform(test.tokens)

In [72]:
test_features = test_features.toarray()
test_pred = model.predict(test_features)

In [73]:
print accuracy_score(test.is_elite, test_pred)
print precision_score(test.is_elite, test_pred)
print recall_score(test.is_elite, test_pred)

0.683385343266
0.397691174146
0.573464210875


Compare to baseline model

In [74]:
def baseline_model(xtest, ytest):
    return np.array(xtest.shape[0] * [ytest.mode()])

In [None]:
base_pred = baseline_model(test_features, test.is_elite)

In [336]:
test.is_elite.value_counts()[0] / float(test.is_elite.shape[0])

0.75549519687077571