In [None]:
"""features: 
- average sentence length (in words)
- average review length (in words)
- average review length (in sentences)
- paragraph rate
- bulleted or numbered list rate
- all caps, bad punctuation, run-on sentences?
- bag of words: common words in elite vs. not elite; fp, fn, etc. 
"""

In [980]:
import string, re
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from spacy.en import English, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_pandas

In [2]:
# Instantiate spaCy's English class once; the spaCy-based helpers further down
# (sentence counting, lemmatised tokens) all call this shared `nlp` object.
nlp = English()

In [767]:
# NOTE(review): `b` is not defined in any visible cell and `d` is never read
# afterwards -- looks like leftover scratch that would fail on a fresh kernel;
# confirm and remove.
d = nltk.Text(b)

In [467]:
# Load the review table from CSV; later cells read its 'text' and 'user_id' columns.
reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [378]:
# spaCy's stop-word collection, under a shorter name for the token filters
stop = STOPWORDS
# every ASCII punctuation character, as a set for O(1) membership tests
punct = set(string.punctuation)

### get descriptive features of review text

In [948]:
def get_num_words(text):
    """Count the whitespace-delimited words in one review.

    text: the review body as a string.
    Returns an int; an empty or all-whitespace string gives 0.
    """
    words = text.split()
    return len(words)

In [949]:
# no spacy
def get_num_sents(text):
    """Estimate the number of sentences in one review by counting boundaries.

    A boundary is one of '.', '!', '?' or ')' immediately followed by a space
    or a newline. This is a plain substring count, not a parse.

    text: the review body as a string.
    Returns an int that is always at least 1.
    """
    boundaries = 0
    for closer in ('.', '!', '?', ')'):
        for gap in (' ', '\n'):
            boundaries += text.count(closer + gap)
    # add 1 for the final sentence, whose closing punctuation has nothing after it
    return boundaries + 1

In [904]:
def get_num_para(text):
    """Count the paragraphs in one review.

    Paragraphs are the pieces of `text` separated by a blank line (two
    consecutive newlines); text with no blank line is one paragraph.

    text: the review body as a string.
    Returns an int that is always at least 1.
    """
    paragraphs = text.split('\n\n')
    return len(paragraphs)

In [929]:
def mentions_price(text):
    """Flag whether the review contains a dollar sign.

    text: the review body as a string.
    Returns 1 when '$' occurs anywhere in `text`, otherwise 0.
    """
    has_dollar = '$' in text
    return int(has_dollar)

In [977]:
def get_allcaps(text):
    """Count the "shouted" words in one review.

    Every non-letter character is first turned into a space, so digits and
    punctuation split words apart. A word then counts when it is entirely
    uppercase and at least three letters long.

    text: the review body as a string.
    Returns the count as an int.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    shouted = 0
    for word in letters_only.split():
        if len(word) > 2 and word.isupper():
            shouted += 1
    return shouted

In [941]:
def get_exclamations(text):
    """Count the '!' characters in one review.

    text: the review body as a string.
    Returns the count as an int.
    """
    return sum(1 for char in text if char == '!')

In [None]:
# TODO: bulleted or numbered list rate -- not implemented yet

In [742]:
def decode(text):
    """Return `text` decoded from UTF-8 when possible, otherwise unchanged.

    text: a cell value from the review text column; may be a byte string,
        an already-decoded string, or a non-string value such as NaN.
    Returns the decoded string, or `text` itself when it has no `decode`
    method or cannot be decoded.
    """
    try:
        return text.decode('utf8')
    # Only the failures that mean "this value cannot be decoded" are handled:
    # AttributeError for values without .decode, UnicodeError for bad bytes.
    # A bare except here would also swallow KeyboardInterrupt, making a long
    # DataFrame.apply impossible to interrupt.
    except (AttributeError, UnicodeError):
        return text

In [866]:
# with spacy
def get_num_sents_spacy(text):
    """Count the sentences spaCy finds in one review.

    text: the review body, passed to the module-level `nlp` object as is.
    Returns the number of items in the parsed document's `.sents`, or None
    when parsing raises an ordinary exception (best-effort: a bad row must
    not abort a whole DataFrame.apply).
    """
    try:
        # count lazily instead of materialising a list of sentence spans
        return sum(1 for _ in nlp(text).sents)
    # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit
    # still propagate and a long-running apply can be interrupted.
    except Exception:
        return None

### get tokens -- *not currently implemented*

In [476]:
def get_clean_tokens(text):
    """Lemmatise one review and return the kept lemmas as a single string.

    text: the review body as a UTF-8 byte string; it is decoded here before
        being passed to the module-level `nlp` object.
    Returns the lemmas joined by single spaces, after dropping punctuation
    characters, stop words and the whitespace-only lemmas
    '', ' ', '\\n' and '\\n\\n'.
    """
    # One exclusion set and one pass over the lemmas. The previous version
    # called list.remove() in while-loops, which rescans the list per removal.
    unwanted = punct.union(stop, ("", " ", "\n", "\n\n"))
    lemmas = (token.lemma_ for token in nlp(text.decode('utf8')))
    return ' '.join(lemma for lemma in lemmas if lemma not in unwanted)

In [613]:
# version without utf decoding
# def get_clean_tokens2(text):  
#     letters_only = re.sub("[^a-zA-Z]", " ", text) 
#     words = ' '.join(letters_only.lower().split())
#     tokens = [token.lemma_ for token in nlp(words)]
#     filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
#     return ' '.join(filtered)

In [686]:
# does not decode: `text` is handed to re.sub / spaCy exactly as received
def get_clean_tokens2(text):
    """Letters-only, lowercased, lemmatised version of one review.

    Non-letter characters become spaces, the text is lowercased and its
    whitespace collapsed, and the result is run through the module-level
    `nlp` object. Lemmas that are stop words or whitespace-only are dropped.

    text: the review body as a string.
    Returns the remaining lemmas joined by single spaces.
    """
    alpha_only = re.sub("[^a-zA-Z]", " ", text)
    normalised = ' '.join(alpha_only.lower().split())
    blanks = ('', ' ', '\n', '\n\n')
    kept = []
    for token in nlp(normalised):
        lemma = token.lemma_
        if lemma in stop or lemma in blanks:
            continue
        kept.append(lemma)
    return ' '.join(kept)

### parse reviews df

In [279]:
### OLD
def get_features_old(df):
    """Earlier, spaCy-based feature pass; writes its columns into `df` in place.

    Columns written: 'review_len_wrds', 'review_len_sent',
    'avg_wrds_in_sent' and 'clean_tkns'. Nothing is returned.

    NOTE(review): relies on a `cleantext` helper that is not defined in any
    visible cell -- confirm it exists before running this.
    """
    def _sentence_count(raw):
        # better way?
        return len(list(nlp(raw.decode('utf8')).sents))

    # words per review
    df.loc[:, 'review_len_wrds'] = df.loc[:, 'text'].apply(lambda raw: len(cleantext(raw)))

    # sentences per review
    df.loc[:, 'review_len_sent'] = df.loc[:, 'text'].apply(_sentence_count)

    # average words per sentence
    df.loc[:, 'avg_wrds_in_sent'] = df.loc[:, 'review_len_wrds'] / df.loc[:, 'review_len_sent']

    # cleaned tokens for bag of words
    df.loc[:, 'clean_tkns'] = df.loc[:, 'text'].apply(get_clean_tokens)

In [969]:
def get_features(df):
    """Add the descriptive text features to `df` in place.

    The 'text' column is first passed through `decode`. These columns are then
    written, in this order: 'review_len_wrds', 'review_len_sent',
    'avg_wrds_in_sent', 'num_para', 'mentions_price', 'num_allcaps',
    'num_exclamations'. Nothing is returned.
    """
    # decode
    df.loc[:, 'text'] = df.loc[:, 'text'].apply(decode)

    # word and sentence counts come first: the average below needs both
    length_features = (
        ('review_len_wrds', get_num_words),
        ('review_len_sent', get_num_sents),
    )
    for column, extractor in length_features:
        df.loc[:, column] = df.loc[:, 'text'].apply(extractor)

    # average number of words per sentence
    df.loc[:, 'avg_wrds_in_sent'] = df.loc[:, 'review_len_wrds'] / df.loc[:, 'review_len_sent']

    # paragraphs, price mention, all-caps words, exclamation marks
    remaining_features = (
        ('num_para', get_num_para),
        ('mentions_price', mentions_price),
        ('num_allcaps', get_allcaps),
        ('num_exclamations', get_exclamations),
    )
    for column, extractor in remaining_features:
        df.loc[:, column] = df.loc[:, 'text'].apply(extractor)

    # bag-of-words tokens are currently switched off:
    #     %time df.loc[:,'clean_tkns'] = df.loc[:, 'text'].apply(lambda x: get_clean_tokens2(x))

In [986]:
# Adds the feature columns to `reviews` in place; get_features has no return value.
get_features(reviews)

In [988]:
# Binary mode: a pickle stream is bytes, so text mode ('w') can corrupt the
# file where newlines are translated and is rejected outright on Python 3.
with open('pickled/reviewsdf.pkl', 'wb') as picklefile:
    pickle.dump(reviews, picklefile)

In [1010]:
# Spot-check: show the rows (including the new feature columns) for one user_id.
reviews[reviews.user_id == 'AckuDQQQ7d4tKE8IOZ0ttw']

Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful,review_len_wrds,review_len_sent,avg_wrds_in_sent,num_para,mentions_price,num_allcaps,num_exclamations
1349168,AckuDQQQ7d4tKE8IOZ0ttw,ccE-yH9ROF5EChmHc4QTdw,This place was amazing. I'm writing this revie...,0,vSf0pqvaLp5sVSjJPeOqqQ,0,5,2015-02-11,review,0,80,6,13.333333,1,0,1,0


In [993]:
# Group the per-review rows by 'user_id' so they can be aggregated per user.
byuser = reviews.groupby('user_id')

In [998]:
# NOTE(review): scratch arithmetic -- what 128 and 8.33 stand for is not
# recorded here; label it or remove it.
128/8.33

15.366146458583433

In [1003]:
# Per-user mean of every column from 'review_len_wrds' through the last one
# (.loc label slices are inclusive). This relies on the engineered features
# being the trailing columns of `reviews`.
user_avgs = byuser.mean().loc[:, 'review_len_wrds':]

In [1009]:
# pickle user avgs data
# Binary mode: a pickle stream is bytes, so text mode ('w') can corrupt the
# file where newlines are translated and is rejected outright on Python 3.
with open('pickled/user_avgs.pkl', 'wb') as picklefile:
    pickle.dump(user_avgs, picklefile)

### bag of words, etc. workspace -- *not implemented*

In [228]:
# test model?
# NOTE(review): `test` is not defined in any visible cell; it is read here for
# its `clean_tkns` column -- confirm where it comes from.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
)
# fit the vocabulary and build the document-term counts as a dense array
features = vectorizer.fit_transform(test.clean_tkns).toarray()
words = vectorizer.get_feature_names()

In [244]:
# get counts for each word in corpus 
dist = np.sum(features, axis = 0)
print sorted(zip(words, dist), key = lambda x: x[1], reverse = True)[:5]

In [249]:
# Fixed random_state so the fitted forest (and every score derived from it)
# is the same on each Restart & Run All; without it each run differs.
model = RandomForestClassifier(n_estimators = 50, random_state = 0).fit(features, test.is_elite)

In [252]:
pred = model.predict(features)
print 'acc:', accuracy_score(test.is_elite, pred)
print 'acc:', precision_score(test.is_elite, pred, average = None)
print 'acc:', recall_score(test.is_elite, pred, average = None)
print 'acc:', f1_score(test.is_elite, pred, average = None)

acc: 1.0
acc: [ 1.  1.]
acc: [ 1.  1.]
acc: [ 1.  1.]
