In [None]:
"""features: 
- average sentence length (in words)
- average review length (in words)
- average review length (in sentences)
- paragraph rate
- bulleted or numbered list rate
- all caps, bad punctuation, run on sentences?
- bag of words: common words in elite vs. not elite; fp, fn, etc. 
"""

In [598]:
from __future__ import unicode_literals
import string, re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from spacy.en import English, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tqdm import tqdm, tqdm_pandas

In [2]:
nlp = English()

In [467]:
reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [378]:
stop = STOPWORDS
punct = {p for p in string.punctuation}

In [475]:
def cleantext(text):
    tokens = [token.lemma_.strip() for token in nlp(text.decode('utf8'))] # lemmatize 
    filtered = [token for token in tokens if token not in punct]
    while "" in filtered:
        filtered.remove("")
    while " " in filtered:
        filtered.remove(" ")
    while "\n" in filtered:
        filtered.remove("\n")
    while "\n\n" in filtered:
        filtered.remove("\n\n")
    return filtered

In [572]:
def get_num_words(text):
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    words = letters_only.lower().split()
    return len(words)

In [654]:
def get_num_sents(text):
    return len([sent for sent in nlp(text).sents])

In [633]:
print get_num_sents(b)

4


In [640]:
d = 'this is a sentence. this is another sentence! some ellipses...(and also other stuff)'

In [641]:
[sent for sent in nlp(d).sents]

[this is a sentence.,
 this is another sentence!,
 some ellipses...,
 (and also other stuff)]

In [642]:
b.split('. ')

[u'Mr Hoagie is an institution',
 u'Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food',
 u'Their speciality is the Italian Hoagie, and it is voted the best in the area year after year',
 u'I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh',
 u'Overall, its a good alternative to Subway, which is down the road.']

In [585]:
get_num_sents(b)

5

In [476]:
def get_clean_tokens(text):
    #letters_only = re.sub("[^a-zA-Z]", " ", text) 
    union = punct.union(stop)
    #spacing = {'', ' ', '\n', '\n\n'}
    tokens = [token.lemma_ for token in nlp(text.decode('utf8'))]
    filtered = [token for token in tokens if token not in union]
    while "" in filtered:
        filtered.remove("")
    while " " in filtered:
        filtered.remove(" ")
    while "\n" in filtered:
        filtered.remove("\n")
    while "\n\n" in filtered:
        filtered.remove("\n\n")
    return ' '.join(filtered)

In [613]:
# version without utf decoding
# def get_clean_tokens2(text):  
#     letters_only = re.sub("[^a-zA-Z]", " ", text) 
#     words = ' '.join(letters_only.lower().split())
#     tokens = [token.lemma_ for token in nlp(words)]
#     filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
#     return ' '.join(filtered)

In [669]:
# uses utf decoding
def get_clean_tokens2(text):  
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    words = ' '.join(letters_only.lower().split())
    tokens = [token.lemma_ for token in nlp(words)]
    filtered = [t for t in tokens if t not in stop and t != '' and t != ' ' and t != '\n' and t != '\n\n']
    return filtered

In [677]:
# uses utf decoding -- 
def get_clean_tokens3(text):
    union = punct.union(stop)
    tokens = [token.lemma_ for token in nlp(text)]
    filtered = [t for t in tokens if t not in union and t != '' and t != ' ' and t != '\n' and t != '\n\n']
    return filtered

In [678]:
get_clean_tokens3(b.decode('utf8'))

[u'mr',
 u'hoagie',
 u'institution',
 u'walk',
 u'like',
 u'throwback',
 u'30',
 u'year',
 u'ago',
 u'old',
 u'fashion',
 u'menu',
 u'board',
 u'booth',
 u'70',
 u'large',
 u'selection',
 u'food',
 u'speciality',
 u'italian',
 u'hoagie',
 u'vote',
 u'best',
 u'area',
 u'year',
 u'year',
 u'usually',
 u'order',
 u'burger',
 u'patty',
 u'obviously',
 u'cook',
 u'frozen',
 u'ingredient',
 u'fresh',
 u'overall',
 u'good',
 u'alternative',
 u'subway',
 u'road']

In [666]:
b

'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [282]:
def remove_stopwrds(l):
    l = [t for t in l if t not in stop]
    return ' '.join(l)

In [279]:
def get_features(df):
    # get number of words in single review
    df.loc[:,'review_len_wrds'] = df.loc[:,'text'].apply(lambda x: len(cleantext(x)))
    
    # get number of sentences in single review
    df.loc[:,'review_len_sent'] = df.loc[:,'text'].apply(
        lambda x: len([sent for sent in nlp(x.decode('utf8')).sents])) # better way?
    
    # get average number of words per sentence 
    df.loc[:,'avg_wrds_in_sent'] = df.loc[:,'review_len_wrds'] / df.loc[:,'review_len_sent']
    
    # get cleaned tokens for bag of words
    df.loc[:,'clean_tkns'] = df.loc[:, 'text'].apply(lambda x: get_clean_tokens(x))

In [672]:
def get_features2(df):
    # decode
    df.loc[:, 'text'] = df.loc[:, 'text'].apply(lambda x: x.decode('utf8'))
    
    # get number of words in single review
    df.loc[:,'review_len_wrds'] = df.loc[:,'text'].apply(lambda x: get_num_words(x))
    
    # get number of sentences in single review
    df.loc[:,'review_len_sent'] = df.loc[:,'text'].apply(lambda x: get_num_sents(x))
    
    # get average number of words per sentence 
    df.loc[:,'avg_wrds_in_sent'] = df.loc[:,'review_len_wrds'] / df.loc[:,'review_len_sent']
    
    # get cleaned tokens for bag of words
    df.loc[:,'clean_tkns'] = df.loc[:, 'text'].apply(lambda x: get_clean_tokens2(x))

In [448]:
%time get_clean_tokens2(t)

CPU times: user 982 µs, sys: 442 µs, total: 1.42 ms
Wall time: 1.09 ms


u'check place past monday wing night heard wing great decid final time check wing wing crispi nice chang pace got wet cajun sauc garlic butter wing cajun bold flavor sauc sauc garlic butter expect better averag t like see sauc rest boat definit come tri place sampl item menu probabl regular stop wing anytim soon'

In [597]:
%time get_features(test2)

CPU times: user 644 ms, sys: 80.2 ms, total: 724 ms
Wall time: 729 ms


In [480]:
%time get_features(test4)

CPU times: user 32.7 s, sys: 576 ms, total: 33.3 s
Wall time: 34 s


In [655]:
%time get_features2(test4)

CPU times: user 20.3 s, sys: 407 ms, total: 20.7 s
Wall time: 20.8 s


In [673]:
%time get_features2(test4)

CPU times: user 10.7 s, sys: 101 ms, total: 10.8 s
Wall time: 11 s


In [650]:
%time decode(test4)

CPU times: user 175 ms, sys: 8.02 ms, total: 183 ms
Wall time: 218 ms


u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [None]:
100 = 3.67

In [468]:
test = reviews.loc[0:5, :]
test2 = reviews.loc[6:11, :]

In [469]:
test3 = reviews.loc[:100, :]

In [479]:
test4 = reviews.loc[:1000, :]

In [659]:
test4.head()

Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful,review_len_wrds,review_len_sent,avg_wrds_in_sent,clean_tkns
0,PUFPaY9KxDAcGqfsorJp3Q,Ya85v4eqdd6k9Od8HbQjyA,"Mr Hoagie is an institution. Walking in, it do...",0,5UmKMjUEUNdYWqANhGckJw,0,4,2012-08-01,review,0,82,5,16.4,mr hoagie institution walk like throwback year...
1,Iu6AxdBYGR4A0wspR9BYHA,KPvLNJ21_4wbYNctrOwWdQ,Excellent food. Superb customer service. I mis...,0,5UmKMjUEUNdYWqANhGckJw,0,5,2014-02-13,review,0,24,3,8.0,excellent food superb customer service miss ma...
2,auESFwWvW42h6alXgFxAXQ,fFSoGV46Yxuwbr3fHNuZig,Yes this place is a little out dated and not o...,0,5UmKMjUEUNdYWqANhGckJw,0,5,2015-10-31,review,0,73,6,12.166667,yes place little date open weekend staff pleas...
3,uK8tzraOp4M5u3uYrqIBXg,Di3exaUCFNw1V4kSNW5pgA,All the food is great here. But the best thing...,0,UsFtqoBl7naz8AVUBZMjQQ,0,5,2013-11-08,review,0,75,9,8.333333,food great best thing wing wing simply fantast...
4,I_47G-R2_egp7ME5u_ltew,0Lua2-PbqEQMjD9r89-asw,We checked this place out this past Monday for...,0,UsFtqoBl7naz8AVUBZMjQQ,0,3,2014-03-29,review,0,138,8,17.25,check place past monday wing night hear wing g...


In [470]:
df = reviews.loc[:100, :]

In [471]:
b = test.loc[0, 'text']

In [272]:
b.decode('utf8')

u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [220]:
t = test.loc[4, 'text']

In [199]:
get_clean_tokens(t)

u'wing sauce like water pretty much lot butter hot sauce frank red hot maybe whole wing good size crispy 1 wing sauce could better hot extra hot flavor/heat fish sandwich good large portion side decent'

In [200]:
test.loc[:,'review_len_wrds'] = test.loc[:,'text'].apply(lambda x: len(cleantext(x))) # number of words 
test.loc[:,'review_len_sent'] = test.loc[:,'text'].apply(
    lambda x: len([sent for sent in nlp(unicode(x)).sents])) # number of sentences; there must be a better way

In [201]:
test.loc[:,'avg_wrds_in_sent'] = test.loc[:,'review_len_wrds'] / test.loc[:,'review_len_sent']

In [210]:
# get cleaned tokens
test.loc[:,'clean_tkns'] = test.loc[:, 'text'].apply(lambda x: get_clean_tokens(x)))

In [224]:
# add fake elite
test['is_elite'] = [1, 0, 0, 1, 1, 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [227]:
test.loc[5, 'clean_tkns'][0]

u'w'

In [211]:
# get paragraph breaks 

In [228]:
# test model?
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None) 
features = vectorizer.fit_transform(test.clean_tkns)
words = vectorizer.get_feature_names()
features = features.toarray()

In [244]:
# get counts for each word in corpus 
dist = np.sum(features, axis = 0)
print sorted(zip(words, dist), key = lambda x: x[1], reverse = True)[:5]

In [249]:
model = RandomForestClassifier(n_estimators = 50).fit(features, test.is_elite)

In [252]:
pred = model.predict(features)
print 'acc:', accuracy_score(test.is_elite, pred)
print 'acc:', precision_score(test.is_elite, pred, average = None)
print 'acc:', recall_score(test.is_elite, pred, average = None)
print 'acc:', f1_score(test.is_elite, pred, average = None)

acc: 1.0
acc: [ 1.  1.]
acc: [ 1.  1.]
acc: [ 1.  1.]


Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful
6,JPPhyFE-UE453zA6K0TVgw,mjCJR33jvUNt41iJCxDU_g,Cold cheap beer. Good bar food. Good service. ...,0,UsFtqoBl7naz8AVUBZMjQQ,0,4,2014-11-28,review,0
7,2d5HeDvZTDUNVog_WuUpSg,Ieh3kfZ-5J9pLju4JiQDvQ,I highly recommend this place. The mechanics a...,0,3eu6MEFlq2Dg7bQh8QbdOg,0,5,2014-02-27,review,0
8,BShxMIUwaJS378xcrz4Nmg,PU28OoBSHpZLkYGCmNxlmg,"I am a big believer in first impressions, so w...",0,3eu6MEFlq2Dg7bQh8QbdOg,0,5,2015-06-16,review,0
9,fhNxoMwwTipzjO8A9LFe8Q,XsA6AojkWjOHA4FmuAb8XQ,Decent range somewhat close to the city. The ...,0,cE27W9VPgO88Qxe4ol6y_g,0,3,2012-08-19,review,1
10,-6rEfobYjMxpUWLNxszaxQ,rkD7UDbQ9VM3Va6bI-eBHQ,Owning a driving range inside the city limits ...,0,cE27W9VPgO88Qxe4ol6y_g,0,1,2013-04-18,review,1
11,KZuaJtFindQM9x2ZoMBxcQ,WExNE-f93SL4D1q8s9QWKg,This place is absolute garbage... Half of the...,0,cE27W9VPgO88Qxe4ol6y_g,0,1,2013-07-14,review,0


In [213]:
list(test.clean_tkns)

[u'mr hoagie institution walk seem like throwback 30 year ago old fashion menu board booth 70 large selection food speciality italian hoagie vote best area year year usually order burger patty obviously cook frozen ingredient fresh overall good alternative subway road',
 u'excellent food superb customer service miss mario machine use still great place steep tradition',
 u'yes place little date open weekend staff always pleasant fast make order always spot fresh veggie hoggies food also daily special ice cream really good banana split pile topping win pennysaver award ever year see',
 u"food great best thing wing wing simply fantastic wet cajun best popular also like seasoned salt wing wing night monday wednesday night 0.75 whole wing dining area nice family friendly bar nice well place truly yinzer 's dream pittsburgh dad would love place n'at",
 u'check place past monday wing night hear wing great decide finally time check wing whole wing crispy nice change pace get wet cajun sauce ga

In [225]:
test.head()

Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful,review_len_wrds,review_len_sent,avg_wrds_in_sent,clean_tkns,is_elite
0,PUFPaY9KxDAcGqfsorJp3Q,Ya85v4eqdd6k9Od8HbQjyA,"Mr Hoagie is an institution. Walking in, it do...",0,5UmKMjUEUNdYWqANhGckJw,0,4,2012-08-01,review,0,83,5,16.6,mr hoagie institution walk seem like throwback...,1
1,Iu6AxdBYGR4A0wspR9BYHA,KPvLNJ21_4wbYNctrOwWdQ,Excellent food. Superb customer service. I mis...,0,5UmKMjUEUNdYWqANhGckJw,0,5,2014-02-13,review,0,23,3,7.666667,excellent food superb customer service miss ma...,0
2,auESFwWvW42h6alXgFxAXQ,fFSoGV46Yxuwbr3fHNuZig,Yes this place is a little out dated and not o...,0,5UmKMjUEUNdYWqANhGckJw,0,5,2015-10-31,review,0,73,6,12.166667,yes place little date open weekend staff alway...,0
3,uK8tzraOp4M5u3uYrqIBXg,Di3exaUCFNw1V4kSNW5pgA,All the food is great here. But the best thing...,0,UsFtqoBl7naz8AVUBZMjQQ,0,5,2013-11-08,review,0,75,9,8.333333,food great best thing wing wing simply fantast...,1
4,I_47G-R2_egp7ME5u_ltew,0Lua2-PbqEQMjD9r89-asw,We checked this place out this past Monday for...,0,UsFtqoBl7naz8AVUBZMjQQ,0,3,2014-03-29,review,0,138,8,17.25,check place past monday wing night hear wing g...,1


In [174]:
reviews.shape

(2225213, 10)

In [175]:
byuser.count()

NameError: name 'byuser' is not defined

In [17]:
sents = []
for span in doc.sents:
    # go from the start to the end of each span, returning each token in the sentence
    # combine each token using join()
    sent = ''.join(doc[i].string for i in range(span.start, span.end)).strip()
    sents.append(sent)

In [18]:
trial = []
for sent in doc.sents:
    trial.append(str(sent))

In [53]:
np.mean([len(sent) for sent in doc.sents])

19.399999999999999

In [28]:
trial[0]

'Mr Hoagie is an institution.'