# Imports

In [1]:
# Base
import numpy as np
import pandas as pd
import re
from pymongo import MongoClient

In [2]:
# NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer
from gensim.models import Word2Vec

In [3]:
# Modeling
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Data

In [5]:
# Connect to MongoDB
client = MongoClient()
client.database_names()
db = client.yelp
collection = db.reviews

In [6]:
# Funny reviews
funnies = collection.find({'votes.funny':{'$gt':10}})
funnies.count()

7303

In [7]:
# Non-funny reviews
non_funnies = collection.find({'votes.funny':{'$lt':10}})
non_funnies.next()['text']

'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [8]:
# Data to lists
reviews = []
idx = []
for i in funnies:
    reviews.append(i['text'])
    idx.append(1)
    reviews.append(non_funnies.next()['text'])
    idx.append(0)
reviews = pd.Series(reviews)
reviews[1]

"Excellent food. Superb customer service. I miss the mario machines they used to have, but it's still a great place steeped in tradition."

In [9]:
X_train, X_test, y_train, y_test = train_test_split(reviews,idx,test_size=.2,random_state=42)

# NLP

In [10]:
# Additional features
class LengthTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        lengths = pd.DataFrame(X.apply(lambda x: len(x.split())))
        return lengths
    def fit(self, X, y=None, **fit_params):
        return self
class CapTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        cap_lengths = pd.DataFrame(X.apply(lambda x: len([i for i in x.split() if i[0].isupper()])))
        return cap_lengths
    def fit(self, X, y=None, **fit_params):
        return self
class NumCount(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'\d', x))))
    def fit(self, X, y=None, **fit_params):
        return self
class ToArray(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.toarray())
    def fit(self, X, y=None, **fit_params):
        return self
class WordVec(TransformerMixin):
    def transform(self, X, **transform_params):
        # Train word2vec
        texts = [[word for word in document.lower().split()] for document in X]
        w2v = Word2Vec(texts, size=100, window=5, min_count=1, workers=4, sg=0)
        # Make features
        def word2vec(document):
            vectors = pd.Series([w2v[word] for word in document.lower().split()]).mean()
            return pd.Series(vectors)
        df = pd.concat([word2vec(X.iloc[idx]) for idx in range(len(X))], axis=1).T
        return df
    def fit(self, X, y=None, **fit_params):
        return self

# Model

In [None]:
log_pipe = Pipeline([
    ('features', FeatureUnion([
        ('count_vect', CountVectorizer(stop_words='english')),
        ('tfidf_vect', TfidfVectorizer()),
        ('email_length', LengthTransformer()),
        ('capital_letters', CapTransformer()),
        ('numcount', NumCount()),
        ('word2vec', WordVec())])),
    ('model', LogisticRegression())])
log_pipe.fit(X_train,y_train)
print('Accuray: '+str(round(log_pipe.score(X_test,y_test),3)))
y_pred = log_pipe.predict_proba(X_test)[:,1]
print("Roc_Auc: "+str(round(roc_auc_score(y_test,y_pred),3)))

In [18]:
rfr_pipe = Pipeline([
    ('features', FeatureUnion([
        ('count_vect', CountVectorizer(stop_words='english')),
        ('tfidf_vect', TfidfVectorizer()),
        ('email_length', LengthTransformer()),
        ('capital_letters', CapTransformer()),
        ('numcount', NumCount()),
        ('word2vec', WordVec())])),
    ('model', RandomForestClassifier(n_estimators=300))])
rfr_pipe.fit(X_train,y_train)
print('Accuray: '+str(round(rfr_pipe.score(X_test,y_test),3)))
y_pred = rfr_pipe.predict_proba(X_test)[:,1]
print("Roc_Auc: "+str(round(roc_auc_score(y_test,y_pred),3)))

Accuray: 0.843
Roc_Auc: 0.925


In [21]:
nb_pipe = Pipeline([
    ('features', FeatureUnion([
        ('count_vect', CountVectorizer(stop_words='english', min_df=1)),
        ('tfidf_vect', TfidfVectorizer()),
        ('email_length', LengthTransformer()),
        ('capital_letters', CapTransformer()),
        ('numcount', NumCount()),
        ('word2vec', WordVec())])),
    ('to_array', ToArray()),
    ('lsa', TruncatedSVD(100, algorithm = 'randomized')),
    ('normalizer', Normalizer(copy=False)),
    ('model', GaussianNB())])
nb_pipe.fit(X_train,y_train)
print('Accuray: '+str(round(nb_pipe.score(X_test,y_test),3)))
y_pred = nb_pipe.predict_proba(X_test)[:,1]
print("Roc_Auc: "+str(round(roc_auc_score(y_test,y_pred),3)))

MemoryError: 

In [22]:
xgb_pipe = Pipeline([
    ('features', FeatureUnion([
        ('count_vect', CountVectorizer(stop_words='english', min_df=1)),
        ('tfidf_vect', TfidfVectorizer()),
        ('email_length', LengthTransformer()),
        ('capital_letters', CapTransformer()),
        ('numcount', NumCount()),
        ('word2vec', WordVec())])),
    ('to_array', ToArray()),
    ('lsa', TruncatedSVD(100, algorithm = 'randomized')),
    ('normalizer', Normalizer(copy=False)),
    ('model', XGBClassifier())])
xgb_pipe.fit(X_train,y_train)
print('Accuray: '+str(round(xgb_pipe.score(X_test,y_test),3)))
y_pred = xgb_pipe.predict_proba(X_test)[:,1]
print("Roc_Auc: "+str(round(roc_auc_score(y_test,y_pred),3)))

MemoryError: 