In [47]:
import pandas as pd
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [2]:
client = MongoClient()

In [3]:
client.database_names()

['local', 'yelp']

In [30]:
db = client.yelp

In [31]:
collection = db.reviews

In [32]:
funnies = collection.find({'votes.funny':{'$gt':10}})

In [33]:
funnies.count()

7303

In [34]:
non_funnies = collection.find({'votes.funny':{'$lt':10}})

In [35]:
non_funnies.next()['text']

'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [36]:
reviews = []
idx = []

In [37]:
for i in funnies:
    reviews.append(i['text'])
    idx.append(1)
    reviews.append(non_funnies.next()['text'])
    idx.append(0)

In [41]:
reviews[1]

"Excellent food. Superb customer service. I miss the mario machines they used to have, but it's still a great place steeped in tradition."

In [67]:
vectorizer = CountVectorizer(stop_words='english', min_df=1)

In [68]:
dtm = vectorizer.fit_transform(reviews)

In [69]:
len(vectorizer.get_feature_names())

51056

In [76]:
lsa = TruncatedSVD(100, algorithm = 'randomized')
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [77]:
dtm_lsa.shape

(14606, 100)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(dtm_lsa,idx,test_size=.3,random_state=42)

In [79]:
model = RandomForestClassifier()

In [80]:
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [81]:
model.score(X_test,y_test)

0.73710634413509812