# Data

In [1]:
# Base
import numpy as np
import pandas as pd
import re
from pymongo import MongoClient

In [2]:
# NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer
from gensim.models import Word2Vec

In [3]:
# Modeling
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.metrics import roc_auc_score

In [4]:
# Connect to MongoDB
client = MongoClient()
client.database_names()
db = client.yelp
collection = db.reviews

In [5]:
# Funny reviews
funnies = collection.find({'votes.funny':{'$gt':10}})
funnies.count()

7303

In [6]:
# Non-funny reviews
non_funnies = collection.find({'votes.funny':{'$lt':10}})
non_funnies.next()['text']

'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [7]:
# Data to lists
reviews = []
idx = []
for i in funnies:
    reviews.append(i['text'])
    idx.append(1)
    reviews.append(non_funnies.next()['text'])
    idx.append(0)
reviews = pd.Series(reviews)
reviews[1]

"Excellent food. Superb customer service. I miss the mario machines they used to have, but it's still a great place steeped in tradition."

In [8]:
# Additional features
class LengthTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        lengths = pd.DataFrame(X.apply(lambda x: len(x.split())))
        return lengths
    def fit(self, X, y=None, **fit_params):
        return self
class CapTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        cap_lengths = pd.DataFrame(X.apply(lambda x: len([i for i in x.split() if i[0].isupper()])))
        return cap_lengths
    def fit(self, X, y=None, **fit_params):
        return self
class NumCount(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'\d', x))))
    def fit(self, X, y=None, **fit_params):
        return self
class WordVec(TransformerMixin):
    def transform(self, X, **transform_params):
        # Train word2vec
        texts = [[word for word in document.lower().split()] for document in X]
        w2v = Word2Vec(texts, size=100, window=5, min_count=1, workers=4, sg=0)
        # Make features
        def word2vec(document):
            vectors = pd.Series([w2v[word] for word in document.lower().split()]).mean()
            return pd.Series(vectors)
        df = pd.concat([word2vec(X.iloc[idx]) for idx in range(len(X))], axis=1).T
        return df
    def fit(self, X, y=None, **fit_params):
        return self

In [9]:
features = FeatureUnion([
        ('count_vect', TfidfVectorizer(stop_words='english')),
        ('length', LengthTransformer()),
        ('caps', CapTransformer()),
        ('num_of_num', NumCount()),
        ('word2vec', WordVec())])
data = features.fit_transform(reviews)

In [10]:
lsa = TruncatedSVD(n_components=500)
dat = lsa.fit_transform(data)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(dat,idx,test_size=.2,random_state=42)

In [12]:
X_train = X_train
X_test = X_test
y_train = np.array([np.float64(i) for i in y_train])
y_train.shape = (len(y_train),1)
y_train = np.concatenate((y_train,1-y_train),axis=1)
y_test = np.array([np.float64(i) for i in y_test])
y_test.shape = (len(y_test),1)
y_test = np.concatenate((y_test,1-y_test),axis=1)

# TensorFlow

In [13]:
import tensorflow as tf

In [14]:
from tensorflow.examples.tutorials.mnist import input_data

In [15]:
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)
mnist_X_train = mnist.train.images
mnist_y_train = mnist.train.labels 
mnist_X_test = mnist.test.images
mnist_y_test = mnist.test.labels

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [16]:
print("MNIST")
print("X_train: "+str(type(mnist_X_train))+" "+str(mnist_X_train.shape))
print("y_train: "+str(type(mnist_y_train))+" "+str(mnist_y_train.shape))
print("X_test: "+str(type(mnist_X_test))+" "+str(mnist_X_test.shape))
print("y_test: "+str(type(mnist_y_test))+" "+str(mnist_y_test.shape))

MNIST
X_train: <class 'numpy.ndarray'> (55000, 784)
y_train: <class 'numpy.ndarray'> (55000, 10)
X_test: <class 'numpy.ndarray'> (10000, 784)
y_test: <class 'numpy.ndarray'> (10000, 10)


In [17]:
print("REAL")
print("X_train: "+str(type(X_train))+" "+str(X_train.shape))
print("y_train: "+str(type(y_train))+" "+str(y_train.shape))
print("X_test: "+str(type(X_test))+" "+str(X_test.shape))
print("y_test: "+str(type(y_test))+" "+str(y_test.shape))

REAL
X_train: <class 'numpy.ndarray'> (11684, 500)
y_train: <class 'numpy.ndarray'> (11684, 2)
X_test: <class 'numpy.ndarray'> (2922, 500)
y_test: <class 'numpy.ndarray'> (2922, 2)


In [21]:
sess = tf.InteractiveSession()

# Create the model
x = tf.placeholder(tf.float32, [None, 500])
W1 = tf.Variable(tf.zeros([500,10]))
b1 = tf.Variable(tf.zeros([10]))
h1 = tf.matmul(x, W1) + b1
# W2 = tf.Variable(tf.zeros([10,10]))
# b2 = tf.Variable(tf.zeros([10]))
# h2 = tf.matmul(h1, W2) + b2
W3 = tf.Variable(tf.zeros([10,2]))
b3 = tf.Variable(tf.zeros([2]))
y = tf.nn.softmax(tf.matmul(h2, W3) + b3)

# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 2])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(.05).minimize(cross_entropy)

# Train
tf.initialize_all_variables().run()
for i in range(100):
    xs = X_train[100*i:100*(i+1)]
    ys = y_train[100*i:100*(i+1)]
    train_step.run({x: xs, y_: ys})

# for i in range(100):
#     xs = X_train[100*i:100*(i+1)]
#     ys = y_train[100*i:100*(i+1)]
#     train_step.run({x: xs, y_: ys})

# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print("Accuracy: "+ str(accuracy.eval({x: X_test, y_: y_test})))
# print("Train Accuracy: "+ str(accuracy.eval({x: X_train, y_: y_train})))

sess.close()

Exception ignored in: <bound method BaseSession.__del__ of <tensorflow.python.client.session.InteractiveSession object at 0x7f158f5c3da0>>
Traceback (most recent call last):
  File "/home/dohyun0012/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 171, in __del__
    self.close()
  File "/home/dohyun0012/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 976, in close
    self._default_session.__exit__(None, None, None)
  File "/home/dohyun0012/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 66, in __exit__
    next(self.gen)
  File "/home/dohyun0012/anaconda3/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3378, in get_controller
    % type(default))
AssertionError: Nesting violated for default stack of <class 'weakref'> objects


Accuracy: 0.50924
