# Data

In [1]:
# Base
import numpy as np
import pandas as pd
import re
from pymongo import MongoClient

In [2]:
# NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer
from gensim.models import Word2Vec

In [3]:
# Modeling
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.metrics import roc_auc_score

In [4]:
# Connect to MongoDB
client = MongoClient()
client.database_names()
db = client.yelp
collection = db.reviews

In [5]:
# Funny reviews
funnies = collection.find({'votes.funny':{'$gt':10}})
funnies.count()

7303

In [6]:
# Non-funny reviews
non_funnies = collection.find({'votes.funny':{'$lt':10}})
non_funnies.next()['text']

'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [7]:
# Data to lists
reviews = []
idx = []
for i in funnies:
    reviews.append(i['text'])
    idx.append(1)
    reviews.append(non_funnies.next()['text'])
    idx.append(0)
reviews = pd.Series(reviews)
reviews[1]

"Excellent food. Superb customer service. I miss the mario machines they used to have, but it's still a great place steeped in tradition."

In [8]:
# Additional features
class LengthTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        lengths = pd.DataFrame(X.apply(lambda x: len(x.split())))
        return lengths
    def fit(self, X, y=None, **fit_params):
        return self
class CapTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        cap_lengths = pd.DataFrame(X.apply(lambda x: len([i for i in x.split() if i[0].isupper()])))
        return cap_lengths
    def fit(self, X, y=None, **fit_params):
        return self
class NumCount(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.apply(lambda x: len(re.findall(r'\d', x))))
    def fit(self, X, y=None, **fit_params):
        return self
class ToArray(TransformerMixin):
    def transform(self, X, **transform_params): 
        return pd.DataFrame(X.toarray())
    def fit(self, X, y=None, **fit_params):
        return self
class WordVec(TransformerMixin):
    def transform(self, X, **transform_params):
        # Train word2vec
        texts = [[word for word in document.lower().split()] for document in X]
        w2v = Word2Vec(texts, size=100, window=5, min_count=1, workers=4, sg=0)
        # Make features
        def word2vec(document):
            vectors = pd.Series([w2v[word] for word in document.lower().split()]).mean()
            return pd.Series(vectors)
        df = pd.concat([word2vec(X.iloc[idx]) for idx in range(len(X))], axis=1).T
        return df
    def fit(self, X, y=None, **fit_params):
        return self

In [9]:
features = FeatureUnion([
        ('count_vect', CountVectorizer(stop_words='english')),
        ('tfidf_vect', TfidfVectorizer()),
        ('email_length', LengthTransformer()),
        ('capital_letters', CapTransformer()),
        ('numcount', NumCount()),
        ('word2vec', WordVec())])
data = features.fit_transform(reviews)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data,idx,test_size=.2,random_state=42)

# TensorFlow

In [11]:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

In [12]:
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [13]:
mnist1, mnist2 = mnist.train.next_batch(100)

## X

In [45]:
print(mnist1.shape)
print(type(mnist1))
print(mnist1[0].shape)
print(type(mnist1[0]))

(100, 784)
<class 'numpy.ndarray'>
(784,)
<class 'numpy.ndarray'>


In [19]:
print(X_train.shape)
print(type(X_train))
a = X_train[0].toarray()[0]
a.shape = (1,102521)
print(a.shape)
print(type(a))

(11684, 102521)
<class 'scipy.sparse.csr.csr_matrix'>
(1, 102521)
<class 'numpy.ndarray'>


## Y

In [51]:
print(mnist2.shape)
print(type(mnist2))
print(mnist2[0].shape)

(100, 10)
<class 'numpy.ndarray'>
(10,)


In [27]:
print(type(y_train))
print(y_train[0])
a = np.array(y_train[0])
a.shape = (1,1)
a.shape

<class 'list'>
1


(1, 1)

# Test

In [33]:
tst = mnist.test.images
print(tst.shape)
print(type(tst))

(10000, 784)
<class 'numpy.ndarray'>


In [36]:
print(X_test.shape)
print(type(X_test.toarray()))

(2922, 102521)
<class 'numpy.ndarray'>


In [38]:
tst = mnist.test.labels
print(tst.shape)
print(type(tst))

(10000, 10)
<class 'numpy.ndarray'>


In [45]:
tst = np.array(y_test)
tst.shape = (2922,1)
print(tst.shape)
print(type(tst))

(2922, 1)
<class 'numpy.ndarray'>


In [44]:
sess = tf.InteractiveSession()

# Create the model
x = tf.placeholder(tf.float32, [None, 102521])
W = tf.Variable(tf.zeros([102521, 1]))
b = tf.Variable(tf.zeros([1]))
y = tf.nn.softmax(tf.matmul(x, W) + b)

# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 1])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

# Train
tf.initialize_all_variables().run()
for i in range(11684):
    xs = X_train[i].toarray()[0]
    xs.shape = (1,102521)
    ys = np.array(y_train[i])
    ys.shape = (1,1)
    train_step.run({x: xs, y_: ys})

# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
ys_test = np.array(y_test)
ys_test.shape = (2922,1)
print(accuracy.eval({x: X_test.toarray(), y_: ys_test}))

MemoryError: 