### Importing necessary libraries

In [12]:
from keras.layers.core import Dense, Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
# Setting random seed to 42 because we get consistent result between runs
np.random.seed(42)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Getting and Preparing data

In [2]:
import pandas as pd

# quoting -> controls whether the quotes should be recognized.
# quoting = 0, Quote minimal
# quoting = 1, Quote All
# quoting = 2, Quote None
# quoting = 3, Quote Non-Numeric

test_data_df = pd.read_csv("./data/umich-sentiment-test.txt", header=None, delimiter="\t", quoting=3)
test_data_df.columns = ["Text"]
train_data_df = pd.read_csv("./data/umich-sentiment-train.txt", header=None, delimiter="\t", quoting=3)
train_data_df.columns = ["Sentiment","Text"]

In [4]:
train_data_df.shape

(7086, 2)

In [6]:
train_data_df.head()

Unnamed: 0,Sentiment,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [7]:
test_data_df.shape

(33052, 1)

In [8]:
test_data_df.head()

Unnamed: 0,Text
0,""" I don't care what anyone says, I like Hillar..."
1,have an awesome time at purdue!..
2,"Yep, I'm still in London, which is pretty awes..."
3,"Have to say, I hate Paris Hilton's behavior bu..."
4,i will love the lakers.


Let's count how many labels do we have for each sentiment class.

In [9]:
train_data_df.Sentiment.value_counts()

1    3995
0    3091
Name: Sentiment, dtype: int64

**Finally, let's calculate the average number of words per sentence.   **      

We could do the following using a list comprehension with the number of words per sentence.

In [10]:
import numpy as np 

np.mean([len(s.split(" ")) for s in train_data_df.Text])

10.886819079875812

### Initialize basic variables

**Note: ** **Words** are called **tokens** and the process of splitting text into tokens is called tokenization.
1. vocab_size -> We will consider only 5000 words(tokens) in the text.
2. EMBED_SIZE setting -> is the size of the embedding that will be generated by the embedding layer.
3. NUM_FILTERS -> is the number of convolution filters we will train for our convolution layer.
4. NUM_WORDS ->is the size of each filter, that is, how many words we will convolve at a time.
5. BATCH_SIZE -> number of records to feed the network each time
6. NUM_EPOCHS -> number of records to feed the network each time and how many times we will run through the entire dataset during training:

In [39]:
INPUT_FILE = "./data/umich-sentiment-train.txt"
VOCAB_SIZE = 5000 # we will consider 5000 words in the text
EMBED_SIZE = 100 #
NUM_FILTERS = 256
NUM_WORDS = 3
BATCH_SIZE = 64
NUM_EPOCHS = 20

### Preparing a corpus

In [17]:
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems
######## 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 85
)

In [18]:
corpus_data_features = vectorizer.fit_transform(train_data_df.Text.tolist() + test_data_df.Text.tolist())

In [19]:
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape

(40138, 85)

In [21]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print(vocab)

['aaa', 'amaz', 'angelina', 'awesom', 'beauti', 'becaus', 'boston', 'brokeback', 'citi', 'code', 'cool', 'cruis', 'd', 'da', 'drive', 'francisco', 'friend', 'fuck', 'geico', 'good', 'got', 'great', 'ha', 'harri', 'harvard', 'hate', 'hi', 'hilton', 'honda', 'imposs', 'joli', 'just', 'know', 'laker', 'left', 'like', 'littl', 'london', 'look', 'lot', 'love', 'm', 'macbook', 'make', 'miss', 'mission', 'mit', 'mountain', 'movi', 'need', 'new', 'oh', 'onli', 'pari', 'peopl', 'person', 'potter', 'purdu', 'realli', 'right', 'rock', 's', 'said', 'san', 'say', 'seattl', 'shanghai', 'stori', 'stupid', 'suck', 't', 'thi', 'thing', 'think', 'time', 'tom', 'toyota', 'ucla', 've', 'vinci', 'wa', 'want', 'way', 'whi', 'work']


We can also print the counts of each word in the vocabulary as follows.

In [23]:
# Sum up the counts of each vocabulary word
dist = np.sum(corpus_data_features_nd, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print(count, tag)

1179 aaa
485 amaz
1765 angelina
3170 awesom
2146 beauti
1694 becaus
2190 boston
2000 brokeback
423 citi
2003 code
481 cool
2031 cruis
439 d
2087 da
433 drive
1926 francisco
477 friend
452 fuck
1085 geico
773 good
571 got
1178 great
776 ha
2094 harri
2103 harvard
4492 hate
794 hi
2086 hilton
2192 honda
1098 imposs
1764 joli
1054 just
896 know
2019 laker
425 left
4080 like
507 littl
2233 london
811 look
421 lot
10334 love
1568 m
1059 macbook
631 make
1098 miss
1101 mission
1340 mit
2081 mountain
1207 movi
1220 need
459 new
551 oh
674 onli
2094 pari
1018 peopl
454 person
2093 potter
1167 purdu
2126 realli
661 right
475 rock
3914 s
495 said
2038 san
627 say
2019 seattl
1189 shanghai
467 stori
2886 stupid
4614 suck
1455 t
1705 thi
662 thing
1524 think
781 time
2117 tom
2028 toyota
2008 ucla
774 ve
2001 vinci
3703 wa
1656 want
932 way
547 whi
512 work


In [32]:
# remember that corpus_data_features_nd contains all of our original train and test data, so we need to exclude
# the unlabeled test entries
X_train, X_test, y_train, y_test  = train_test_split(
    corpus_data_features_nd[0:len(train_data_df)], 
    train_data_df.Sentiment,
    train_size=0.3, 
    random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2125, 85) (4961, 85) (2125,) (4961,)




In [37]:
vocab_sz=2329

In [41]:
model = Sequential()
model.add(Embedding(vocab_sz,EMBED_SIZE, input_length=maxlen))
#model.add(SpatialDropout1D(Dropout(0.2)))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(X_test, y_test))              

ValueError: Error when checking target: expected dense_2 to have shape (None, 2) but got array with shape (2125, 1)