# Task 1 CNN for Text Classification

# Introduction

This part is task 1 of HW3 focusing on the first option: Text Classification. We developed a CNN for the text classification NLP task.


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from sklearn.utils import shuffle

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers import Embedding, Conv1D, Dropout, MaxPooling1D, Flatten, Dense
from keras import regularizers

Using TensorFlow backend.


In [6]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

## data loading and preprocessing

In [7]:
df_n = pd.read_table('/Users/jianwenliu/nlp/rt-polaritydata/rt-polaritydata/rt-polarity.neg', names=['review'],  header=None, encoding='latin-1')
df_p = pd.read_table('/Users/jianwenliu/nlp/rt-polaritydata/rt-polaritydata/rt-polarity.pos', names=['review'],  header=None, encoding='latin-1')


  """Entry point for launching an IPython kernel.
  


In [8]:
df_n.head(10)

Unnamed: 0,review
0,"simplistic , silly and tedious ."
1,"it's so laddish and juvenile , only teenage bo..."
2,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...
4,a visually flashy but narratively opaque and e...
5,"the story is also as unoriginal as they come ,..."
6,about the only thing to give the movie points ...
7,not so much farcical as sour .
8,unfortunately the story and the actors are ser...
9,all the more disquieting for its relatively go...


label the review with 0 and 1

In [9]:
df_n['lable'] = 0
df_p['lable'] = 1

Concate and shuffle the dataset, making it 50-50

In [10]:
df = shuffle(pd.concat((df_n, df_p), axis=0), random_state=7)

In [11]:
df

Unnamed: 0,review,lable
1777,this is an egotistical endeavor from the daugh...,0
2272,where last time jokes flowed out of cho's life...,0
155,everything that has to do with yvan and charlo...,1
4137,an affectionately goofy satire that's unafraid...,1
2428,i was impressed by how many tit-for-tat retali...,1
3039,"what [denis] accomplishes in his chilling , un...",1
2411,a journey that's too random and inconclusive t...,0
3626,promises is one film that's truly deserving of...,1
2463,all this turns out to be neither funny nor pro...,0
2302,"the stripped-down dramatic constructs , auster...",1


In [12]:
df.shape

(10662, 2)

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['review'])

In [14]:
sequences = tokenizer.texts_to_sequences(df['review'])

In [15]:
max_len = max([len(seq) for seq in sequences])
max_len

51

Padding the sequences

In [16]:
sequences = pad_sequences(sequences, maxlen=max_len, dtype='int32', padding='pre', truncating='pre', value=0)

In [17]:
sequences

array([[   0,    0,    0, ...,    9, 1022, 4625],
       [   0,    0,    0, ..., 1074, 1023, 3890],
       [   0,    0,    0, ...,   64,    3,  281],
       ...,
       [   0,    0,    0, ...,   14,    1,  941],
       [   0,    0,    0, ...,  282, 4424,  147],
       [   0,    0,    0, ...,   93, 1003,  369]], dtype=int32)

get thr dictionaty for the word index

In [16]:
tokenizer.word_index

{'the': 1,
 'a': 2,
 'and': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'as': 10,
 'but': 11,
 'with': 12,
 'film': 13,
 'for': 14,
 'this': 15,
 'its': 16,
 'an': 17,
 'movie': 18,
 "it's": 19,
 'be': 20,
 'on': 21,
 'you': 22,
 'not': 23,
 'by': 24,
 'one': 25,
 'like': 26,
 'about': 27,
 'more': 28,
 'has': 29,
 'are': 30,
 'at': 31,
 'than': 32,
 'from': 33,
 'all': 34,
 'his': 35,
 'have': 36,
 'so': 37,
 'if': 38,
 'or': 39,
 'story': 40,
 'too': 41,
 'i': 42,
 'out': 43,
 'just': 44,
 'who': 45,
 'up': 46,
 'good': 47,
 'into': 48,
 'what': 49,
 'most': 50,
 'no': 51,
 'much': 52,
 'even': 53,
 'comedy': 54,
 'time': 55,
 'will': 56,
 'can': 57,
 'some': 58,
 'well': 59,
 'characters': 60,
 'only': 61,
 'little': 62,
 'way': 63,
 'funny': 64,
 'their': 65,
 'director': 66,
 'make': 67,
 'been': 68,
 'your': 69,
 'enough': 70,
 'very': 71,
 'never': 72,
 'when': 73,
 'there': 74,
 'makes': 75,
 'life': 76,
 'bad': 77,
 'may': 78,
 'which': 79,
 'us': 80,
 'b

get the voc size

In [17]:
voc_size=len(tokenizer.word_index)

In [18]:
voc_size

19498

## CNN model

Build the CNN model

In [19]:
def conv_model(output_dim=64, filter_size=128, window_size=3, stride=1, pool_size=2, dense_dim=16):

    model = Sequential()
    model.add(Embedding(voc_size+1, output_dim, input_length=max_len, embeddings_initializer='random_uniform'))
    
    # conv1D, same padding
    model.add(Conv1D(filter_size, kernel_size=(output_dim*window_size),padding='same', strides=(stride), activation='relu'))
    
    #dropout1
    model.add(Dropout(0.6))
    
    #pooling
    model.add(MaxPooling1D(pool_size=(pool_size), padding='same'))
    
    # flatten
    model.add(Flatten())
    
    # fully connected layer
    model.add(Dense(dense_dim,activation='relu'))
    
    #dropout2
    model.add(Dropout(0.6))
    
    # output layer
    model.add(Dense(1, kernel_regularizer=regularizers.l2(0.01),activity_regularizer=regularizers.l1(0.01),activation='sigmoid'))
    
    #use rmsprop
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    return model
    
conv_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 51, 64)            1247936   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 51, 128)           1572992   
_________________________________________________________________
dropout_1 (Dropout)          (None, 51, 128)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 26, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3328)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                53264     
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
__________

In [20]:
EPOCHS = 10
BATCH_SIZE = 128
fold = 8

model = conv_model()
X = sequences
y = np.array(df.lable.tolist())

K-Fold for validation

In [21]:
kfold = StratifiedKFold(n_splits=fold, shuffle=True, random_state=7)

## Model performance : prediction accuracy

In [22]:
acr = list()
i = 0

#validation
for train_index, valid_index in kfold.split(X, y):
    i += 1
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    
    #model
    model = conv_model()
    model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2, validation_data=(X_valid, y_valid),callbacks=[EarlyStopping(patience=1, monitor='val_loss')])
    
    #prediction
    y_bar = model.predict(X_valid)
    y_pred = [round(pred) for pred in y_bar.reshape(-1)]
    
    #accuracy
    acr.append(accuracy_score(y_valid, y_pred))

Train on 9328 samples, validate on 1334 samples
Epoch 1/10
 - 106s - loss: 1.2014 - acc: 0.4994 - val_loss: 1.0976 - val_acc: 0.5000
Epoch 2/10
 - 104s - loss: 1.0123 - acc: 0.6307 - val_loss: 1.0101 - val_acc: 0.7324
Epoch 3/10
 - 110s - loss: 0.8693 - acc: 0.8201 - val_loss: 0.9833 - val_acc: 0.7459
Epoch 4/10
 - 108s - loss: 0.7825 - acc: 0.8814 - val_loss: 1.0180 - val_acc: 0.7564
Train on 9328 samples, validate on 1334 samples
Epoch 1/10
 - 101s - loss: 1.1940 - acc: 0.4994 - val_loss: 1.0855 - val_acc: 0.5000
Epoch 2/10
 - 100s - loss: 0.9795 - acc: 0.6946 - val_loss: 1.0306 - val_acc: 0.6829
Epoch 3/10
 - 104s - loss: 0.8539 - acc: 0.8406 - val_loss: 1.0592 - val_acc: 0.7429
Train on 9328 samples, validate on 1334 samples
Epoch 1/10
 - 101s - loss: 1.2376 - acc: 0.4994 - val_loss: 1.1734 - val_acc: 0.5000
Epoch 2/10
 - 107s - loss: 1.0678 - acc: 0.5846 - val_loss: 1.0767 - val_acc: 0.7009
Epoch 3/10
 - 106s - loss: 0.9192 - acc: 0.8095 - val_loss: 1.0302 - val_acc: 0.7346
Epoch 

In [24]:
print('Average accuracy ', np.mean(np.array(acr)))

Average accuracy  0.7432940342891616


# Conclusion

We can find out from the accuracy that CNN has a acceptable prediction performance, reaching 0.74. However, the speed to train the CNN model is so slow which takes 110s for each epoch. In practice, this will be a obvious drawback.