In [0]:
from keras.models import Sequential   #DeepLearning Libraries
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.utils import np_utils

In [0]:
from keras.preprocessing.text import Tokenizer   #preprocessing
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
train_image = drive.CreateFile({'id' : '18keD3rntMU1ErXYXYR6jKTe2VXZPa0VU'})
train_image.GetContentFile('train_pos_full.txt')
train_image = drive.CreateFile({'id' : '1_E6c3Qe05hwSZzy0goTYrmB2wb-5Y2tb'})
train_image.GetContentFile('train_neg_full.txt')

In [0]:
pos_tweet = [line.rstrip('\n') for line in open('train_pos_full.txt')] #Eureka
neg_tweet = [line.rstrip('\n') for line in open('train_neg_full.txt')]

In [6]:
neg_tweet[:5]

['vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>',
 'glad i dot have taks tomorrow ! ! #thankful #startho',
 '1-3 vs celtics in the regular season = were fucked if we play them in the playoffs',
 "<user> i could actually kill that girl i'm so sorry ! ! !",
 '<user> <user> <user> i find that very hard to believe im afraid']

In [7]:
pos_tweet[:5]

['<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15',
 "because your logic is so dumb , i won't even crop out your name or your photo . tsk . <url>",
 '" <user> just put casper in a box ! " looved the battle ! #crakkbitch',
 "<user> <user> thanks sir > > don't trip lil mama ... just keep doin ya thang !",
 'visiting my brother tmr is the bestest birthday gift eveerrr ! ! !']

In [9]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords
    Downloading package stopwords to /content/nltk_data...
      Unzipping corpora/stopwords.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [0]:
posneg_no_stop = []
bwords = stopwords.words('english')

In [0]:
for line in pos_tweet :   #Removing stopwords from postive tweets
  sum = ''
  for w in line.split() :
    if w not in bwords :
      sum = sum + w + ' '
  posneg_no_stop.append(sum.strip())

for line in neg_tweet :    #Removing stopwords from negative tweets
  sum = ''
  for w in line.split() :
    if w not in bwords :
      sum = sum + w + ' '
  posneg_no_stop.append(sum.strip())

**All tweets appended to a single list so that a single tokenizer could be used on all samples**

In [17]:
posneg_no_stop[0:5]

['<user> dunno justin read mention . justin god knows , hope follow #believe 15',
 'logic dumb , even crop name photo . tsk . <url>',
 '" <user> put casper box ! " looved battle ! #crakkbitch',
 '<user> <user> thanks sir > > trip lil mama ... keep doin ya thang !',
 'visiting brother tmr bestest birthday gift eveerrr ! ! !']

In [18]:
posneg_no_stop[1250000:1250005]

['vinco tresorpack 6 ( difficulty 10 10 object : disassemble reassemble wooden pieces beautiful wo ... <url>',
 'glad dot taks tomorrow ! ! #thankful #startho',
 '1-3 vs celtics regular season = fucked play playoffs',
 "<user> could actually kill girl i'm sorry ! ! !",
 '<user> <user> <user> find hard believe im afraid']

**Text tokenization & sequence development for feeding to the ConvNets**

In [0]:
token = Tokenizer()

In [0]:
token.fit_on_texts(posneg_no_stop)

In [22]:
len(token.word_index)

508195

In [0]:
seq = token.texts_to_sequences(posneg_no_stop)

In [29]:
seq[:10]   #word vectors

[[1, 1729, 407, 311, 936, 407, 175, 695, 60, 15, 266, 389],
 [2998, 1612, 83, 10349, 248, 503, 6285, 2],
 [1, 236, 18533, 571, 93835, 2193, 209563],
 [1, 1, 25, 1730, 744, 478, 1296, 160, 1703, 203, 3039],
 [4231, 387, 2309, 4655, 111, 967, 74220],
 [1, 465, 209564, 99, 622, 132, 14, 17],
 [1,
  8593,
  61,
  18171,
  72,
  208,
  93836,
  72,
  1645,
  2985,
  1431,
  7,
  640,
  111607,
  7,
  21410],
 [4139, 193, 2833, 4139, 4, 1, 55076, 762, 16979, 1],
 [1, 272, 49, 3810, 344],
 [5616]]

In [0]:
p = 0
for i in seq :
  if len(i) > p :
    p = len(i)

In [0]:
seq = pad_sequences(seq,maxlen = p+1,padding = 'post', value = 0)

In [35]:
seq.shape

(2500000, 65)

In [36]:
seq

array([[    1,  1729,   407, ...,     0,     0,     0],
       [ 2998,  1612,    83, ...,     0,     0,     0],
       [    1,   236, 18533, ...,     0,     0,     0],
       ...,
       [    1, 26737,   782, ...,     0,     0,     0],
       [ 8672,  3201,  1992, ...,     0,     0,     0],
       [    1,  7341,  1647, ...,     0,     0,     0]], dtype=int32)

**Data Preparation and train test split   (1 : positive tweet  | 0 : Negative Tweet)  **

In [0]:
X = np.asmatrix(seq)

In [0]:
y = np.ones((1250000,1))

In [0]:
y = np.append(y,np.zeros((1250000,1)))

In [0]:
y = np.asmatrix(np.reshape(y,(2500000,1)))

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 92)

**ConvNet Architectures (without embedding layer)**

In [0]:
X_train = np.array(X_train).reshape((2000000,65,1))
X_test = np.array(X_test).reshape((500000,65,1))


In [0]:
X_train_E = np.array(X_train).reshape((2000000,65))
X_test_E = np.array(X_test).reshape((500000,65))

In [136]:
X_train.shape

(2000000, 65, 1)

In [116]:
model = Sequential()
model.add(Conv1D(32, 3, input_shape=(65,1), padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(64, 3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Flatten())
model.add(Dense(800, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_61 (Conv1D)           (None, 65, 32)            128       
_________________________________________________________________
max_pooling1d_61 (MaxPooling (None, 32, 32)            0         
_________________________________________________________________
conv1d_62 (Conv1D)           (None, 32, 64)            6208      
_________________________________________________________________
max_pooling1d_62 (MaxPooling (None, 16, 64)            0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 1024)              0         
_________________________________________________________________
dense_33 (Dense)             (None, 800)               820000    
_________________________________________________________________
dropout_25 (Dropout)         (None, 800)               0         
__________

In [117]:
model.fit(X_train, y_train, validation_split = 0.2, epochs=5, batch_size=1000)

Train on 1600000 samples, validate on 400000 samples
Epoch 1/5
Epoch 2/5
 163000/1600000 [==>...........................] - ETA: 30s - loss: 0.6641 - acc: 0.5975

Epoch 3/5

Epoch 4/5

Epoch 5/5



<keras.callbacks.History at 0x7f40cd3acb38>

In [118]:
scores = model.evaluate(X_test,y_test,verbose = 0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 60.16%


**ConvNet Architectures (with embedding layer)**

In [121]:
token.word_index

{'user': 1,
 'url': 2,
 "i'm": 3,
 'rt': 4,
 'love': 5,
 'like': 6,
 '3': 7,
 'get': 8,
 'frame': 9,
 'u': 10,
 'lol': 11,
 'good': 12,
 'one': 13,
 'know': 14,
 'follow': 15,
 'go': 16,
 'please': 17,
 'day': 18,
 '2': 19,
 "'": 20,
 'see': 21,
 'want': 22,
 'back': 23,
 '1': 24,
 'thanks': 25,
 'x': 26,
 'time': 27,
 'got': 28,
 'really': 29,
 'today': 30,
 "can't": 31,
 'im': 32,
 'haha': 33,
 'going': 34,
 'think': 35,
 'miss': 36,
 'new': 37,
 'need': 38,
 'much': 39,
 'well': 40,
 'would': 41,
 'make': 42,
 '4': 43,
 'paperback': 44,
 'come': 45,
 'still': 46,
 'oh': 47,
 'thank': 48,
 "i'll": 49,
 'best': 50,
 'night': 51,
 'never': 52,
 'happy': 53,
 'wish': 54,
 '5': 55,
 'right': 56,
 'tomorrow': 57,
 'work': 58,
 "that's": 59,
 'hope': 60,
 'feel': 61,
 'people': 62,
 'black': 63,
 'yeah': 64,
 'gonna': 65,
 'wanna': 66,
 'xx': 67,
 'say': 68,
 'picture': 69,
 'home': 70,
 'pack': 71,
 'life': 72,
 'complete': 73,
 'school': 74,
 'great': 75,
 'way': 76,
 'always': 77,
 'las

In [127]:
model1 = Sequential()
model1.add(Embedding(508195, 10, input_length=65))
model1.add(Conv1D(32, 3, input_shape=(65,10), padding='same', activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(Conv1D(64, 3, padding='same', activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(Flatten())
model1.add(Dense(800, activation='relu'))
model1.add(Dropout(0.5))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model1.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 65, 10)            5081950   
_________________________________________________________________
conv1d_66 (Conv1D)           (None, 65, 32)            992       
_________________________________________________________________
max_pooling1d_65 (MaxPooling (None, 32, 32)            0         
_________________________________________________________________
conv1d_67 (Conv1D)           (None, 32, 64)            6208      
_________________________________________________________________
max_pooling1d_66 (MaxPooling (None, 16, 64)            0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 1024)              0         
_________________________________________________________________
dense_37 (Dense)             (None, 800)               820000    
__________

In [138]:
model1.fit(X_train_E, y_train, validation_split = 0.2, epochs=5, batch_size=1000)

Train on 1600000 samples, validate on 400000 samples
Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



<keras.callbacks.History at 0x7f40e6721a58>

In [140]:
scores = model1.evaluate(X_test_E,y_test,verbose = 0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 81.99%
