In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM

# Seed every random number generator the notebook relies on.
# np.random.seed only controls NumPy; Keras weight initialisation and dropout
# draw from TensorFlow's generator, which has to be seeded separately.
SEED = 7
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# Show the GPUs TensorFlow can see (an empty list means none are visible).
print(tf.config.list_physical_devices(device_type='GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Read the tweet CSV; column names are supplied explicitly via `names`.
twitter = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['Target', 'ID', 'Date', 'Query', 'User', 'Tweet'],
    encoding='latin1',
)

# Replace the raw Target values with integer class ids and show the class balance.
le = LabelEncoder()
twitter['Target'] = le.fit(twitter['Target']).transform(twitter['Target'])
print(twitter['Target'].value_counts())

0    800000
1    800000
Name: Target, dtype: int64


In [4]:
# Preprocess: keep only ASCII letters, digits and whitespace in each tweet.
# The upper-case range is spelled A-Z (not A-z): in ASCII the A-z range also
# spans the characters [ \ ] ^ _ ` that sit between 'Z' and 'a', so they were
# never stripped. The raw string keeps \s from being parsed as a string escape.
twitter['Tweet'] = twitter['Tweet'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [5]:
# Labels: the encoded Target column as a NumPy array.
y = twitter['Target'].values
# Features: the Tweet column (still a pandas Series at this point).
X = twitter['Tweet']
print(X.shape)

(1600000,)


In [6]:
# Turn each tweet into a fixed-length sequence of integer word indices.
# The text is read from twitter['Tweet'] rather than from X: this cell rebinds
# X to a NumPy array, so reading X.values here would raise AttributeError the
# second time the cell is run.
# NOTE(review): the vocabulary is fitted on every tweet, before the train/test
# split further down, so test-set word frequencies influence the vocabulary.
tokenizer = Tokenizer(num_words=5000, lower=True, split=' ')
tokenizer.fit_on_texts(twitter['Tweet'].values)
X = tokenizer.texts_to_sequences(twitter['Tweet'].values)
# padding='post' appends zeros after the tokens; no maxlen is given, so every
# row is padded to the length of the longest sequence.
X = pad_sequences(X, padding='post')

In [7]:
# Dimensions of the padded index matrix: (number of tweets, sequence length).
np.shape(X)

(1600000, 40)

In [8]:
# Hyperparameters.
embed_dim = 128   # size of each word embedding vector
lstm_out = 128    # number of LSTM units
batch_size = 32   # mini-batch size, reused for training and evaluation

# Conv1D + LSTM binary classifier over the padded token sequences.
model = Sequential()
# The vocabulary size comes from the tokenizer instead of a repeated literal,
# so the embedding table cannot drift out of sync with the tokenizer settings.
model.add(Embedding(tokenizer.num_words, embed_dim, input_length=X.shape[1]))
# Local n-gram features; padding='same' keeps the sequence length unchanged.
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# Shorten the time axis by a factor of 4 before the recurrent layer.
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(lstm_out, dropout=0.2))
# Single sigmoid unit: one probability per tweet, paired with binary cross-entropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [9]:
# Print each layer with its output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           640000    
                                                                 
 conv1d (Conv1D)             (None, 40, 32)            12320     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 10, 32)           0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 734,881
Trainable params: 734,881
Non-trainable params: 0
__________________________________________________

In [10]:
# Hold out a stratified 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)

# Fit the network on the training portion for three epochs.
model.fit(x=X_train, y=y_train, epochs=3, batch_size=batch_size, verbose=1)

# Loss and accuracy on the held-out portion.
score, acc = model.evaluate(x=X_test, y=y_test, batch_size=batch_size, verbose=0)
print("Loss score: %.2f" % score)
print("Test Accuracy: %.2f" % (acc * 100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Loss score: 0.40
Test Accuracy: 81.85


In [11]:
# Sigmoid outputs of the trained model for every held-out sample.
y_pred = model.predict(x=X_test)

In [12]:
# Round the sigmoid outputs to the nearest integer class before scoring.
print(classification_report(y_true=y_test, y_pred=np.rint(y_pred)))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82    160000
           1       0.82      0.82      0.82    160000

    accuracy                           0.82    320000
   macro avg       0.82      0.82      0.82    320000
weighted avg       0.82      0.82      0.82    320000

