In [36]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Exploratory Data Analysis

In [29]:
tweet_data = pd.read_json('../data/realdonaldtrump.ndjson', lines=True)

In [30]:
shape = tweet_data.shape
print(f"there are {shape[0]} records/tweets")
print(f"there are {shape[1]} columns:")
tweet_data.columns

there are 40241 records
there are 34 columns:


Index(['contributors', 'coordinates', 'created_at', 'entities',
       'extended_entities', 'favorite_count', 'favorited', 'geo', 'id',
       'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place',
       'possibly_sensitive', 'quoted_status', 'quoted_status_id',
       'quoted_status_id_str', 'retrieved_utc', 'retweet_count', 'retweeted',
       'retweeted_status', 'scopes', 'source', 'text', 'truncated', 'user',
       'withheld_copyright', 'withheld_in_countries', 'withheld_scope'],
      dtype='object')

In [27]:
tweet_data['text'].head()

0    Be sure to tune in and watch Donald Trump on L...
1    Donald Trump will be appearing on The View tom...
2    Donald Trump reads Top Ten Financial Tips on L...
3    New Blog Post: Celebrity Apprentice Finale and...
4    "My persona will never be that of a wallflower...
Name: text, dtype: object

In [31]:
tweets = tweet_data['text']

In [33]:
tweets.head()

0    Be sure to tune in and watch Donald Trump on L...
1    Donald Trump will be appearing on The View tom...
2    Donald Trump reads Top Ten Financial Tips on L...
3    New Blog Post: Celebrity Apprentice Finale and...
4    "My persona will never be that of a wallflower...
Name: text, dtype: object

In [47]:
#figure out the tweet with the most words to tigure out max_length for sequences
max_seq_length = max(tweets.apply(lambda x: len(x.split())))
print(f"Setting max_seq_length as {max_seq_length}")

Setting max_seq_length as 32


### Tokenization

In [59]:
embedding_dims = 32
trunc_type='post'
padding_type='pre'
oov_tok = "<OOV>"

In [38]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)

In [41]:
word_index = tokenizer.word_index
vocab_size=len(word_index)
print(f"There are {vocab_size} different words in the corpus")

There are 54258 different words in the corpus


### Creation of Sequences

In [54]:
sequences = tokenizer.texts_to_sequences(tweets)

In [60]:
#padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding=padding_type, truncating=trunc_type)

In [68]:
input_sequences = []

for i in range(1, len(sequences)):
    curr_sequence = sequences[i]
    for j in range(len(curr_sequence)):
        n_gram_sequence = curr_sequence[:j+1]
        input_sequences.append(n_gram_sequence)

In [72]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_length, padding=padding_type))
print(f"The input sequence tensor shape is: {input_sequences.shape}")

In [83]:
# create predictors and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

#this doesn't fit in memory, keeping as int and using sparse_categorical_crossentropy
ys =  labels#tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [84]:
print(f"The shape of x tensor is {xs.shape}")
print(f"The shape of y tensor is {ys.shape}")

The shape of x tensor is (724195, 31)
The shape of y tensor is (724195,)


### Modeling

In [86]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [88]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=max_seq_length-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(vocab_size, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
history = model.fit(xs, ys, epochs=100, verbose=1)
#print model.summary()
print(model)

Instructions for updating:
Colocations handled automatically by placer.
Epoch 1/100
 37376/724195 [>.............................] - ETA: 44:47 - loss: 9.4066 - acc: 0.0205

KeyboardInterrupt: 