In [1]:
%load_ext autoreload
%autoreload 2
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/NLP

Mounted at /gdrive
/gdrive/My Drive/NLP


In [2]:
%pip install tensorflow



In [3]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# set seeds for reproducability
from tensorflow.random import set_seed
from numpy.random import seed
set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


In [4]:
lines = []
with open("data/BarackObama/train.txt") as file:
    for line in file:
        lines.append(line.strip())



In [5]:
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(lines)


In [6]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [7]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 10)           108050    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 10805)             1091305   
                                                                 
Total params: 1,243,755
Trainable params: 1,243,755
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(predictors, label, epochs=50, verbose=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f4517aaaad0>

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    randNum = np.random.randint(0, len(tokenizer.word_index.keys()))
    seed_text += list(tokenizer.word_index.keys())[randNum]
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted)
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        if output_word == 'eos':
            break
        seed_text += " "+output_word
    return seed_text

generated_tweets = []
for i in range(20):
    generated_tweets.append(generate_text("", max_sequence_len - 1, model, max_sequence_len))

generated_tweets

['biden fact the first year of the union address president obama is taking action on the payroll tax cut',
 'severely insurance meeting at the white house watch live at pm et',
 'knows the president on the united states and invest in chief we will be you',
 'twist president obama on the first sitting important to be a little issue that we need to do everything we can do it',
 'mccain the president will speak about the importance of the importance of congress in congress watch live at pm et',
 "prekforall don't miss this chance to join the fight for change",
 'join the deadline to enroll on the agenda of the affordable care act',
 'bridge president obama is speaking about the economy to discuss the union address at stake in the oval office',
 "side's the president is taking a fair shot for the supreme court",
 'necessity in the oval office is a big deal that means to get out the way to get the covid 19 vaccine and earth to get involved in the fight of gerrymandering and the obamafoundat

In [None]:
model.save('models/Obama')



INFO:tensorflow:Assets written to: models/Obama/assets


INFO:tensorflow:Assets written to: models/Obama/assets
