# Code for Generating the Model 

## Libraries used are 
* nltk
* pandas
* keras
* numpy

In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed
from nltk.tokenize import TweetTokenizer

## Reading the Tweets 
We read the tweets off the data.csv file, where all the tweets are kept. 
Then we tokenize the tweets with NLTK's tweet tokenizer, which strips off the handles and shortens over-long runs of repeated characters. 
Next we run a list comprehension over the tokens, lower-casing them and keeping only the purely alphabetic ones; this drops tokens such as hashtags, numbers and punctuation marks. This is to reduce the character space and hence save computation for faster and more effective training. 
We then join the remaining tokens with spaces to get back a string, which we convert into a list of characters. 
Then we find the list of unique characters by converting the list of characters into a set. 

In [None]:
# Load the raw tweets. Missing entries are dropped first: str() would turn a
# missing value into the literal word "nan", which passes the isalpha filter
# below and would end up in the training text.
data = pd.read_csv("data.csv")
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
tweets = [str(i) for i in data["Text"].dropna().tolist()]

# Tokenise the whole corpus at once, then keep only purely alphabetic tokens,
# lower-cased. Tokens containing any other character are dropped entirely.
text = tknzr.tokenize(" ".join(tweets))
text = [i.lower() for i in text if i.isalpha()]
tweet = " ".join(text)

# NOTE: `data` is rebound here from the DataFrame to the corpus as a list of
# single characters; later cells rely on this second meaning.
data = list(tweet)

# Sorted so that the character <-> index mapping is identical on every run.
# Plain list(set(...)) ordering depends on string hash randomisation, which
# would make a saved model impossible to pair with its vocabulary later.
chars = sorted(set(data))

## Converting Text into NumPy Array Sequences. 
The size of the character space is 36. 
First we build two lookup dictionaries over the variable ***chars***: `charOfIndex` maps each character to its index, and `indexOfChar` maps each index back to its character (note that the names read the opposite way round from what they hold). Next we represent each character sequence as a NumPy array of shape (Sequence Length × Character Space), one-hot encoded along axis=1: the entry at the character's index is set to 1 and all other values are kept as zeros. The Y sequence is the X sequence shifted one character ahead. 

In [None]:
# Lookup tables between characters and their integer ids.
# NOTE: the names read backwards relative to their contents and are kept only
# because other cells refer to them:
#   indexOfChar : index -> character
#   charOfIndex : character -> index
indexOfChar = dict(enumerate(chars))
charOfIndex = {char: ix for ix, char in enumerate(chars)}

length = 140  # characters per training sequence
charspace = len(chars)

# Each target sequence is its input shifted one character ahead, so sequence i
# needs data[(i + 1) * length] to exist. (len(data) - 1) // length guarantees
# that; a plain len(data) // length runs one character past the end of `data`
# for the last target whenever len(data) is an exact multiple of `length`.
num_sequences = max((len(data) - 1) // length, 0)

# Integer-encode only the prefix that is used (+1 for the final target char).
encoded = np.array(
    [charOfIndex[value] for value in data[:num_sequences * length + 1]],
    dtype=np.intp,
)
X_ix = encoded[:-1].reshape(num_sequences, length)
y_ix = encoded[1:].reshape(num_sequences, length)

# One-hot encode: X[s, t, c] == 1 iff character t of sequence s has index c.
# y is built the same way from the shifted indices.
X = np.zeros((num_sequences, length, charspace))
y = np.zeros((num_sequences, length, charspace))
seq_ix = np.arange(num_sequences)[:, None]
pos_ix = np.arange(length)
X[seq_ix, pos_ix, X_ix] = 1.
y[seq_ix, pos_ix, y_ix] = 1.

## Building the Model
The model consists of four LSTM layers, with a dropout of 0.2 after the last three layers. We use the softmax activation function as this is a multiclass classification problem, where we try to find the character immediately after the given sequence. 

In [None]:
# Stacked character-level LSTM: one 1200-unit layer followed by three smaller
# layers (1000, 900, 800 units), each of those followed by 20% dropout.
# Every LSTM returns the full sequence so that a prediction is produced for
# each time step; input_shape=(None, charspace) leaves the sequence length
# open, which lets generation feed prefixes of any length.
model = Sequential()
model.add(LSTM(1200, input_shape=(None, charspace), return_sequences=True))
for units in (1000, 900, 800):
    model.add(LSTM(units, return_sequences=True))
    model.add(Dropout(0.2))

# Per-time-step projection onto the character space, then softmax to obtain a
# distribution over the next character at each position.
model.add(TimeDistributed(Dense(charspace)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

## Generator Function 

In [None]:
def generate_text(model, length, charspace, indexOfChar):
    """Greedily generate text from ``model``, printing it as it is produced.

    Generation starts from a uniformly random character index. At each step
    the one-hot encoded prefix built so far is passed to ``model.predict``
    and the highest-scoring index at the last time step becomes the next
    character.

    Args:
        model: object whose ``predict`` takes an array of shape
            ``(1, t, charspace)`` and returns an array whose first element
            holds one row of scores per time step.
        length: number of prediction steps (and of characters printed).
        charspace: number of distinct characters.
        indexOfChar: mapping from integer index to character.

    Returns:
        The seed character followed by the ``length`` predicted characters,
        i.e. a string of ``length + 1`` characters. Only the first ``length``
        of them are printed.
    """
    # Seed the sequence with a random character.
    current = np.random.randint(charspace)
    generated = [indexOfChar[current]]
    one_hot = np.zeros((1, length, charspace))

    for step in range(length):
        # Append the latest character to the input and echo it.
        one_hot[0, step, current] = 1
        print(indexOfChar[current], end="")

        # Re-run the model on the whole prefix; keep the last step's argmax.
        step_scores = model.predict(one_hot[:, :step + 1, :])[0]
        current = np.argmax(step_scores, 1)[-1]
        generated.append(indexOfChar[current])

    return "".join(generated)

Train the model for 200 epochs, and try generating sequences after every epoch to check progress. Save model when done.

In [None]:
# Train one epoch at a time so that a sample can be generated after each
# epoch to monitor progress; the model is written to disk once at the end.
NUM_EPOCHS = 200
SAMPLE_LENGTH = 140

for epochs in range(NUM_EPOCHS):
    print('\n\nEpoch: {}\n'.format(epochs))
    model.fit(X, y, batch_size=64, verbose=1, epochs=1)
    generate_text(model, SAMPLE_LENGTH, charspace, indexOfChar)
epochs = NUM_EPOCHS  # leave the counter at the number of completed epochs

model.save("model.h5")