In [1]:
from google.colab import files

# Ask Colab for files to upload; the result is indexed by file name below.
uploaded = files.upload()

# Echo each uploaded file's name together with the length of its contents.
for fn in uploaded:
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

Saving ML-tweets.txt to ML-tweets.txt
User uploaded file "ML-tweets.txt" with length 1281503 bytes


In [1]:

from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.callbacks import LambdaCallback
from keras.preprocessing import sequence
import tensorflow as tf
import random, os
import numpy as np
from tqdm import tqdm

import argparse

# Module-level placeholders; every one of them starts out unset.
FLAGS = char_idx = idx_char = chars = None

# Seed NumPy's global random generator so repeated runs are reproducible.
np.random.seed(24)


def string_to_semi_redundant_sequences(text, seq_maxlen=128, redun_step=3, char_idx=None):
    """One-hot encode overlapping character windows of ``text`` for next-character training.

    A window of ``seq_maxlen`` characters is taken every ``redun_step``
    characters; the character immediately following each window is its target.

    Args:
        text: The corpus, indexed and sliced character by character.
        seq_maxlen: Number of characters in every input window.
        redun_step: Stride between the starts of consecutive windows.
        char_idx: Optional existing ``{character: index}`` mapping to reuse.
            When ``None`` a mapping is built from the sorted distinct
            characters of ``text``.

    Returns:
        A tuple ``(X, Y, char_idx)`` where ``X`` is a boolean array of shape
        ``(n_windows, seq_maxlen, len(char_idx))``, ``Y`` is a boolean array of
        shape ``(n_windows, len(char_idx))`` and ``char_idx`` is the mapping
        that was used. ``n_windows`` is 0 when ``text`` is not longer than
        ``seq_maxlen``.

    Raises:
        KeyError: If a supplied ``char_idx`` lacks a character that occurs in
            an encoded window or target.
    """
    print("Vectorizing text...")

    if char_idx is None:
        # Sorting makes the mapping deterministic for a given text.
        char_idx = {c: i for i, c in enumerate(sorted(set(text)))}

    len_chars = len(char_idx)

    sequences = []
    next_chars = []
    for i in range(0, len(text) - seq_maxlen, redun_step):
        sequences.append(text[i: i + seq_maxlen])
        next_chars.append(text[i + seq_maxlen])

    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    X = np.zeros((len(sequences), seq_maxlen, len_chars), dtype=bool)
    Y = np.zeros((len(sequences), len_chars), dtype=bool)
    # enumerate() has no len(), so pass the total explicitly to let the
    # progress bar show a percentage and an ETA.
    for i, seq in tqdm(enumerate(sequences), total=len(sequences)):
        for t, char in enumerate(seq):
            X[i, t, char_idx[char]] = 1
        Y[i, char_idx[next_chars[i]]] = 1

    print("Text total length: {:,}".format(len(text)))
    print("Distinct chars   : {:,}".format(len_chars))
    print("Total sequences  : {:,}".format(len(sequences)))

    return X, Y, char_idx
  


def create_model(maxlen, input_length):
    """Build and compile the character-level next-character classifier.

    The network is a single 512-unit ``CuDNNLSTM`` layer, 20% dropout, and a
    dense softmax layer with one output per character.

    Args:
        maxlen: Number of timesteps (characters) in every input window.
        input_length: Size of the one-hot character vector; used both as the
            per-timestep feature size and as the number of output classes.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(CuDNNLSTM(512, input_shape=(maxlen, input_length)))
    model.add(Dropout(0.2))
    model.add(Dense(input_length))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    return model
    
def gen(model,sentence,maxlen,char_idx,reslen):
    """Greedily generate ``reslen`` characters after ``sentence`` and print the result.

    The seed is one-hot encoded into a window of ``maxlen`` timesteps. At each
    step the highest-scoring character is appended to the output and pushed
    into the window, while the oldest timestep is dropped.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns one score per
            character for a batch of shape ``(1, maxlen, len(char_idx))``.
        sentence: Seed text. Only its last ``maxlen`` characters are fed to
            the model; the full text is still printed.
        maxlen: Number of timesteps in the model's input window.
        char_idx: ``{character: index}`` mapping used to encode the seed and
            decode the predictions.
        reslen: Number of characters to generate.

    Returns:
        None. ``sentence`` followed by the generated characters is printed.

    Raises:
        KeyError: If the part of ``sentence`` fed to the model contains a
            character missing from ``char_idx``.
    """
    # Decode with the inverse of the mapping used for encoding instead of
    # rebuilding it from a notebook-level global.
    idx_char = {i: c for c, i in char_idx.items()}

    input_length = len(char_idx)
    generated = ''

    # Keep only the most recent ``maxlen`` characters so an over-long seed
    # cannot index past the window.
    seed = sentence[len(sentence) - maxlen:] if len(sentence) > maxlen else sentence
    # Right-align the seed: generated characters are appended at the end of
    # the window, so the seed must sit directly before them rather than being
    # separated from the prediction point by blank timesteps.
    offset = maxlen - len(seed)

    # Builtin ``bool``: the ``np.bool`` alias was removed in NumPy 1.24.
    x = np.zeros((1, maxlen, input_length), dtype=bool)
    for t, char in enumerate(seed):
        x[0, offset + t, char_idx[char]] = 1

    for _ in range(reslen):
        preds = int(np.argmax(model.predict(x, verbose=0)))
        next_char = idx_char[preds]

        generated += next_char
        # Slide the window: drop the oldest timestep, append the new character.
        act = np.zeros((1, 1, input_length), dtype=bool)
        act[0, 0, preds] = 1
        x = np.concatenate((x[:, 1:, :], act), axis=1)

    print(sentence + generated)
        

# Number of characters per window; passed as seq_maxlen when encoding and as
# the model's timestep count when building the network and generating text.
maxlen = 140
# Training corpus; the name matches the file reported by the upload cell.
path = 'ML-tweets.txt'
with open(path) as f:
  text = f.read()
  
# One-hot encode overlapping windows of the corpus and their next characters.
X, Y, char_idx = string_to_semi_redundant_sequences(text, seq_maxlen=maxlen)

# Build the network sized to the vocabulary, train for a single epoch, and
# write the trained model to disk.
model = create_model(maxlen, len(char_idx))
model.fit(X, Y, epochs=1, batch_size=512)
model.save('lstm-keras.h5')


# Print 140 generated characters continuing the seed "What ".
gen(model, "What ", maxlen, char_idx,140)


    

Using TensorFlow backend.


Vectorizing text...


427121it [00:22, 18995.84it/s]


Text total length: 1,281,503
Distinct chars   : 176
Total sequences  : 427,121
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 512)               1413120   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 176)               90288     
_________________________________________________________________
activation_1 (Activation)    (None, 176)               0         
Total params: 1,503,408
Trainable params: 1,503,408
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
What  #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Machine #Mac
