In [148]:
import numpy as np
import pandas as pd
import re
import os 
import random 
import spacy
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from keras.utils import to_categorical
from pickle import dump, load


df = pd.read_csv("./tweets_cunha.csv",encoding='utf-8')
df.head(5)
# Not gonna bother cleaning up the data because i'm only interested in the text colummn

Unnamed: 0.1,Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,0,DeputadoEduardoCunha,@DepEduardoCunha,2014-01-06T17:47:42.000Z,"Eduardo Cunha | Blog | Blatter, a razão e a 'm...",,,,,,[],https://twitter.com/DepEduardoCunha/status/420...
1,1,Lucio Vieira Lima,@luciovl_,2014-01-10T12:46:55.000Z,Replying to \r\n@DepEduardoCunha,@DepEduardoCunha\r\n Bom dia!,,1.0,,,[],https://twitter.com/luciovl_/status/4216241920...
2,2,DeputadoEduardoCunha,@DepEduardoCunha,2014-01-10T13:12:32.000Z,@luciovl_\r\n @DepEduardoCunha\r\n bom dia,,,3.0,,,[],https://twitter.com/DepEduardoCunha/status/421...
3,3,DeputadoEduardoCunha,@DepEduardoCunha,2014-01-10T12:44:07.000Z,"Eduardo Cunha | Blog | Presí­dios brasileiros,...",,,3.0,3.0,1.0,[],https://twitter.com/DepEduardoCunha/status/421...
4,4,DeputadoEduardoCunha,@DepEduardoCunha,2014-01-05T17:42:14.000Z,"Boa tarde a todos,ano mal começou e já está ag...",,,5.0,196.0,66.0,[],https://twitter.com/DepEduardoCunha/status/419...


# Tokenizing the data and Training the model

In [76]:
# Downloading trained pipelines for Portuguese if its not installed already
# !python -m spacy download pt_core_news_lg

nlp = spacy.load('pt_core_news_lg',disable=['parser', 'tagger','ner'])

In [78]:
doc_text = ''.join(str(df["Text"].tolist())

trash = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n\r\n~" ' # Stripping tweets
tokens = [re.sub(r'^https?:\/\/.*[\r\n]*', '', token.text.lower(), flags=re.MULTILINE) for token in nlp(doc_text) if token.text not in trash]

tokens = separate_punc(txt)
l_ats = ['\\r\\n@depeduardocunha','@depeduardocunha\\r\\n','\\r\\n@camaradeputados'] # Removing common @s that cause a lot of bias in the final result
tokens = list(filter(lambda a: a not in l_ats, tokens))
len(tokens)
tokens

['eduardo',
 'cunha',
 'blog',
 'blatter',
 'a',
 'razão',
 'e',
 'a',
 "'",
 'malandragem',
 "'",
 '',
 '…',
 'via',
 "'",
 'replying',
 'to',
 "'",
 "'",
 '@luciovl_\\r\\n',
 'bom',
 'dia',
 "'",
 "'",
 'eduardo',
 'cunha',
 'blog',
 'presí\\xaddios',
 'brasileiros',
 'verdadeiras',
 'bombas-relógio',
 '',
 '…',
 'via',
 "'",
 "'",
 'boa',
 'tarde',
 'a',
 'todos',
 'ano',
 'mal',
 'começou',
 'e',
 'já',
 'está',
 'agitado',
 "'",
 "'",
 'não',
 'podemos',
 'ficar',
 'de',
 'cócoras',
 'para',
 'o',
 'judiciário',
 "'",
 'diz',
 'o',
 'líder',
 'do',
 'pmdb',
 'eduardo',
 'cunha',
 '',
 '…',
 'via',
 "'",
 'terça',
 'irei',
 'a',
 'brasília',
 'para',
 'cuidar',
 'dessas',
 'confusões',
 'de',
 'emendas',
 'dos',
 'deps',
 'da',
 'bancada',
 "'",
 "'",
 'ontem',
 'estive',
 'em',
 'palmas',
 'prestigiando',
 'o',
 'dep',
 'junior',
 'coimbra',
 'junto',
 'de',
 'vários',
 'deps',
 "'",
 "'",
 'o',
 'ano',
 'será',
 'de',
 'trabalho',
 'intenso',
 "'",
 "'",
 'amanha',
 'estarei',
 '

In [129]:
# organize into sequences of tokens
train_len = 25 

text_sequences = [tokens[i-train_len+1:i] for i in range(train_len+1, len(tokens))]

In [130]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

tokenizer.index_word

sequences = np.array(tokenizer.texts_to_sequences(text_sequences))
sequences

array([[  21,   14,  231, ...,    1,   21,   14],
       [  14,  231, 7029, ...,   21,   14,  231],
       [ 231, 7029,    2, ...,   14,  231, 2027],
       ...,
       [   4, 7024,    5, ...,    2,  174,  100],
       [7024,    5,    2, ...,  174,  100, 7030],
       [   5,    2,  192, ...,  100, 7030, 7031]])

In [131]:
vocab_sz = len(tokenizer.word_counts)

X = sequences[:,:-1]
y = to_categorical(sequences[:,-1], num_classes=vocab_sz+1)

In [132]:
# Implementing a LSTM model
model = Sequential()

model.add(Embedding(vocab_sz+1, train_len, input_length=train_len))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(150, activation='relu'))
model.add(Dense(vocab_sz+1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 25, 25)            175800    
_________________________________________________________________
lstm_8 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_9 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_8 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_9 (Dense)              (None, 7032)              1061832   
Total params: 1,546,482
Trainable params: 1,546,482
Non-trainable params: 0
_________________________________________________________________


In [134]:
model.fit(X, y, batch_size=128, epochs=290, verbose=1)

Epoch 1/292
Epoch 2/292
Epoch 3/292
Epoch 4/292
Epoch 5/292
Epoch 6/292
Epoch 7/292
Epoch 8/292
Epoch 9/292
Epoch 10/292
Epoch 11/292
Epoch 12/292
Epoch 13/292
Epoch 14/292
Epoch 15/292
Epoch 16/292
Epoch 17/292
Epoch 18/292
Epoch 19/292
Epoch 20/292
Epoch 21/292
Epoch 22/292
Epoch 23/292
Epoch 24/292
Epoch 25/292
Epoch 26/292
Epoch 27/292
Epoch 28/292
Epoch 29/292
Epoch 30/292
Epoch 31/292
Epoch 32/292
Epoch 33/292
Epoch 34/292
Epoch 35/292
Epoch 36/292
Epoch 37/292
Epoch 38/292
Epoch 39/292
Epoch 40/292
Epoch 41/292
Epoch 42/292
Epoch 43/292
Epoch 44/292
Epoch 45/292
Epoch 46/292
Epoch 47/292
Epoch 48/292
Epoch 49/292
Epoch 50/292
Epoch 51/292
Epoch 52/292
Epoch 53/292
Epoch 54/292
Epoch 55/292
Epoch 56/292
Epoch 57/292
Epoch 58/292
Epoch 59/292
Epoch 60/292
Epoch 61/292
Epoch 62/292
Epoch 63/292
Epoch 64/292
Epoch 65/292
Epoch 66/292
Epoch 67/292
Epoch 68/292
Epoch 69/292
Epoch 70/292
Epoch 71/292
Epoch 72/292
Epoch 73/292
Epoch 74/292
Epoch 75/292
Epoch 76/292
Epoch 77/292
Epoch 78

Epoch 80/292
Epoch 81/292
Epoch 82/292
Epoch 83/292
Epoch 84/292
Epoch 85/292
Epoch 86/292
Epoch 87/292
Epoch 88/292
Epoch 89/292
Epoch 90/292
Epoch 91/292
Epoch 92/292
Epoch 93/292
Epoch 94/292
Epoch 95/292
Epoch 96/292
Epoch 97/292
Epoch 98/292
Epoch 99/292
Epoch 100/292
Epoch 101/292
Epoch 102/292
Epoch 103/292
Epoch 104/292
Epoch 105/292
Epoch 106/292
Epoch 107/292
Epoch 108/292
Epoch 109/292
Epoch 110/292
Epoch 111/292
Epoch 112/292
Epoch 113/292
Epoch 114/292
Epoch 115/292
Epoch 116/292
Epoch 117/292
Epoch 118/292
Epoch 119/292
Epoch 120/292
Epoch 121/292
Epoch 122/292
Epoch 123/292
Epoch 124/292
Epoch 125/292
Epoch 126/292
Epoch 127/292
Epoch 128/292
Epoch 129/292
Epoch 130/292
Epoch 131/292
Epoch 132/292
Epoch 133/292
Epoch 134/292
Epoch 135/292
Epoch 136/292
Epoch 137/292
Epoch 138/292
Epoch 139/292
Epoch 140/292
Epoch 141/292
Epoch 142/292
Epoch 143/292
Epoch 144/292
Epoch 145/292
Epoch 146/292
Epoch 147/292
Epoch 148/292
Epoch 149/292
Epoch 150/292
Epoch 151/292
Epoch 152/29

Epoch 157/292
Epoch 158/292
Epoch 159/292
Epoch 160/292
Epoch 161/292
Epoch 162/292
Epoch 163/292
Epoch 164/292
Epoch 165/292
Epoch 166/292
Epoch 167/292
Epoch 168/292
Epoch 169/292
Epoch 170/292
Epoch 171/292
Epoch 172/292
Epoch 173/292
Epoch 174/292
Epoch 175/292
Epoch 176/292
Epoch 177/292
Epoch 178/292
Epoch 179/292
Epoch 180/292
Epoch 181/292
Epoch 182/292
Epoch 183/292
Epoch 184/292
Epoch 185/292
Epoch 186/292
Epoch 187/292
Epoch 188/292
Epoch 189/292
Epoch 190/292
Epoch 191/292
Epoch 192/292
Epoch 193/292
Epoch 194/292
Epoch 195/292
Epoch 196/292
Epoch 197/292
Epoch 198/292
Epoch 199/292
Epoch 200/292
Epoch 201/292
Epoch 202/292
Epoch 203/292
Epoch 204/292
Epoch 205/292
Epoch 206/292
Epoch 207/292
Epoch 208/292
Epoch 209/292
Epoch 210/292
Epoch 211/292
Epoch 212/292
Epoch 213/292
Epoch 214/292
Epoch 215/292
Epoch 216/292
Epoch 217/292
Epoch 218/292
Epoch 219/292
Epoch 220/292
Epoch 221/292
Epoch 222/292
Epoch 223/292
Epoch 224/292
Epoch 225/292
Epoch 226/292
Epoch 227/292
Epoch 

Epoch 234/292
Epoch 235/292
Epoch 236/292
Epoch 237/292
Epoch 238/292
Epoch 239/292
Epoch 240/292
Epoch 241/292
Epoch 242/292
Epoch 243/292
Epoch 244/292
Epoch 245/292
Epoch 246/292
Epoch 247/292
Epoch 248/292
Epoch 249/292
Epoch 250/292
Epoch 251/292
Epoch 252/292
Epoch 253/292
Epoch 254/292
Epoch 255/292
Epoch 256/292
Epoch 257/292
Epoch 258/292
Epoch 259/292
Epoch 260/292
Epoch 261/292
Epoch 262/292
Epoch 263/292
Epoch 264/292
Epoch 265/292
Epoch 266/292
Epoch 267/292
Epoch 268/292
Epoch 269/292
Epoch 270/292
Epoch 271/292
Epoch 272/292
Epoch 273/292
Epoch 274/292
Epoch 275/292
Epoch 276/292
Epoch 277/292
Epoch 278/292
Epoch 279/292
Epoch 280/292
Epoch 281/292
Epoch 282/292
Epoch 283/292
Epoch 284/292
Epoch 285/292
Epoch 286/292
Epoch 287/292
Epoch 288/292
Epoch 289/292
Epoch 290/292
Epoch 291/292
Epoch 292/292


<tensorflow.python.keras.callbacks.History at 0x282c1688ac0>

In [135]:
# Saving the model and tokenizer
model.save('epochBG3.h5')
dump(tokenizer, open('epochBG3', 'wb'))

# Generating new tweets

In [146]:
def gen_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    
    for i in range(num_gen_words):
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [153]:
model = load_model('epochBG2.h5')
tokenizer = load(open('epochBG2','rb'))

In [156]:
random.seed(os.urandom(1337))
random_pick = random.randint(0,len(text_sequences))

random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)
print('Seed Text: ',seed_text)

Seed Text:  pelo movimento acorda brasil ouvindo as demandas de diversos movimentos ' ' deputados aprovam acordos de cooperação tributária e defesa entre brasil e eua  '


In [157]:
gen_txt = gen_text(model, tokenizer, train_len, seed_text=seed_text, num_gen_words=31)

print(seed_text,'\n', gen_txt)

pelo movimento acorda brasil ouvindo as demandas de diversos movimentos ' ' deputados aprovam acordos de cooperação tributária e defesa entre brasil e eua  ' 
 ' e não deixar bem claro sem registrar a relação ao que ter isso pelo anuncio tomada de alguns sem resposta contra e não trabalha ' ' coincidencias ambos nada e
