# Baseline

In [1]:
import os
from data_preparation import *
from model import *

## 1. Load the dataset

In [2]:
# Input-pipeline settings; BATCH_SIZE is reused below when the training model is built.
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Read the tweet corpus and unpack the dataset together with the
# char<->index lookup tables and the vocabulary returned by the helper.
dataset, char2idx, idx2char, vocab = char_text2tf_data(
    filename='donald_tweets.txt',
    seq_length=100,
    batch_size=BATCH_SIZE,
    buffer_size=BUFFER_SIZE,
)

## 2. Build the training model

In [3]:
# Model hyper-parameters; the vocabulary size is taken from the loaded data.
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 256
RNN_UNIT = 1024

# Training model: built with the same batch size that was passed to the data loader.
model = build_basic_model(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM,
                          rnn_units=RNN_UNIT, batch_size=BATCH_SIZE)
model.summary()

Model: "BaseLineModel"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           27392     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 107)           109675    
Total params: 4,075,371
Trainable params: 4,075,371
Non-trainable params: 0
_________________________________________________________________


In [4]:
# from_logits=True tells the loss that the model's outputs are raw,
# un-normalised scores rather than probabilities.
OPTIMIZER = 'adam'
LOSS = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=LOSS, optimizer=OPTIMIZER)

## 3. Train

In [5]:
# Checkpoints (weights only) go into this directory, using the
# file-name pattern "ckpt_{epoch}".
checkpoint_dir = './base_line_character_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

EPOCHS = 10
history = model.fit(
    dataset,
    callbacks=[checkpoint_callback],
    epochs=EPOCHS,
)
# Last expression of the cell: show the path of the most recent checkpoint.
tf.train.latest_checkpoint(checkpoint_dir)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


'./base_line_character_checkpoints\\ckpt_10'

## 4. Build the prediction model

In [6]:
# Rebuild the same architecture with a batch size of 1 and restore the
# weights from the most recent training checkpoint.
model = build_basic_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNIT,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [7]:
def generate_text(model, start_string, num_generate, split_string, temperature=1.0):
    """Generate text by sampling from the model one token at a time.

    The seed is converted to ids through the notebook-level ``char2idx``
    table and fed to the model once; afterwards only the freshly sampled
    id is fed back, so the model is expected to carry its recurrent state
    between calls (it is reset once before sampling starts).

    Args:
        model: Model that can be called on an id tensor with a leading
            batch dimension of 1 and that provides ``reset_states()``.
        start_string: Non-empty seed; every element must be a key of
            ``char2idx``.
        num_generate: Number of tokens to sample.
        split_string: Separator placed between the generated tokens.
        temperature: Positive divisor applied to the model outputs before
            sampling. Values below 1 make the samples more predictable,
            values above 1 more surprising. Defaults to 1.0, the value
            that used to be hard-coded.

    Returns:
        ``start_string`` followed by the generated tokens joined with
        ``split_string``.

    Raises:
        ValueError: If ``temperature`` is not positive or ``start_string``
            is empty.
        KeyError: If ``start_string`` contains a token missing from
            ``char2idx``.
    """
    # Validate up front so bad arguments fail with a clear message instead
    # of a division by zero or an obscure error inside the sampling call.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")
    if len(start_string) == 0:
        raise ValueError("start_string must contain at least one token")

    # Vectorise the seed and add the batch dimension the model expects.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Sample from a categorical distribution over the scaled outputs
        # and keep the sample drawn for the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled id back in; the preceding context lives in
        # the model's hidden state.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
    return start_string + split_string.join(text_generated)


# Sample 1000 characters seeded with '@' and print them without a separator.
print(generate_text(model, start_string='@', num_generate=1000, split_string=''))

@ONBCL what we need delate was because he has before Mr Trump! It did Negas speechs NEW YORK! MOREAT AGAIN!
Great.
"@mitchellvii1: @60Mannetions on show an HappyNorvill embrace thought how trade Iowa polls short what is the dinder is an amazing his strategy, the worry. Only one shows Trump."
RT @meeyino: DonaldTrump cadest #Jeb Halky 3pm! Its on Tuns, I alse along with Californa shows for president about Trump. Really biased I said in Lau undounded by @MSNBC.
Is Rubio and American tomorrow in the accupath 4 approvedat care of the father. All the one of the about pretical proud of what I have rans amo; with the Misseigh, was one of the U.S. Sunder!
So wad to governs many people shows a massive crowd of evangelitacisp"
"@DJ: A record: Manority, Lyin۪t: No couver that meeting with @jaketapper, his mumphimbs by the #Nexsasnow - Factits. I havere send." Bill he say with a nice "whole in bank drop is lifely dishonest delivers!
I am in Wouht strong article in Class...
"@jasudablesses:

## 5. Metrics

In [8]:
from metrics import *

trump_tweets = load_file()

# Sweep the first argument of scores_char while keeping the second fixed at 10.
# NOTE(review): judging by the printed report ("2 initial words from #10
# sentences") the first argument is the number of seed words and the second
# selects the reference sentence -- confirm against metrics.scores_char.
for n_initial_words in (2, 4, 6, 8, 10):
    scores_char(n_initial_words, 10, generate_text, model, trump_tweets)

# Keep the first argument at 2 and vary the second one instead.
for sentence_number in (9, 10, 18, 175):
    scores_char(2, sentence_number, generate_text, model, trump_tweets)


Generated sentence: RT @DonaldJTrumpJr: At۪spectansing ply for 

Reference sentence: RT @DonaldJTrumpJr: FINAL PUSH! Eric and I doing dozens of radio interviews. We can win this thing! GET OUT AND VOTE! #MAGA #ElectionDay ht_

--------------------------------------------------------------------------

2 initial words from #10 sentences -- rouge scores:
rouge-1 {'f': 0.13793103162901313, 'p': 0.4, 'r': 0.08333333333333333}
rouge-2 {'f': 0.07407407155006868, 'p': 0.25, 'r': 0.043478260869565216}
rouge-l {'f': 0.13793103162901313, 'p': 0.4, 'r': 0.08333333333333333}

--------------------------------------------------------------------------

2 initial words from #10 sentences -- BLEU scores:
0.4476869577070369

##########################################################################

Generated sentence: RT @DonaldJTrumpJr: FINAL PUSH! Ere- Clinton just over 

Reference sentence: RT @DonaldJTrumpJr: FINAL PUSH! Eric and I doing dozens of radio interviews. We can win this thing! GET OUT A