In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [5]:
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [6]:
word_index = keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [7]:
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}

In [8]:
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token

In [13]:
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [14]:
import tensorflow_datasets as tfds

In [16]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [18]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='C:\\Users\\User\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': 

In [19]:
train_size = info.splits["train"].num_examples

In [20]:
train_size

25000

In [21]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Each review is cut to its first 300 units (``tf.strings.substr`` default
    unit), HTML line-break tags are replaced by a space, every character that
    is not an ASCII letter or an apostrophe is replaced by a space, and the
    result is split on whitespace. The ragged result is padded with the
    ``b"<pad>"`` token so it becomes a dense tensor.

    Args:
        X_batch: batch of review strings.
        y_batch: labels for the batch; returned untouched.

    Returns:
        A ``(tokens, y_batch)`` pair.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [22]:
from collections import Counter

In [23]:
vocabulary = Counter()

In [27]:
# Count how often each token occurs in the preprocessed training reviews.
# The counter is re-created here so that re-running this cell starts from
# zero instead of adding a second pass on top of the previous counts.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [28]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [29]:
vocab_size = 10000
# Keep only the `vocab_size` most frequent tokens, most frequent first.
truncated_vocabulary = [token for token, _ in vocabulary.most_common(vocab_size)]

In [30]:
truncated_vocabulary

[b'<pad>',
 b'the',
 b'a',
 b'of',
 b'and',
 b'to',
 b'I',
 b'is',
 b'in',
 b'this',
 b'it',
 b'was',
 b'movie',
 b'that',
 b'The',
 b'film',
 b'with',
 b'for',
 b'as',
 b'on',
 b'but',
 b'have',
 b'This',
 b'one',
 b'not',
 b'be',
 b'are',
 b'you',
 b'an',
 b'at',
 b'about',
 b'by',
 b'all',
 b'his',
 b'so',
 b'like',
 b'from',
 b'who',
 b'has',
 b'It',
 b'good',
 b'my',
 b'just',
 b'very',
 b'out',
 b'or',
 b'story',
 b'some',
 b'time',
 b'had',
 b'he',
 b'they',
 b'really',
 b'me',
 b'when',
 b'what',
 b'first',
 b'movies',
 b'bad',
 b'see',
 b'seen',
 b'up',
 b'only',
 b'were',
 b"it's",
 b'would',
 b'more',
 b'made',
 b'great',
 b'can',
 b'been',
 b'i',
 b'her',
 b'no',
 b'A',
 b'which',
 b'even',
 b'films',
 b'there',
 b'ever',
 b'people',
 b'much',
 b'because',
 b'most',
 b'plot',
 b'if',
 b'than',
 b'acting',
 b'get',
 b'their',
 b'well',
 b'into',
 b'how',
 b'best',
 b'think',
 b'other',
 b'its',
 b"It's",
 b'saw',
 b'could',
 b'watch',
 b'many',
 b"don't",
 b'do',
 b'will',
 

In [31]:
words = tf.constant(truncated_vocabulary)

In [32]:
# Lookup table from token to integer id: the i-th entry of
# `truncated_vocabulary` gets id i, and `num_oov_buckets` extra ids are
# reserved for tokens that are not in the vocabulary.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [33]:
def encode_words(X_batch, y_batch):
    """Replace every token in ``X_batch`` by its id from the module-level ``table``.

    Args:
        X_batch: batch of tokens to look up.
        y_batch: labels for the batch; returned untouched.

    Returns:
        A ``(ids, y_batch)`` pair.
    """
    encoded = table.lookup(X_batch)
    return encoded, y_batch

In [34]:
train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [38]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])

In [39]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
K = keras.backend

In [42]:
inputs=keras.layers.Input(shape=[None])

In [43]:
# Boolean mask that is True wherever the input id is not 0.
# NOTE(review): id 0 is presumably the "<pad>" token (it is the first entry
# of the truncated vocabulary shown earlier) - confirm.
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
# Same architecture as the Sequential model above, but with the mask passed
# explicitly to both recurrent layers.
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
# Single sigmoid unit: binary classification output.
outputs = keras.layers.Dense(1, activation="sigmoid")(z)

In [44]:
model = keras.Model(inputs=[inputs], outputs=[outputs])

In [49]:
import tensorflow_hub as hub

In [50]:
model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])

In [51]:
import os
# Point TensorFlow Hub at a cache folder inside the current directory.
# NOTE(review): this cell comes after the cell that builds the
# hub.KerasLayer (In [50]); presumably the variable must be set before the
# module is first loaded for it to take effect - confirm and consider moving
# this cell above the model definition.
TFHUB_CACHE_DIR = os.path.join(os.curdir, "my_tfhub_cache")
os.environ["TFHUB_CACHE_DIR"] = TFHUB_CACHE_DIR

In [52]:
for dirpath, dirnames, filenames in os.walk(TFHUB_CACHE_DIR):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

.\my_tfhub_cache\82c4aaf4250ffb09088bd48368ee7fd00e5464fe.descriptor.txt
.\my_tfhub_cache\82c4aaf4250ffb09088bd48368ee7fd00e5464fe\saved_model.pb
.\my_tfhub_cache\82c4aaf4250ffb09088bd48368ee7fd00e5464fe\assets\tokens.txt
.\my_tfhub_cache\82c4aaf4250ffb09088bd48368ee7fd00e5464fe\variables\variables.data-00000-of-00001
.\my_tfhub_cache\82c4aaf4250ffb09088bd48368ee7fd00e5464fe\variables\variables.index


In [53]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [54]:
train_size = info.splits["train"].num_examples
batch_size = 32

In [56]:
train_set = datasets["train"].batch(batch_size).prefetch(1)
model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
import tensorflow_addons as tda

 The versions of TensorFlow you are currently using is 2.10.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [64]:
import numpy as np
# Symbolic inputs with a variable-length second dimension (shape=[None]),
# int32 dtype.
# NOTE(review): "ec"/"dc" presumably stand for encoder/decoder - confirm.
ec_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

dc_inputs = keras.layers.Input(shape=[None], dtype=np.int32) 

# One scalar per example (shape=[]).
# NOTE(review): the name is misspelled ("lenght"); it is left unchanged here
# because other cells may refer to it - rename everywhere in one go.
sequence_lenght = keras.layers.Input(shape=[], dtype=np.int32)

In [68]:
# A single Embedding layer is applied to both inputs, so they share weights.
embeddings = keras.layers.Embedding(vocab_size, embed_size)

# NOTE(review): "ec_embeddigs" is misspelled (cf. "dc_embeddings" below); it
# is left unchanged here because other cells may refer to it.
ec_embeddigs = embeddings(ec_inputs)
dc_embeddings = embeddings(dc_inputs)



In [69]:
# LSTM with 512 units that also returns its final state (return_state=True).
# NOTE(review): "ecoder" looks like a typo for "encoder"; it is left
# unchanged here because other cells may refer to it.
ecoder = keras.layers.LSTM(512, return_state=True)

In [1]:
# Reber grammar as a state-transition table: entry i lists the
# (symbol, next_state) choices available in state i. A next state of None
# marks the end of the string.
default_reber_grammar = [
    [("B", 1)],            # state 0: B -> state 1
    [("T", 2), ("P", 3)],  # state 1: T -> state 2, or P -> state 3
    [("S", 2), ("X", 4)],  # state 2: S -> state 2 (loop), or X -> state 4
    [("T", 3), ("V", 5)],  # state 3: T -> state 3 (loop), or V -> state 5
    [("X", 3), ("S", 6)],  # state 4: X -> state 3, or S -> state 6
    [("P", 4), ("V", 6)],  # state 5: P -> state 4, or V -> state 6
    [("E", None)],         # state 6: E, then stop
]

In [2]:
# Embedded Reber grammar, in the same (symbol, next_state) table format.
# A "symbol" that is itself a grammar table stands for a whole string
# produced by that grammar.
embedded_reber_grammar = [
    [("B", 1)],                    # state 0: B -> state 1
    [("T", 2), ("P", 3)],          # state 1: T -> state 2, or P -> state 3
    [(default_reber_grammar, 4)],  # state 2: inner Reber string -> state 4
    [(default_reber_grammar, 5)],  # state 3: inner Reber string -> state 5
    [("T", 6)],                    # state 4: T -> state 6
    [("P", 6)],                    # state 5: P -> state 6
    [("E", None)],                 # state 6: E, then stop
]

In [12]:
import numpy as np
def generate_string(grammar):
    """Produce one random string by walking ``grammar`` from state 0.

    ``grammar[state]`` is a list of ``(production, next_state)`` pairs. At
    every step one pair is drawn uniformly with ``np.random.randint``; a
    production that is itself a list is treated as a nested grammar and
    expanded recursively. The walk ends when the next state is ``None``.

    Args:
        grammar: list of per-state transition lists.

    Returns:
        The concatenation of all emitted productions.
    """
    pieces = []
    current = 0
    while True:
        choices = grammar[current]
        symbol, current = choices[np.random.randint(len(choices))]
        if isinstance(symbol, list):
            # Nested grammar: replace it by a string generated from it.
            symbol = generate_string(grammar=symbol)
        pieces.append(symbol)
        if current is None:
            break
    return "".join(pieces)

In [13]:
for _ in range(25):
    print(generate_string(default_reber_grammar), end=" ")

BPTTTVPXVVE BTXSE BTSSSSSXXTTTVPXVVE BPTVPSE BTXXTTVVE BPVPSE BPTVVE BTXXVPXTVPXVPSE BPVVE BTXSE BTXXTVPXTVVE BTSXSE BPVVE BTSSXSE BTXXVVE BTSXXVVE BTSSSXXTTTTVVE BPTTTVVE BPTVVE BTXSE BPTVPXVPSE BTSXSE BTXSE BPTTTTTTVPXVPXTTVVE BPTTTVPXVPXTTVVE 

In [14]:
POSSIBLE_CHARS = "BEPSTVX"


def generate_corrupted_string(grammar, chars=POSSIBLE_CHARS):
    """Generate a string from ``grammar`` and replace one character in it.

    A random position is chosen and the character there is swapped for a
    different character drawn from ``chars``.

    Args:
        grammar: grammar table passed on to ``generate_string``.
        chars: pool of characters the replacement is drawn from.

    Returns:
        The generated string with exactly one position changed.
    """
    valid = generate_string(grammar)
    position = np.random.randint(len(valid))
    # Sorted so the candidate order does not depend on set iteration order.
    candidates = sorted(set(chars) - {valid[position]})
    replacement = np.random.choice(candidates)
    return valid[:position] + replacement + valid[position + 1:]

In [15]:
for _ in range(25):
    print(generate_corrupted_string(embedded_reber_grammar), end=" ")

XPBPVVEPE BTBTXXTTTTTTXPSETE BPBTXXTTTBVVEPE BPBPVPXVPXVESEPE BPVTSXSEPE BPETXSEPE BPBTXSESE ETBPVVETE BTBTXXTVPEVVETE BTBTSXXVPXTVTSETE STBTXSETE BTSTXSETE BPVTSXXVVEPE BPBTSSXSEEE BPBTXVEPE BPBTSSXSPPE BPBTSSXXTTVPSETE BPBTVXSEPE STBPTVPXVPSETE BPXTSSXSEPE BTBVXXVPXTTTTTTTTVPSETE BTBBVPSETE BTBPTTTTVVTTE BTBTXXTTTTTVPXVVEXE BPBTSSSSSSEPE 

In [16]:
def string_to_ids(s, chars=POSSIBLE_CHARS):
    """Map each character of ``s`` to its position in ``chars``.

    Args:
        s: string to encode.
        chars: alphabet; a character's id is its first index in this string.

    Returns:
        List of integer ids, one per character of ``s``.

    Raises:
        ValueError: if ``s`` contains a character that is not in ``chars``.
    """
    return list(map(chars.index, s))

In [17]:
string_to_ids("BTTTXXVVETE")

[0, 4, 4, 4, 6, 6, 5, 5, 1, 4, 1]

In [20]:
import tensorflow as tf
def generate_dataset(size):
    """Build a labelled dataset of embedded-Reber strings encoded as id lists.

    The first ``size // 2`` examples are strings generated from
    ``embedded_reber_grammar`` (label 1.0); the remaining examples are
    corrupted strings (label 0.0).

    Args:
        size: total number of examples.

    Returns:
        ``(X, y)`` where ``X`` is a ragged tensor of id sequences and ``y``
        is a NumPy array holding one single-element label row per example.
    """
    n_good = size // 2
    n_bad = size - n_good
    good_strings = [
        string_to_ids(generate_string(embedded_reber_grammar))
        for _ in range(n_good)
    ]
    bad_strings = [
        string_to_ids(generate_corrupted_string(embedded_reber_grammar))
        for _ in range(n_bad)
    ]
    X = tf.ragged.constant(good_strings + bad_strings, ragged_rank=1)
    labels = [[1.]] * len(good_strings) + [[0.]] * len(bad_strings)
    y = np.array(labels)
    return X, y

In [26]:
X_train, y_train = generate_dataset(10000)
X_valid, y_valid = generate_dataset(2000)

In [27]:
X_train[0]

<tf.Tensor: shape=(15,), dtype=int32, numpy=array([0, 4, 0, 2, 4, 5, 2, 6, 4, 5, 2, 3, 1, 4, 1])>

In [28]:
y_train[0]

array([1.])

In [30]:
embedding_size = 5
from tensorflow import keras
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),
    keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS), output_dim=embedding_size),
    keras.layers.GRU(30),
    keras.layers.Dense(1, activation="sigmoid")
])


In [31]:
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95, nesterov=True)

In [32]:
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [48]:
model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13d762b7fa0>

In [105]:
test_strings = ["BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
                "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"]

In [106]:
X_test = tf.ragged.constant([string_to_ids(s) for s in test_strings], ragged_rank=1)

In [107]:
y_proba = model.predict(X_test)



In [108]:
y_proba

array([[0.001542 ],
       [0.9998921]], dtype=float32)

In [116]:
for index, string in enumerate(test_strings):
    print("{}: {:.2f}%".format(string, 100 * y_proba[index][0]))

BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE: 0.15%
BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE: 99.99%


In [271]:
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

In [275]:
import calendar
from datetime import date
import random

def generate_random_date(min_year=1900, max_year=2024):
    """Return a random valid calendar date formatted like "April 22, 2019".

    The year, the month and then the day are each drawn uniformly with the
    ``random`` module, so seeding ``random`` makes the result reproducible.
    The day is limited to the real length of the chosen month, which means
    leap years are handled.

    Args:
        min_year: smallest year that can be drawn (inclusive).
        max_year: largest year that can be drawn (inclusive).

    Returns:
        The date formatted with ``"%B %d, %Y"``: full month name as produced
        by ``strftime`` under the current locale, zero-padded day, and year.

    Raises:
        ValueError: if ``min_year > max_year`` or the year is outside the
            range supported by ``datetime.date``.
    """
    year = random.randint(min_year, max_year)
    month = random.randint(1, 12)

    # monthrange returns (weekday of the first day, number of days in month).
    _, num_days = calendar.monthrange(year, month)
    day = random.randint(1, num_days)

    return date(year, month, day).strftime("%B %d, %Y")

# Teste
for _ in range(5):
    print(generate_random_date())


June 15, 1903
May 27, 2008
December 18, 1970
May 27, 1991
July 03, 1995


In [276]:
test = generate_random_date()

In [277]:
print(test)

December 27, 1998


In [278]:
from datetime import datetime
def convert_format(date_str):
    """Convert a date such as "April 22, 2019" to ISO style "2019-04-22".

    Args:
        date_str: date written as full month name, day, comma, 4-digit year.

    Returns:
        The same date formatted as year-month-day.

    Raises:
        ValueError: if ``date_str`` does not match the expected format.
    """
    return datetime.strptime(date_str, '%B %d, %Y').strftime('%Y-%m-%d')

In [279]:
examples = []
for _ in range(10):
    dates = generate_random_date()
    print(dates)
    new_date = convert_format(dates)
    examples.append([dates, new_date])

March 06, 1993
May 29, 1938
October 03, 1994
November 16, 1954
September 16, 1970
November 27, 1905
August 29, 2021
January 31, 1956
December 31, 1937
April 19, 1902


In [280]:
examples

[['March 06, 1993', '1993-03-06'],
 ['May 29, 1938', '1938-05-29'],
 ['October 03, 1994', '1994-10-03'],
 ['November 16, 1954', '1954-11-16'],
 ['September 16, 1970', '1970-09-16'],
 ['November 27, 1905', '1905-11-27'],
 ['August 29, 2021', '2021-08-29'],
 ['January 31, 1956', '1956-01-31'],
 ['December 31, 1937', '1937-12-31'],
 ['April 19, 1902', '1902-04-19']]

In [281]:
def create_dataset(n_samples):
    """Generate ``n_samples`` random dates together with their converted form.

    Args:
        n_samples: number of (input, target) pairs to create.

    Returns:
        ``(X, y)``: ``X`` is the list of strings from
        ``generate_random_date`` and ``y`` the list of the corresponding
        ``convert_format`` results, in the same order.
    """
    X = [generate_random_date() for _ in range(n_samples)]
    y = [convert_format(date_str) for date_str in X]
    return X, y

In [282]:
n_samples = 100
test1, test1_y = create_dataset(n_samples)
test1

['August 09, 1968',
 'December 30, 1988',
 'November 20, 1933',
 'November 15, 1954',
 'May 18, 1974',
 'December 16, 1912',
 'December 12, 1916',
 'January 28, 1998',
 'September 14, 2019',
 'June 08, 1984',
 'December 17, 1922',
 'August 22, 1990',
 'May 24, 1952',
 'April 22, 1953',
 'December 07, 1965',
 'September 18, 1916',
 'October 21, 2003',
 'November 27, 1952',
 'October 28, 1997',
 'June 16, 1937',
 'September 09, 1946',
 'September 19, 1951',
 'July 14, 1956',
 'September 12, 2021',
 'January 07, 1972',
 'June 06, 1981',
 'July 03, 1996',
 'April 19, 1956',
 'July 17, 1911',
 'February 15, 1987',
 'January 14, 1972',
 'October 28, 1933',
 'October 07, 2020',
 'November 17, 1979',
 'October 02, 1931',
 'May 02, 1942',
 'October 31, 1962',
 'May 27, 1996',
 'March 29, 1934',
 'March 19, 1949',
 'July 14, 1902',
 'August 14, 1980',
 'May 27, 1911',
 'December 23, 1922',
 'February 11, 1944',
 'May 29, 1950',
 'January 30, 1917',
 'May 21, 2006',
 'February 11, 2005',
 'Septem

In [299]:
"2" in vocab

False

In [306]:
# Distinct characters used by the month names, in order of first appearance
# (dict keys keep insertion order, so this de-duplicates without reordering).
vocab = list(dict.fromkeys("".join(MONTHS)))
print(vocab)

['J', 'a', 'n', 'u', 'r', 'y', 'F', 'e', 'b', 'M', 'c', 'h', 'A', 'p', 'i', 'l', 'g', 's', 't', 'S', 'm', 'O', 'o', 'N', 'v', 'D']


In [307]:
# Add the ten digit characters to the vocabulary. Digits that are already
# present are skipped so that re-running this cell does not append them twice.
vocab.extend([digit for digit in "0123456789" if digit not in vocab])

In [308]:
vocab

['J',
 'a',
 'n',
 'u',
 'r',
 'y',
 'F',
 'e',
 'b',
 'M',
 'c',
 'h',
 'A',
 'p',
 'i',
 'l',
 'g',
 's',
 't',
 'S',
 'm',
 'O',
 'o',
 'N',
 'v',
 'D',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [314]:
# Alphabet for input date strings: a character's id is its index here.
INPUT_CHARS = "0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ,"


def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Encode ``date_str`` as a list with the index in ``chars`` of each character.

    Args:
        date_str: string to encode.
        chars: alphabet used for the encoding.

    Returns:
        List of integer ids, one per character of ``date_str``.

    Raises:
        ValueError: if ``date_str`` contains a character that is not in ``chars``.
    """
    return list(map(chars.index, date_str))

# Example: encode one date string and show the resulting ids.
date_str = "April 22, 2019"
date_ids = date_str_to_ids(date_str, chars=INPUT_CHARS)
print(date_ids)


[11, 52, 54, 45, 48, 63, 2, 2, 64, 63, 2, 0, 1, 9]


In [315]:
date_str = "April 21, 2019"
date_ids = date_str_to_ids(date_str, INPUT_CHARS)
print(date_ids)


[11, 52, 54, 45, 48, 63, 2, 1, 64, 63, 2, 0, 1, 9]
