In [1]:
import os
import json
from tqdm import tqdm
import pandas as pd
from notebooks.utils import logger, file_with_timestamp
from notebooks.config import DATASET_PATH, CHUNK_SIZE, FILTERED_DATASET_DIRECTORY, FILTER_RULES_PATH

In [2]:
logger = logger()

if not os.path.exists(FILTERED_DATASET_DIRECTORY):
    os.makedirs(FILTERED_DATASET_DIRECTORY)

<h4>Read dataset</h4>

In [3]:
def df_chunks():
    return pd.read_csv(DATASET_PATH, sep='\t', header=None, chunksize=CHUNK_SIZE)

df_chunks().read(5).head()

Unnamed: 0,0,1,2,3,4
0,christmas tree on a black background .,https://thumb1.shutterstock.com/display_pic_wi...,"christmas tree,christmas decoration,font,text,...","/m/025nd,/m/05fc9mj,/m/03gq5hm,/m/07s6nbt,/m/0...","0.9818305373191833,0.952756941318512,0.9227379..."
1,item : drawing of a figure surrounded by person,https://i.pinimg.com/736x/f9/fd/48/f9fd4878090...,"drawing,modern art,line,visual arts,art,sketch...","/m/02csf,/m/015r61,/m/03scnj,/m/0p9xx,/m/0jjw,...","0.8945257067680359,0.8489813804626465,0.828646..."
2,the sidewalk near the corner of streets has on...,http://s3-us-west-2.amazonaws.com/ktoo/2017/08...,"mode of transport,transport,vehicle,street,nei...","/m/079bkr,/m/07bsy,/m/07yv9,/m/01c8br,/m/0180x...","0.8588771224021912,0.8513096570968628,0.783954..."
3,actor attends the season premiere,https://media.gettyimages.com/photos/aidan-gil...,"musician,premiere,event,singer,suit,performance","/m/09jwl,/m/03n3f3,/m/081pkj,/m/09l65,/m/01xyh...","0.8445718288421631,0.8338453769683838,0.670432..."
4,"another possible invitation -- love the font ,...",https://i.pinimg.com/736x/c1/c3/3a/c1c33aed458...,"text,font,material property,label","/m/07s6nbt,/m/03gq5hm,/m/0457gc6,/m/05c0n6k","0.9419509172439575,0.8490258455276489,0.699199..."


<h4>Filter dataset</h4>

In [4]:
with open(FILTER_RULES_PATH, 'r') as file:
    filter_rules = json.load(file)
    filter_rules['include'] = set(filter_rules['include'])
    filter_rules['exclude'] = set(filter_rules['exclude'])
    filter_rules['excludeDomains'] = set(filter_rules['excludeDomains'])

def leave_in_dataset(image_labels):
    for include in filter_rules['include']:
        if include in image_labels:
            for label in image_labels:
                if label in filter_rules['exclude']:
                    return False
            return True
    return False

In [5]:
filtered_chunks = []

for i, dataset in enumerate(df_chunks()):
    dataset[5] = False
    logger.info("Chunk #{}".format(i + 1))
    for index, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        try:
            if type(row[2]) == float:
                continue
            labels = row[2].split(',')
            if leave_in_dataset(labels):
                dataset.at[index, 5] = True
        except Exception as e:
            logger.error(e)
            logger.error(row)
            break
    filtered_chunks.append(dataset[dataset[5] == True].drop(5, axis=1))

filtered_dataset = pd.concat(filtered_chunks)
filtered_dataset.head()

[32m2024-04-04 19:53:16,333 - INFO - Chunk #1[0m
100%|██████████| 100000/100000 [00:03<00:00, 28028.50it/s]
[32m2024-04-04 19:53:20,570 - INFO - Chunk #2[0m
100%|██████████| 100000/100000 [00:03<00:00, 27874.41it/s]
[32m2024-04-04 19:53:24,801 - INFO - Chunk #3[0m
100%|██████████| 100000/100000 [00:03<00:00, 27602.07it/s]
[32m2024-04-04 19:53:29,168 - INFO - Chunk #4[0m
100%|██████████| 100000/100000 [00:03<00:00, 27853.38it/s]
[32m2024-04-04 19:53:33,356 - INFO - Chunk #5[0m
100%|██████████| 100000/100000 [00:03<00:00, 27926.18it/s]
[32m2024-04-04 19:53:37,590 - INFO - Chunk #6[0m
100%|██████████| 100000/100000 [00:03<00:00, 28087.20it/s]
[32m2024-04-04 19:53:41,794 - INFO - Chunk #7[0m
100%|██████████| 100000/100000 [00:03<00:00, 27597.06it/s]
[32m2024-04-04 19:53:46,064 - INFO - Chunk #8[0m
100%|██████████| 100000/100000 [00:03<00:00, 27086.10it/s]
[32m2024-04-04 19:53:50,375 - INFO - Chunk #9[0m
100%|██████████| 100000/100000 [00:03<00:00, 27720.03it/s]
[32m2024-

Unnamed: 0,0,1,2,3,4
5,a woman walks her dog on the beach .,https://media.gettyimages.com/photos/woman-wal...,"water,beach,sea,shore,ocean,canidae,dog,sky,wa...","/m/0838f,/m/0b3yr,/m/06npx,/m/02fm9k,/m/05kq4,...","0.9512578248977661,0.9311308860778809,0.916622..."
6,using shrubs and hedges as fences is a very po...,http://ift.tt/1URq5fp,"garden,shrub,tree,natural landscape,grass,bota...","/m/0bl0l,/m/0gqbt,/m/07j7r,/m/03d28y3,/m/08t9c...","0.9539984464645386,0.9404484033584595,0.888916..."
7,a beautiful day with some buildings and plants .,https://d1tq208oegmb9e.cloudfront.net/site_pho...,"residential area,property,home,building,house,...","/m/02nfxt,/m/05wrt,/m/01l0mw,/m/0cgh4,/m/03jm5...","0.9630937576293945,0.9594130516052246,0.942743..."
30,giraffes explore the newly extended plains of ...,http://images.archant.co.uk/polopoly_fs/1.5091...,"terrestrial animal,giraffe,vertebrate,wildlife...","/m/0fbf1m,/m/03bk1,/m/09686,/m/01280g,/m/01v46...","0.9922448396682739,0.9903286099433899,0.985110..."
31,after the fall - the riots,https://i.pinimg.com/736x/04/51/b5/0451b578557...,"smoke,stunt performer,sky,fun,event,photograph...","/m/06q40,/m/01tkqy,/m/01bqvp,/m/0ds99lh,/m/081...","0.7935611605644226,0.7750824689865112,0.732611..."


In [7]:
filtered_dataset.to_csv(os.path.join(FILTERED_DATASET_DIRECTORY, file_with_timestamp('filtered_dataset.tsv')), sep='\t', header=False)

In [12]:
import tensorflow_datasets as tfds

def filter_labels(sample):
    return sample["label"] >= 0

snli_train = tfds.load("snli", split="train[:20%]")
snli_val = tfds.load("snli", split="validation")
snli_test = tfds.load("snli", split="test")

# Here's an example of how our training samples look like, where we randomly select
# four samples:
sample = snli_test.batch(1).take(1).get_single_element()
sample

[32m2024-04-04 20:29:24,618 - INFO - Load dataset info from /home/emiliia/tensorflow_datasets/snli/1.1.0[0m
[32m2024-04-04 20:29:24,621 - INFO - Reusing dataset snli (/home/emiliia/tensorflow_datasets/snli/1.1.0)[0m
[32m2024-04-04 20:29:24,623 - INFO - Creating a tf.data.Dataset reading 1 files located in folders: /home/emiliia/tensorflow_datasets/snli/1.1.0.[0m
[32m2024-04-04 20:29:24,648 - INFO - Constructing tf.data.Dataset snli for split train[:20%], from /home/emiliia/tensorflow_datasets/snli/1.1.0[0m
[32m2024-04-04 20:29:24,650 - INFO - Load dataset info from /home/emiliia/tensorflow_datasets/snli/1.1.0[0m
[32m2024-04-04 20:29:24,652 - INFO - Reusing dataset snli (/home/emiliia/tensorflow_datasets/snli/1.1.0)[0m
[32m2024-04-04 20:29:24,653 - INFO - Creating a tf.data.Dataset reading 1 files located in folders: /home/emiliia/tensorflow_datasets/snli/1.1.0.[0m
[32m2024-04-04 20:29:24,677 - INFO - Constructing tf.data.Dataset snli for split validation, from /home/emil

{'hypothesis': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'A girl is entertaining on stage'], dtype=object)>,
 'label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>,
 'premise': <tf.Tensor: shape=(1,), dtype=string, numpy=
 array([b'A girl in a blue leotard hula hoops on a stage with balloon shapes in the background.'],
       dtype=object)>}

In [5]:
import tensorflow as tf

def split_labels(sample):
    x = (sample["hypothesis"], sample["premise"])
    y = sample["label"]
    return x, y


train_ds = (
    snli_train.filter(filter_labels)
    .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(16)
)
val_ds = (
    snli_val.filter(filter_labels)
    .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(16)
)
test_ds = (
    snli_test.filter(filter_labels)
    .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(16)
)

In [6]:
from keras_nlp.models import RobertaClassifier

classifier = RobertaClassifier.from_preset("roberta_base_en", num_classes=3)

In [None]:
sample = (sample["hypothesis"], sample["premise"])
sample

In [None]:
import numpy as np
predictions = classifier.predict(sample)


def softmax(x):
    return np.exp(x) / np.exp(x).sum(axis=0)


# Get the class predictions with maximum probabilities
predictions = softmax(predictions)

In [8]:
import keras

restored_model = keras.models.load_model("models/bert_classifier.keras")
# restored_model.evaluate(test_ds)

  instance.compile_from_config(compile_config)
  trackable.load_own_variables(weights_store.get(inner_path))


    410/Unknown [1m20s[0m 47ms/step - loss: 0.5012 - sparse_categorical_accuracy: 0.8034

KeyboardInterrupt: 

In [10]:
print((sample["hypothesis"], sample["premise"]))

(<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'A girl is entertaining on stage',
       b'A group of people posing in front of a body of water.',
       b"The group of people aren't inide of the building.",
       b'The people are taking a carriage ride.'], dtype=object)>, <tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'A girl in a blue leotard hula hoops on a stage with balloon shapes in the background.',
       b'A group of people taking pictures on a walkway in front of a large body of water.',
       b'Many people standing outside of a place talking to each other in front of a building that has a sign that says "HI-POINTE."',
       b'Three people are riding a carriage pulled by four horses.'],
      dtype=object)>)


In [9]:
import numpy as np

predictions = restored_model.predict((sample["hypothesis"], sample["premise"]))


def softmax(x):
    return np.exp(x) / np.exp(x).sum(axis=0)


# Get the class predictions with maximum probabilities
predictions = softmax(predictions)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 854ms/step
[[0.1605997  0.5688096  0.04371636]
 [0.49701223 0.13395983 0.06036862]
 [0.03271612 0.15026565 0.8166039 ]
 [0.30967197 0.14696485 0.07931111]]


In [16]:
(sample["hypothesis"], sample["premise"])

(<tf.Tensor: shape=(4,), dtype=string, numpy=
 array([b'A girl is entertaining on stage',
        b'A group of people posing in front of a body of water.',
        b"The group of people aren't inide of the building.",
        b'The people are taking a carriage ride.'], dtype=object)>,
 <tf.Tensor: shape=(4,), dtype=string, numpy=
 array([b'A girl in a blue leotard hula hoops on a stage with balloon shapes in the background.',
        b'A group of people taking pictures on a walkway in front of a large body of water.',
        b'Many people standing outside of a place talking to each other in front of a building that has a sign that says "HI-POINTE."',
        b'Three people are riding a carriage pulled by four horses.'],
       dtype=object)>)

In [17]:
print(sample["label"])

tf.Tensor([0 0 0 0], shape=(4,), dtype=int64)


In [25]:
sample = snli_test.batch(4).take(3)
for el in sample:
    print(el)

{'hypothesis': <tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'A girl is entertaining on stage',
       b'A group of people posing in front of a body of water.',
       b"The group of people aren't inide of the building.",
       b'The people are taking a carriage ride.'], dtype=object)>, 'label': <tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 0, 0, 0])>, 'premise': <tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'A girl in a blue leotard hula hoops on a stage with balloon shapes in the background.',
       b'A group of people taking pictures on a walkway in front of a large body of water.',
       b'Many people standing outside of a place talking to each other in front of a building that has a sign that says "HI-POINTE."',
       b'Three people are riding a carriage pulled by four horses.'],
      dtype=object)>}
{'hypothesis': <tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'Two girls are getting ready to ride horses.',
       b'Two youths were pushing each othe

2024-03-31 12:07:11.833446: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [30]:
print(df_chunks().read(5)[0].tolist())
print(df_chunks().read(5)[2].tolist())

['christmas tree on a black background .', 'item : drawing of a figure surrounded by person', 'the sidewalk near the corner of streets has one of the few vending machines .', 'actor attends the season premiere', 'another possible invitation -- love the font , just need to switch to coral and aqua colors']
['christmas tree,christmas decoration,font,text,graphic design,illustration,interior design,tree,christmas eve,ornament,fir,plant,pine,pine family,graphics', 'drawing,modern art,line,visual arts,art,sketch,artwork,photographic paper,painting,illustration,black-and-white', 'mode of transport,transport,vehicle,street,neighbourhood,road surface,advertising,automotive exterior,asphalt,signage,sign,road,truck,car,city', 'musician,premiere,event,singer,suit,performance', 'text,font,material property,label']


In [13]:
print((df_chunks().read(5)[0].tolist(), df_chunks().read(5)[2].tolist()))

(['christmas tree on a black background .', 'item : drawing of a figure surrounded by person', 'the sidewalk near the corner of streets has one of the few vending machines .', 'actor attends the season premiere', 'another possible invitation -- love the font , just need to switch to coral and aqua colors'], ['christmas tree,christmas decoration,font,text,graphic design,illustration,interior design,tree,christmas eve,ornament,fir,plant,pine,pine family,graphics', 'drawing,modern art,line,visual arts,art,sketch,artwork,photographic paper,painting,illustration,black-and-white', 'mode of transport,transport,vehicle,street,neighbourhood,road surface,advertising,automotive exterior,asphalt,signage,sign,road,truck,car,city', 'musician,premiere,event,singer,suit,performance', 'text,font,material property,label'])


In [31]:
predictions = restored_model.predict((df_chunks().read(5)[0].tolist(), df_chunks().read(5)[2].tolist()))


def softmax(x):
    return np.exp(x) / np.exp(x).sum(axis=0)


# Get the class predictions with maximum probabilities
predictions = softmax(predictions)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 907ms/step
[[0.0110986  0.3454727  0.4590159 ]
 [0.6146138  0.05491738 0.07915103]
 [0.20336917 0.11330616 0.1057053 ]
 [0.15276171 0.16087724 0.08710521]
 [0.01815683 0.32542652 0.2690226 ]]


In [32]:
dataset_info = tfds.builder("snli")
class_names = dataset_info.info.features['label'].names

[32m2024-03-31 12:18:37,810 - INFO - Load dataset info from /home/emiliia/tensorflow_datasets/snli/1.1.0[0m


In [33]:
print(class_names)

['entailment', 'neutral', 'contradiction']
