In [None]:
import cv2, numpy as np, pandas as pd, random
from typing import Tuple, Literal, List
from huggingface_hub import login, logout
from bertopic import BERTopic
from datasets import load_dataset
from collections import Counter


class HGFresource:
    '''
    Small helper that holds a Hugging Face access token and uses it to pull resources from the Hub
    '''

    def __init__(self, token):
        '''
        Parameters
        ----------

        token : str
            Hugging Face access token, forwarded to `load_dataset` on every call
        '''
        # keep private (name-mangled to `_HGFresource__token`)
        self.__token = token

    def load_data(self, repo: str, sets: Literal['train', 'test', 'all'], sample_fraction: float):
        '''
        Load dataset(s) from Hugging Face organization repository

        Parameters
        ----------

        repo : str
            Path to the data repo on Hugging Face

        sets : Literal['train', 'test', 'all']
            Accepted for interface compatibility; NOT applied by this implementation \n
            Possible options: `['train', 'test', 'all']`

        sample_fraction : float
            Accepted for interface compatibility; NOT applied by this implementation
            (the full dataset is always returned)

        Returns
        -------

        The object returned by `load_dataset(repo, token=...)`, unchanged \n
            No split is requested, so callers index the result by split name, e.g. `dataset['train']` \n
            No conversion to arrays and no sub-sampling is performed here
        '''

        # NOTE(review): `sets` and `sample_fraction` are intentionally left unused so that
        # existing callers, which index the returned object by split name, keep working.
        return load_dataset(repo, token=self.__token)

In [None]:
import os, tensorflow as tf

# Credentials and repository ids are read from environment variables;
# a missing variable raises KeyError right here.
HGF_TOKEN = os.environ['HUGGINGFACE_TOKEN']
HGF_DATA_REPO = os.environ['HUGGINGFACE_DATASET_REPO']
HGF_TOPIC_MODEL_REPO = os.environ['HUGGINGFACE_TOPIC_MODEL_REPO']

hgf = HGFresource(token=HGF_TOKEN)
# The HGFresource defined in this notebook returns the raw `load_dataset` result;
# `sets` and `sample_fraction` are accepted but have no effect in that implementation.
dataset = hgf.load_data(repo=HGF_DATA_REPO, sets='all', sample_fraction=1.0)

In [None]:
# Pick the 'train' split and ask for TensorFlow-formatted outputs.
train_data = dataset['train'].with_format('tf')

In [None]:
def preprocess_images(examples):
    """Convert every image in a batch to RGB and scale it to floats in [0, 1].

    `examples` is a batch mapping with an 'image' entry holding objects that
    expose `.convert('RGB')`. The mapping is updated in place and returned.
    """
    scaled = []
    for picture in examples['image']:
        rgb = picture.convert('RGB')
        # Cast before dividing so the division happens in float32.
        scaled.append(tf.cast(rgb, tf.float32) / 255.0)
    examples['image'] = scaled
    return examples

def preprocess_labels(example, n_classes=29):
    """One-hot encode a single class id.

    Parameters
    ----------
    example
        Integer class id (anything `np.put` accepts as an index).
    n_classes : int, default 29
        Length of the one-hot vector. The default keeps the previously
        hard-coded size, so existing single-argument calls are unaffected.

    Returns
    -------
    dict
        `{'label': tensor}` where the tensor has length `n_classes`, is 1 at
        position `example` and 0 elsewhere.

    Raises
    ------
    IndexError
        If `example` is outside `[-n_classes, n_classes)` (raised by `np.put`).
    """
    one_hot = np.zeros(n_classes)
    np.put(one_hot, example, 1)
    return {'label': tf.convert_to_tensor(one_hot)}

In [None]:
# One-hot encode labels: with input_columns=['label'] the function receives the
# label value itself rather than the whole row.
train_data = train_data.map(preprocess_labels, input_columns=['label'])
# Register the image preprocessing as a transform; positional arguments are
# (transform, columns, output_all_columns).
# NOTE(review): a transform presumably replaces the earlier with_format('tf') setting — confirm.
train_data = train_data.with_transform(preprocess_images, ['image'], True)

In [None]:
# Convert to a tf.data pipeline that yields (image, label) batches.
# NOTE(review): the literal 32 duplicates BATCH_SIZE, which is only defined in a later cell.
train_data = train_data.to_tf_dataset(
    columns=["image"],
    label_cols=["label"],
    batch_size=32,
    # shuffle=True,
    # Shuffling is currently disabled (line above is commented out).
    prefetch=False
)

In [None]:
# Training configuration for the CNN classifier.
OPTIMIZER = 'adam'
LOSS = 'categorical_crossentropy'  # expects one-hot encoded labels
METRICS = [tf.keras.metrics.F1Score('weighted')]

EPOCHS = 20
# NOTE(review): not referenced by the fit call below in this part of the notebook;
# batching is configured separately in to_tf_dataset.
BATCH_SIZE = 32

In [None]:
# CNN classifier: three Conv2D + MaxPooling2D stages followed by a dense head,
# assembled one layer at a time.
# NOTE(review): the input shape (256, 219, 3) and the 29 output units are
# hard-coded; presumably they match the dataset's image size and class count — confirm.
model = tf.keras.models.Sequential()

# Feature extractor
model.add(tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(256, 219, 3)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

# Classification head
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(29, activation='softmax'))

In [None]:
# Pass compile arguments by keyword: the third positional parameter is `metrics`
# in Keras 2 but `loss_weights` in Keras 3, so a positional METRICS can be bound
# to the wrong parameter depending on the installed version.
model.compile(
    optimizer=OPTIMIZER,
    loss=LOSS,
    metrics=METRICS
)

In [None]:
# Train on the already-batched tf.data pipeline; `history` keeps the per-epoch metrics.
# No validation data is passed (the option below is commented out and refers to
# names that are not defined in this part of the notebook).
history = model.fit(
    train_data,
    epochs=EPOCHS,
    # validation_data=(test_images, test_labels)
)

In [1]:
import os, numpy as np, tensorflow as tf
from data_loader.hgf_export import HGFresource
from dotenv import load_dotenv
load_dotenv()

# Credentials and repository ids are read from the environment (populated from a
# .env file by load_dotenv() above); a missing variable raises KeyError.
HGF_TOKEN = os.environ['HUGGINGFACE_TOKEN']
HGF_DATA_REPO = os.environ['HUGGINGFACE_DATASET_REPO']
HGF_TOPIC_MODEL_REPO = os.environ['HUGGINGFACE_TOPIC_MODEL_REPO']

# NOTE(review): this HGFresource is the one imported from data_loader.hgf_export.
# The call unpacks four values, unlike the class defined earlier in this notebook,
# which returns a single dataset object.
hgf = HGFresource(token=HGF_TOKEN)
train_images, train_labels, test_images, test_labels = hgf.load_data(repo=HGF_DATA_REPO, sets='all', sample_fraction=0.3)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
train_labels

array([27, 11, 11, ..., 11, 21,  0])

In [8]:
class A:
    """Scratch class checking that a nested callable can read `self` from the enclosing method."""

    def __init__(self, a) -> None:
        self.a = a

    def print_res(self):
        """Print `self.a`, fetched through a closure over `self`."""
        read_a = lambda: self.a
        print(read_a())

In [9]:
# Scratch check of the closure behaviour: prints the value stored on the instance.
a = A(1)
a.print_res()

1


In [None]:
# Training configuration for the CNN classifier (array-based pipeline).
OPTIMIZER = 'adam'
LOSS = 'categorical_crossentropy'  # expects one-hot encoded labels
METRICS = [tf.keras.metrics.F1Score('weighted')]

EPOCHS = 20
BATCH_SIZE = 32

# Size the one-hot encoding / output layer from the largest label id in either
# split. Counting the distinct labels of the test sample alone undersizes it
# whenever a class is missing from that sample or the ids are not contiguous,
# and to_categorical then fails on the larger ids.
# Assumes labels are non-negative integer class ids starting at 0 — TODO confirm.
n_classes = int(max(np.max(train_labels), np.max(test_labels))) + 1
# Per-image shape; used as the model's input shape.
input_shape = test_images[0].shape

In [None]:
# Scale pixel values to [0, 1] in float32. Dividing an integer array by a Python
# float yields float64, which doubles the memory of the (large) image arrays
# for no benefit.
# NOTE(review): this cell rebinds its own inputs, so running it twice rescales
# the images and re-encodes the labels — run it exactly once per kernel.
train_images = np.asarray(train_images, dtype=np.float32) / 255.0
test_images = np.asarray(test_images, dtype=np.float32) / 255.0

# Integer class ids -> one-hot rows of width n_classes.
train_labels = tf.keras.utils.to_categorical(train_labels, n_classes)
test_labels = tf.keras.utils.to_categorical(test_labels, n_classes)

In [None]:
# Wrap the arrays as tf.data datasets of (image, label) pairs; batching is applied at fit time.
train_data = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
# NOTE(review): test_data is built here but not used by any later cell in this notebook.
test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels))

In [None]:
# Drop the notebook-level references to the raw arrays now that the tf.data datasets exist.
# NOTE(review): later cells (label-to-sequence mapping, seq2seq input shape and fit)
# still reference these names and will raise NameError on a top-to-bottom run.
del train_images, train_labels, test_images, test_labels

In [None]:
# CNN classifier: three Conv2D + MaxPooling2D stages, then a dense head whose
# width is the number of classes. Input shape and class count come from the
# notebook-level `input_shape` and `n_classes`.
keras_layers = tf.keras.layers
model = tf.keras.models.Sequential([
    # Feature extractor
    keras_layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    keras_layers.MaxPooling2D(pool_size=(2, 2)),
    keras_layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    keras_layers.MaxPooling2D(pool_size=(2, 2)),
    keras_layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    keras_layers.MaxPooling2D(pool_size=(2, 2)),
    # Classification head
    keras_layers.Flatten(),
    keras_layers.Dense(units=128, activation='relu'),
    keras_layers.Dense(units=n_classes, activation='softmax'),
])

In [None]:
model.summary()

In [None]:
# Pass compile arguments by keyword: the third positional parameter is `metrics`
# in Keras 2 but `loss_weights` in Keras 3, so a positional METRICS can be bound
# to the wrong parameter depending on the installed version.
model.compile(
    optimizer=OPTIMIZER,
    loss=LOSS,
    metrics=METRICS
)

In [None]:
# Train on the batched training dataset; `history` keeps the per-epoch metrics.
# NOTE(review): no validation data is used. The commented-out option below refers
# to arrays that an earlier cell deletes; `test_data` would be the available alternative.
history = model.fit(
    train_data.batch(BATCH_SIZE),
    epochs=EPOCHS,
    # validation_data=(test_images, test_labels)
)

In [None]:
import matplotlib.pyplot as plt

def history_plot(history, metric='f1_score', title='Training Weighted F1 Score', ylabel='F1 Score'):
    """Plot one per-epoch training metric recorded by `model.fit`.

    Parameters
    ----------
    history
        Object with a `.history` mapping of metric name -> list of per-epoch
        values (as returned by `model.fit`).
    metric : str, default 'f1_score'
        Key to read from `history.history`.
    title : str, default 'Training Weighted F1 Score'
        Figure title.
    ylabel : str, default 'F1 Score'
        Y-axis label.

    Raises
    ------
    KeyError
        If `metric` was not recorded in `history.history`.
    """
    values = history.history[metric]
    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(values) + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, values, 'bo-')
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()

In [None]:
history_plot(history)

In [10]:
# Fetch the topic model from the Hub repository named in the environment.
# NOTE(review): `load_model` comes from the imported data_loader.hgf_export.HGFresource;
# the HGFresource class defined earlier in this notebook has no such method.
topic_model = hgf.load_model(HGF_TOPIC_MODEL_REPO)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/grigoryturchenko/.cache/huggingface/token
Login successful
Successfully logged out.


In [None]:
topics_info = topic_model.get_topic_info()
# Drop the rows whose Topic id is -1 (presumably the model's outlier bucket — confirm).
# .copy() makes the filtered frame independent, so the column assignment below
# does not write to a slice of the original (SettingWithCopyWarning).
topics_info = topics_info.loc[topics_info['Topic'] != -1].copy()
# Keep only representation words longer than two characters.
topics_info['Representation'] = topics_info['Representation'].apply(lambda x: [i for i in x if len(i) > 2])

In [None]:
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# The tokenizer data must be available before word_tokenize is called.
nltk.download('punkt')

# One caption per topic: its representation words joined by single spaces.
captions = topics_info['Representation'].map(' '.join).to_numpy()

# Split every caption back into a list of tokens.
tokenized_captions = list(map(nltk.word_tokenize, captions))

In [None]:
# Number of distinct words across all topic representations.
vocab_size = len({word for words in topics_info['Representation'] for word in words})
# Keras' Tokenizer only emits indices strictly below `num_words`, and indices
# start at 1 (0 is reserved), so `num_words=vocab_size` would silently drop the
# least frequent word from every sequence. `vocab_size + 1` keeps all of them.
tokenizer = Tokenizer(num_words=vocab_size + 1)

In [None]:
# Build the word index from the tokenised captions, then encode each caption
# as a list of integer word ids.
tokenizer.fit_on_texts(tokenized_captions)
# word -> integer id mapping learned by the tokenizer
vocab = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(tokenized_captions)

In [None]:
# Longest representation, measured in list elements.
# NOTE(review): this counts representation entries, not tokens produced by
# word_tokenize; if the two differ, longer sequences are cut by truncating='post'.
max_sequence_length = topics_info['Representation'].map(len).max()

# Right-pad (and right-truncate) every sequence to that common length.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)

In [None]:
# Map row position (0, 1, 2, ...) -> padded word-id sequence of that row.
# NOTE(review): presumably row position equals the topic id once the -1 row is
# removed — verify against the ordering of get_topic_info().
topic_to_seq = dict(enumerate(padded_sequences))

In [None]:
# Replace every label by the padded word-id sequence of its topic.
# NOTE(review): this needs the ORIGINAL integer labels. Earlier cells overwrite
# train_labels/test_labels with one-hot rows and then `del` them, so on a
# top-to-bottom run this cell fails; it only works right after the data-loading cell.
train_labels_seq = np.array([topic_to_seq[i] for i in train_labels])
test_labels_seq = np.array([topic_to_seq[i] for i in test_labels])

In [None]:
# Define the encoder (CNN) input.
# Layer classes are referenced through `tf.keras` because the bare names
# (Input, Conv2D, ...) are never imported in this notebook.
# `input_shape` (per-image shape, computed in the CNN config cell) is used because
# the image arrays themselves are deleted before this point.
encoder_input = tf.keras.Input(shape=input_shape)

# Define the CNN layers
conv1 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same')(encoder_input)
pool1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
pool2 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(conv2)
conv3 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same')(pool2)
pool3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(conv3)
flatten = tf.keras.layers.Flatten()(pool3)
hidden1 = tf.keras.layers.Dense(128, activation='relu')(flatten)

# Feed the image feature vector to the decoder at every output time step.
# An Embedding layer is not applicable here: it looks up integer word ids,
# whereas `hidden1` is a vector of float activations.
# int(...) because max_sequence_length is a NumPy integer and RepeatVector
# requires a plain Python int.
repeated = tf.keras.layers.RepeatVector(int(max_sequence_length))(hidden1)

# Define the LSTM layer for sequence generation: (batch, time, 128)
lstm = tf.keras.layers.LSTM(128, return_sequences=True)(repeated)

# Define the output layer for generating sequences: one distribution per time
# step over word ids 0..vocab_size (0 is the padding id, hence vocab_size + 1).
# Targets are integer word ids, so this model needs a sparse categorical loss.
decoder_output = tf.keras.layers.Dense(vocab_size + 1, activation='softmax')(lstm)

# Combine the encoder and decoder models to create the Seq2Seq model
seq2seq_model = tf.keras.Model(encoder_input, decoder_output)

In [None]:
# Training configuration for the seq2seq model.
OPTIMIZER = 'adam'
# The targets (train_labels_seq / test_labels_seq) are padded sequences of
# integer word ids, not one-hot vectors, so the sparse variant of the loss is
# the one that matches them.
LOSS = 'sparse_categorical_crossentropy'
# F1Score works on 2-D one-hot targets and does not fit per-time-step integer
# targets; token-level accuracy is reported instead.
METRICS = ['accuracy']

EPOCHS = 20
BATCH_SIZE = 32

In [None]:
# Pass compile arguments by keyword: the third positional parameter is `metrics`
# in Keras 2 but `loss_weights` in Keras 3, so a positional METRICS can be bound
# to the wrong parameter depending on the installed version.
seq2seq_model.compile(
    optimizer=OPTIMIZER,
    loss=LOSS,
    metrics=METRICS
)

In [None]:
seq2seq_model.summary()

In [None]:
import gc

gc.collect()

In [None]:
# Train the seq2seq model on (image, word-id sequence) pairs and validate on
# the test split after every epoch; `history` keeps the per-epoch metrics.
# NOTE(review): train_images / test_images are deleted by an earlier cell, so
# this only runs if the arrays are still (or again) in the kernel.
history = seq2seq_model.fit(
    x=train_images,
    y=train_labels_seq,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(test_images, test_labels_seq),
)