In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive
# (including the shared drive used below) become readable.
from google.colab import drive
drive.mount('/content/drive')

# Path of the training captions CSV; the cells that load captions open this path via `filename`.
filename = '/content/drive/Shareddrives/UPC-video-caption/01-DATASET/train/captions.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**CSV Library**

In [None]:
import csv

# Read the captions file with the csv module, splitting every line on spaces.
# The rows are materialised inside the `with` so the file is closed before
# any processing starts.
with open(filename, newline='') as csvfile:
    reader = list(csv.reader(csvfile, delimiter=' ', quotechar='|'))

# Flatten all rows into one token list. The last space-separated field of a
# line still carries whatever follows the first comma, so only the part
# before that comma is kept as a token.
# NOTE(review): a header line, if the file has one, is tokenised like any
# other row here — confirm whether it should be skipped.
token_dictionary = []
for row in reader:
    if not row:
        # csv.reader yields [] for a blank line; row[-1] would raise IndexError.
        continue
    last_element = row[-1].split(',')[0]
    token_dictionary.extend(row[:-1] + [last_element])

# Create sorted token dictionary (distinct tokens, alphabetical order)
unique_token_dictionary = sorted(set(token_dictionary))

# Show only a short preview: printing the full list exceeds the notebook's
# IOPub data-rate limit and the output is dropped.
print(token_dictionary[:20])
print(len(token_dictionary))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



**Pandas Library**

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv(filename)

# Tokenise the first column on whitespace (one token list per row), then
# join all per-row lists into a single flat array of tokens.
caption_column = df.iloc[:, 0]
tokens_per_caption = caption_column.str.split()
token_dictionary = np.concatenate(tokens_per_caption.values)

# Distinct tokens only.
unique_token_dictionary = set(token_dictionary)

# Each label is followed by its count on the next line.
print("token_dictionary lenght: ", len(token_dictionary), sep="\n")
print("unique_token_dictionary lenght:", len(unique_token_dictionary), sep="\n")

token_dictionary lenght: 
1207727
unique_token_dictionary lenght:
23666


**List of words ordered by frequency**

In [None]:
import collections

# Special tokens occupy the first vocabulary slots, in this order.
SPECIAL_TOKENS = ["<pad>", "<unk>", "<sos>", "<eos>"]
# Size of the truncated vocabulary, special tokens included.
MAX_VOCAB_SIZE = 10000

# Occurrence count of every token; kept under its own name so the counts stay
# available after the ordered word list is built.
token_counts = collections.Counter(token_dictionary)

# most_common() orders by descending count and keeps first-seen order for
# ties, i.e. the same order as a stable sort on the count with reverse=True.
words_frequencies = SPECIAL_TOKENS + [word for word, _ in token_counts.most_common()]

print("words_frequencies lenght:", len(words_frequencies))

truncated_words_frequencies = words_frequencies[:MAX_VOCAB_SIZE]

print("truncated_words_frequencies lenght:", len(truncated_words_frequencies))
# Preview only: dumping the whole truncated vocabulary floods the cell output.
print(truncated_words_frequencies[:20])

words_frequencies lenght: 23670
truncated_words_frequencies lenght: 10000


**Order sentences by length keeping videoId**

In [None]:
import csv

# Every (video_id, sentence) row of the file, repeated video ids included.
caption_pairs = []
# video_id -> sentence. A dict holds one value per key, so when a video id
# appears on several rows only the sentence of its last row survives here.
sentences_dict = {}

with open(filename, 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header row; the default avoids StopIteration on an empty file

    for row in reader:
        if len(row) < 2:
            # Blank or malformed line: no sentence/video-id pair to read.
            continue
        sentence = row[0]  # Sentence is in the first column
        video_id = row[1]  # Video ID is in the second column

        caption_pairs.append((video_id, sentence))
        sentences_dict[video_id] = sentence

# All rows ordered from longest to shortest sentence (length in characters).
sorted_sentences = sorted(caption_pairs, key=lambda pair: len(pair[1]), reverse=True)

# Same ordering applied to the one-sentence-per-video dictionary.
sorted_sentences_dict = dict(
    sorted(sentences_dict.items(), key=lambda item: len(item[1]), reverse=True)
)

# Preview and sizes only; printing the full mapping floods the cell output.
# Differing counts mean some video ids occur on more than one row.
print(list(sorted_sentences_dict.items())[:3])
print(len(sorted_sentences), "captions,", len(sorted_sentences_dict), "distinct video ids")

{'video141': 'it is dance programe where one male and on female do the performance  the girl is attractive dress turn to her left where the man stand and goes near him and he lift her with his right hand and hold her and she held her one leg straight up and another down after few minutes she roll herself and come to normal stage and walk and stand nearby another pair is standing', 'video6295': 'eight people are running in hurdel jumping race in 5 jumping stages and reach the destination both side people are standing and watching and person in blue shirt takes photo and red color shirt wearing person stand near him', 'video4315': 'a lorry with hydraulic jack lift its one end to decan some materialanother hydraulic equpment is destroying a building another jcb is collecting the debris and finally one hydraulic drilling machine put hole in the building', 'video2547': 'a persons say that in a swimming pool they have kept the very big round slide and they have to go to the swimming area thr

Transformer model (adapted from https://buomsoo-kim.github.io/attention/2020/04/21/Attention-mechanism-19.md/)

In [None]:
class TransformerNet(nn.Module):
  """Transformer encoder/decoder that emits log-probabilities over the target vocabulary.

  The target tokens are embedded and position-encoded inside the model; the
  source is passed to the encoder stack as given.

  Args:
    num_src_vocab: not used by this model (no source embedding is built);
      kept so existing constructor calls keep working.
    num_tgt_vocab: target vocabulary size; sizes both the target embedding
      and the output layer.
    embedding_dim: model width shared by the embedding, encoder and decoder.
    hidden_size: feed-forward width inside each encoder/decoder layer.
    nheads: number of attention heads.
    n_layers: number of stacked layers in the encoder and in the decoder.
    max_src_len: not used by this model; kept for call compatibility.
    max_tgt_len: maximum target length handled by the positional encoding.
    dropout: dropout probability inside the encoder/decoder layers.
  """

  def __init__(self, num_src_vocab, num_tgt_vocab, embedding_dim, hidden_size, nheads, n_layers, max_src_len, max_tgt_len, dropout):
    super().__init__()
    # embedding layer: sized from the constructor argument so it always agrees
    # with the output layer, instead of depending on a notebook global
    self.dec_embedding = nn.Embedding(num_tgt_vocab, embedding_dim)

    # positional encoding layer
    self.dec_pe = PositionalEncoding(embedding_dim, max_len = max_tgt_len)

    # encoder/decoder layers; forward() calls self.encoder, so it must exist
    enc_layer = nn.TransformerEncoderLayer(embedding_dim, nheads, hidden_size, dropout)
    dec_layer = nn.TransformerDecoderLayer(embedding_dim, nheads, hidden_size, dropout)
    self.encoder = nn.TransformerEncoder(enc_layer, num_layers = n_layers)
    self.decoder = nn.TransformerDecoder(dec_layer, num_layers = n_layers)

    # final dense layer
    self.dense = nn.Linear(embedding_dim, num_tgt_vocab)
    # dim=-1 normalises over the vocabulary axis; without an explicit dim a
    # 3-D input is normalised over its first axis (and a warning is raised)
    self.log_softmax = nn.LogSoftmax(dim=-1)

  def forward(self, src, tgt, tgt_mask=None, tgt_key_padding_mask=None):
    """Return log-probabilities shaped (tgt_len, batch, num_tgt_vocab).

    Args:
      src: source sequence fed straight to the encoder.
        NOTE(review): assumed to be float features shaped
        (src_len, batch, embedding_dim), since no source embedding exists
        here — confirm against the caller.
      tgt: integer token ids shaped (batch, tgt_len).
      tgt_mask: optional (tgt_len, tgt_len) attention mask for the decoder,
        e.g. a causal mask so a position cannot attend to later ones.
      tgt_key_padding_mask: optional (batch, tgt_len) mask of padded target
        positions.
    """
    # (batch, tgt_len, dim) -> (tgt_len, batch, dim): layers are sequence-first
    tgt = self.dec_embedding(tgt).permute(1, 0, 2)
    tgt = self.dec_pe(tgt)
    memory = self.encoder(src)
    transformer_out = self.decoder(
      tgt,
      memory,
      tgt_mask=tgt_mask,
      tgt_key_padding_mask=tgt_key_padding_mask,
    )
    final_out = self.dense(transformer_out)
    return self.log_softmax(final_out)

In [None]:
## source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    A table of shape (max_len, 1, d_model) is built once: even feature
    columns hold sines and odd columns hold cosines of the position scaled
    by geometrically spaced frequencies. The table is registered as a buffer
    named ``pe`` (not a parameter).

    Args:
        d_model: feature size of the inputs; odd values are supported.
        dropout: dropout probability applied after the addition.
        max_len: number of positions in the precomputed table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so div_term is trimmed to match and the assignment no
        # longer fails with a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model): the singleton axis
        # broadcasts over the second dimension of the input in forward().
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(0)`` positions, then apply dropout.

        Args:
            x: tensor whose first dimension indexes positions and whose last
                dimension is ``d_model``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)