# **Descarga y Análisis de Datos + Dataloader**


In [1]:
import os
import pickle

if not os.path.exists('/content/dual_encoding'):
  !git clone https://github.com/danieljf24/dual_encoding.git
  

Cloning into 'dual_encoding'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 169 (delta 5), reused 22 (delta 3), pack-reused 142[K
Receiving objects: 100% (169/169), 307.21 KiB | 10.97 MiB/s, done.
Resolving deltas: 100% (58/58), done.


## Descargar MSR-VTT Dataset y Word2Vec

In [0]:
#Ejecutar para descargar los datos
%cd /content

# download and extract dataset
!wget http://lixirong.net/data/cvpr2019/msrvtt10k-text-and-resnet-152-img1k.tar.gz
!tar zxf msrvtt10k-text-and-resnet-152-img1k.tar.gz

# download and extract pre-trained word2vec
!wget http://lixirong.net/data/w2vv-tmm2018/word2vec.tar.gz
!tar zxf word2vec.tar.gz

In [2]:
# Mount Google Drive under /content/drive (the call prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Run only if the archives are already stored in the mounted Drive folder below.
%cd /content/drive/My\ Drive/FCFM/Deep\ Learning/Proyecto

# extract the dataset archive from this folder into the cloned repository
!tar zxf msrvtt10k-text-and-resnet-152-img1k.tar.gz -C /content/dual_encoding

# extract the W2V archive from this folder into the cloned repository
!tar zxf word2vec.tar.gz -C /content/dual_encoding

/content/drive/My Drive/FCFM/Deep Learning/Proyecto


## Extraer el Vocabulario


In [4]:
%cd /content/dual_encoding

trainCollection = 'msrvtt10ktrain'
valCollection = 'msrvtt10kval'
rootpath = '/content/dual_encoding'

threshold=5
overwrite=0

#Extraer vocabulario del training set
text_styles = ['bow', 'rnn']
for text_style in text_styles:
  !python util/vocab.py --rootpath $rootpath $trainCollection --threshold $threshold --text_style $text_style --overwrite $overwrite
  

/content/dual_encoding
{
  "threshold": 5, 
  "text_style": "bow", 
  "rootpath": "/content/dual_encoding", 
  "collection": "msrvtt10ktrain", 
  "overwrite": 0
}
[29 Oct 16:18:56 - vocab.py:line 110] Saved vocabulary file to /content/dual_encoding/msrvtt10ktrain/TextData/vocabulary/bow/word_vocab_5.pkl
[29 Oct 16:18:56 - vocab.py:line 116] Saved vocabulary counter file to /content/dual_encoding/msrvtt10ktrain/TextData/vocabulary/bow/word_vocab_counter_5.txt
{
  "threshold": 5, 
  "text_style": "rnn", 
  "rootpath": "/content/dual_encoding", 
  "collection": "msrvtt10ktrain", 
  "overwrite": 0
}
[29 Oct 16:18:59 - vocab.py:line 110] Saved vocabulary file to /content/dual_encoding/msrvtt10ktrain/TextData/vocabulary/rnn/word_vocab_5.pkl
[29 Oct 16:18:59 - vocab.py:line 116] Saved vocabulary counter file to /content/dual_encoding/msrvtt10ktrain/TextData/vocabulary/rnn/word_vocab_counter_5.txt


## Importar esenciales para cargar data

In [5]:
# Switch into util/ before importing its modules (presumably so they resolve by
# bare name from the working directory — confirm).
%cd /content/dual_encoding/util

import data_provider as data
from text2vec import get_text_encoder
from basic.bigfile import BigFile
from vocab import Vocabulary

# Return to the repository root for the remaining imports.
%cd ..

from model import get_model, get_we_parameter
from basic.util import read_dict


/content/dual_encoding/util
/content/dual_encoding


## Carga de data

In [43]:
# Names of the visual feature directory and of the vocabulary file (without extension).
visual_feature = 'resnet-152-img1k-flatten0_outputos'
vocab = 'word_vocab_5'

# collections: train, val
collections = {'train': trainCollection, 'val': valCollection}
cap_file = {'train': '%s.caption.txt'%trainCollection, 
            'val': '%s.caption.txt'%valCollection}
# caption file path per split
caption_files = { x: os.path.join(rootpath, collections[x], 'TextData', cap_file[x])
                    for x in collections }

# Load visual features, one BigFile reader per split
visual_feat_path = {x: os.path.join(rootpath, collections[x], 'FeatureData', visual_feature)
                    for x in collections }
visual_feats = {x: BigFile(visual_feat_path[x]) for x in visual_feat_path}

visual_feat_dim = visual_feats['train'].ndims

# set bow vocabulary and encoding
# The pickle is the file written by util/vocab.py; only unpickle files you generated yourself.
bow_vocab_file = os.path.join(rootpath, trainCollection, 'TextData', 'vocabulary', 'bow', vocab+'.pkl')
with open(bow_vocab_file, 'rb') as vocab_fh:  # close the handle once the vocabulary is loaded
  bow_vocab = pickle.load(vocab_fh)
bow2vec = get_text_encoder('bow')(bow_vocab)
bow_vocab_size = len(bow_vocab)

# set rnn vocabulary 
rnn_vocab_file = os.path.join(rootpath, trainCollection, 'TextData', 'vocabulary', 'rnn', vocab+'.pkl')
with open(rnn_vocab_file, 'rb') as vocab_fh:
  rnn_vocab = pickle.load(vocab_fh)
vocab_size = len(rnn_vocab)

# word-embedding parameter obtained from the pre-trained word2vec file for the rnn vocabulary
w2v_data_path = os.path.join(rootpath, "word2vec", 'flickr', 'vec500flickr30m')
we_parameter = get_we_parameter(rnn_vocab, w2v_data_path)

# contents of each split's video2frames.txt (presumably video id -> frame ids; confirm)
video2frames = {x: read_dict(os.path.join(rootpath, collections[x], 'FeatureData', visual_feature, 'video2frames.txt'))
                for x in collections }

[BigFile] 305462x2048 instances loaded from /content/dual_encoding/msrvtt10ktrain/FeatureData/resnet-152-img1k-flatten0_outputos
[BigFile] 305462x2048 instances loaded from /content/dual_encoding/msrvtt10kval/FeatureData/resnet-152-img1k-flatten0_outputos


[29 Oct 20:36:23 - text2vec.py:line 13] text2vec.py.Bow2Vec initializing ...


[BigFile] 1743364x500 instances loaded from /content/dual_encoding/word2vec/flickr/vec500flickr30m
('getting pre-trained parameter for word embedding initialization', (7811, 500))


## Generar Dataloader

In [0]:
# Loader settings, passed positionally to get_data_loaders below.
batch_size, workers, n_caption = 128, 5, 20

# Build the data loaders from the caption files, visual features,
# rnn vocabulary and bow encoder prepared above.
data_loaders = data.get_data_loaders(
    caption_files,
    visual_feats,
    rnn_vocab,
    bow2vec,
    batch_size,
    workers,
    n_caption,
    video2frames=video2frames)

### Estructura de Dataloader

In [28]:
train_loader = data_loaders['train']

print("Estructura de un elemento del Dataloader")
print('Dataloader: {}'.format(type(train_loader)))
for i, train_data in enumerate(train_loader):
  # Only the first batch is described; stop as soon as a second one arrives.
  if i > 0:
    break
  captions, videos, b2v, v, v2f = train_data
  print('Len train_data: {}, {}'.format(len(train_data), type(train_data)))
  # Every entry except the last three is a nested container: report it and each member.
  for j in range(len(train_data) - 3):
    outer = train_data[j]
    print('Len train_data[{}]: {}, {}'.format(j, len(outer), type(outer)))
    for k in range(len(outer)):
      inner = outer[k]
      print('Len train_data[{}][{}]: {}, {}'.format(j, k, len(inner), type(inner)))
  # Entries 2, 3 and 4: report length, container type and the type of the first element.
  for pos in (2, 3, 4):
    flat = train_data[pos]
    print('Len train_data[{}]: {}, {} de {}'.format(pos, len(flat), type(flat), type(flat[0])))

Estructura de un elemento del Dataloader
Dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
Len train_data: 5, <type 'tuple'>
Len train_data[0]: 4, <type 'tuple'>
Len train_data[0][0]: 128, <class 'torch.Tensor'>
Len train_data[0][1]: 128, <class 'torch.Tensor'>
Len train_data[0][2]: 128, <type 'list'>
Len train_data[0][3]: 128, <class 'torch.Tensor'>
Len train_data[1]: 4, <type 'tuple'>
Len train_data[1][0]: 128, <class 'torch.Tensor'>
Len train_data[1][1]: 128, <class 'torch.Tensor'>
Len train_data[1][2]: 128, <type 'list'>
Len train_data[1][3]: 128, <class 'torch.Tensor'>
Len train_data[2]: 128, <type 'tuple'> de <type 'int'>
Len train_data[3]: 128, <type 'tuple'> de <type 'str'>
Len train_data[4]: 128, <type 'tuple'> de <type 'str'>


## Análisis del Vocabulario

In [0]:
# Report the sizes of the BoW and RNN vocabularies computed in the data-loading cell.
print('Cantidad de palabras del BoW: {}'.format(bow_vocab_size))
print('Cantidad de palabras del RNN: {}'.format(vocab_size))

Cantidad de palabras del BoW: 7807
Cantidad de palabras del RNN: 7811


El vocabulario del BoW está compuesto por 7807 palabras. El vocabulario del RNN tiene 7811 entradas: las mismas palabras más 4 tokens especiales que se usan al codificar las oraciones: \<pad>, \<start>, \<end> y \<unk>.

## Análisis de captions en training, validation y testing sets

### Training Set

In [0]:
# Training set: count the caption lines overall and per video.
print('Análisis Training Set')

train_caption_path = caption_files['train']
tc_cnt = 0    # total number of caption lines
tc_dict = {}  # text before the first '#' of a line (used as the video key) -> number of lines

# The context manager closes the file even if a line fails to parse.
with open(train_caption_path, 'r') as train_caption:
  for raw_line in train_caption:
    if not raw_line.strip():
      continue  # ignore blank lines instead of counting them as a video
    train_line = raw_line.split('#')[0]
    tc_dict[train_line] = tc_dict.get(train_line, 0) + 1
    tc_cnt += 1

# Keys with the most and the fewest captions.
key_max = max(tc_dict, key=tc_dict.get)
key_min = min(tc_dict, key=tc_dict.get)
print("Cantidad de captions en training set: {}".format(tc_cnt))
print("Máxima cantidad de captions de algún video: {}".format(tc_dict[key_max]))
print("Mínima cantidad de captions de algún video: {}".format(tc_dict[key_min]))



### Validation Set

In [0]:
# Validation set: count the caption lines overall and per video.
print('Análisis Validation Set')

val_caption_path = caption_files['val']
vc_cnt = 0    # total number of caption lines
vc_dict = {}  # text before the first '#' of a line (used as the video key) -> number of lines

# The context manager closes the file even if a line fails to parse.
with open(val_caption_path, 'r') as val_caption:
  for raw_line in val_caption:
    if not raw_line.strip():
      continue  # ignore blank lines instead of counting them as a video
    val_line = raw_line.split('#')[0]
    vc_dict[val_line] = vc_dict.get(val_line, 0) + 1
    vc_cnt += 1

# Keys with the most and the fewest captions.
key_max = max(vc_dict, key=vc_dict.get)
key_min = min(vc_dict, key=vc_dict.get)
print("Cantidad de captions en validation set: {}".format(vc_cnt))
print("Máxima cantidad de captions de algún video: {}".format(vc_dict[key_max]))
print("Mínima cantidad de captions de algún video: {}".format(vc_dict[key_min]))

### Testing Set

In [0]:
# Testing set: count the caption lines overall and per video.
print('Análisis Testing Set')

# NOTE(review): the previous path was the repository directory itself, which cannot
# be opened as a file. This assumes the dataset archive ships a 'msrvtt10ktest'
# collection laid out like the train/val collections — confirm before relying on it.
testCollection = 'msrvtt10ktest'
test_caption_path = os.path.join(rootpath, testCollection, 'TextData', '%s.caption.txt' % testCollection)
test_cnt = 0    # total number of caption lines
test_dict = {}  # text before the first '#' of a line (used as the video key) -> number of lines

# The context manager closes the file even if a line fails to parse.
with open(test_caption_path, 'r') as test_caption:
  for raw_line in test_caption:
    if not raw_line.strip():
      continue  # ignore blank lines instead of counting them as a video
    test_line = raw_line.split('#')[0]
    test_dict[test_line] = test_dict.get(test_line, 0) + 1
    test_cnt += 1

# Keys with the most and the fewest captions.
key_max = max(test_dict, key=test_dict.get)
key_min = min(test_dict, key=test_dict.get)
# Report the test counters (the earlier version printed the validation label and count).
print("Cantidad de captions en testing set: {}".format(test_cnt))
print("Máxima cantidad de captions de algún video: {}".format(test_dict[key_max]))
print("Mínima cantidad de captions de algún video: {}".format(test_dict[key_min]))

## Análisis W2V

In [42]:
import numpy as np

# Open the pre-trained word2vec file and report its shape.
w2v_reader = BigFile(w2v_data_path)
print('Tamaño del embedding: {}'.format(w2v_reader.shape()))
ndims = w2v_reader.ndims

# Collect one vector per rnn vocabulary index.
we = []
for i in range(len(rnn_vocab)):
  try:
      vec = w2v_reader.read_one(rnn_vocab.idx2word[i])
  except Exception:
      # The lookup failed for this word (presumably it is absent from the word2vec
      # file — confirm); fall back to a random vector of the same dimensionality.
      # `Exception` rather than a bare `except`, so interrupting the kernel still works.
      vec = np.random.uniform(-1, 1, ndims)
  we.append(vec)
  
print('Tamaño vocabulario: {}'.format(len(we)))
print('Tamaño del vector representativo de cada palabra: {}'.format(len(we[0])))


[BigFile] 1743364x500 instances loaded from /content/dual_encoding/word2vec/flickr/vec500flickr30m
Tamaño del embedding: [1743364, 500]
Tamaño vocabulario: 7811
Tamaño del vector representativo de cada palabra: 500


Vemos que se cargan 1.743.364 instancias del archivo W2V, donde cada instancia es un vector de dimensión 500 que representa una palabra.