In [3]:
import numpy as np
import pandas as pd
import os
from pathlib import Path

In [4]:
# Resolve the dataset directory: a "data" folder located beside the
# directory the kernel is currently running in (i.e. <cwd>/../data).
working_dir = Path(os.getcwd())
data_path = working_dir.parent / "data"
# Fail fast if the notebook was started from an unexpected location.
assert data_path.exists()

# Read WICSMMIR v2 (train and test splits)

In [5]:
# Locate the two WICSMMIR v2 split files and make sure both are present
# before attempting to read them.
wicsmmir_v2_test_p = data_path / "v2/test_set_v2.df.feather"
wicsmmir_v2_train_p = data_path / "v2/train_set_v2.df.feather"
for split_p in (wicsmmir_v2_test_p, wicsmmir_v2_train_p):
    assert split_p.exists()

# Load each split, then stack test on top of train. ignore_index=True gives
# the combined frame a fresh 0..n-1 row index.
wicsmmir_v2_test = pd.read_feather(wicsmmir_v2_test_p)
wicsmmir_v2_train = pd.read_feather(wicsmmir_v2_train_p)
wicsmmir_v2 = pd.concat(
    [wicsmmir_v2_test, wicsmmir_v2_train],
    ignore_index=True,
    verify_integrity=True,
)

# Sanity report: column names plus row counts, so it is visible that the
# combined length equals the sum of the two splits.
print(
    f"wicsmmir_v2.columns: {wicsmmir_v2.columns}",
    f"len(wicsmmir_v2_test): {len(wicsmmir_v2_test)}",
    f"len(wicsmmir_v2_train): {len(wicsmmir_v2_train)}",
    f"len(wicsmmir_v2): {len(wicsmmir_v2)}",
    f"len(wicsmmir_v2_test) + len(wicsmmir_v2_train): {len(wicsmmir_v2_test) + len(wicsmmir_v2_train)}",
    sep="\n",
)

wicsmmir_v2.columns: Index(['index', 'wikicaps_id', 'caption'], dtype='object')
len(wicsmmir_v2_test): 9380
len(wicsmmir_v2_train): 386494
len(wicsmmir_v2): 395874
len(wicsmmir_v2_test) + len(wicsmmir_v2_train): 395874


# Generate the vocabulary for WICSMMIR v2 with spaCy

In [8]:
import sys
# Add the parent directory to the module search path so the `utils` import
# below can be resolved. A relative entry such as '..' is interpreted against
# the current working directory, so this must run before the import.
# NOTE(review): presumably utils.py sits one level above this notebook -- confirm.
sys.path.append('..')
from utils import generate_corpus_vocab

In [9]:
# Worker count passed as the second positional argument to generate_corpus_vocab.
# NOTE(review): presumably the number of parallel spaCy workers -- confirm in utils.
n_spacy_workers = 8

# Build the vocabulary from the combined (test + train) frame with the spaCy backend.
# This is slow: the logged run below took ~730 s (about 12 minutes) for 395,874 rows.
vocab = generate_corpus_vocab(wicsmmir_v2, n_spacy_workers, backend='spacy')

2021-04-15 18:09:38.132 | INFO     | utils:generate_corpus_vocab:149 - Generating corpus vocabulary using SPACY...
100%|██████████| 395874/395874 [12:09<00:00, 542.95it/s]
2021-04-15 18:21:48.755 | INFO     | utils:generate_corpus_vocab:171 - Finished generating corpus vocabulary in 730.6183226108551 seconds!


In [17]:
# Order the vocabulary from the most to the least frequent entry. The sorted
# frame is rebound to `vocab` rather than sorted with inplace=True, so the
# frame object is not mutated in place.
vocab = vocab.sort_values(by=['count'], ascending=False)

# Feather requires a default index, so the index is converted back into
# ordinary columns before the frame is written to disk.
vocab.reset_index().to_feather(data_path.joinpath('v2/vocab_spacy_v2.df.feather'))

# Display the sorted vocabulary as the cell output (to_feather returns None,
# so without this line the cell would render nothing).
vocab

Unnamed: 0_level_0,Unnamed: 1_level_0,count
token,pos,Unnamed: 2_level_1
",",PUNCT,1665285
the,DET,813731
.,PUNCT,719122
of,ADP,555006
in,ADP,393055
...,...,...
Sarrazin,PROPN,1
Coroutine,PROPN,1
communs,PROPN,1
Sustained,VERB,1
