In [1]:
import os

from bwmd.compressor import Compressor
from bwmd.distance import BWMD
from bwmd.partition import build_partitions_lookup_tables
from bwmd.tools import load_vectors, convert_vectors_to_dict

In [2]:
# --- Configuration: values a reader may want to tune -----------------------

# Real-valued source embeddings. This path is handed to load_vectors,
# compressor.transform and build_partitions_lookup_tables further down.
PATH = 'crawl-300d-2M.vec'

# dtype name of the compressed vectors; passed as `compression` to the
# Compressor and as `expected_dtype` when the exported vectors are reloaded.
COMPRESSION = 'bool_'

# Number of components per vector before and after compression.
# NOTE(review): REDUCED_DIM is larger than ORIGINAL_DIM; presumably the
# saving comes from storing 512 boolean components instead of 300 floats --
# confirm against the bwmd documentation.
ORIGINAL_DIM = 300
REDUCED_DIM = 512

In [3]:
# Read the real-valued embeddings that the compressor is later fitted on.
# Only this many entries are requested from PATH.
N_FIT_VECTORS = 100_000

# skip_first_line=True is passed because the file's first line is not a
# vector (presumably the fastText "<count> <dim>" header -- confirm).
vectors, words = load_vectors(
    path=PATH,
    size=N_FIT_VECTORS,
    expected_dimensions=ORIGINAL_DIM,
    skip_first_line=True,
)

100%|████████████████████████████████████████████████████████████████████| 100000/100000 [00:20<00:00, 4903.58it/s]


In [4]:
# Instantiate the compressor.
# It is configured with the width of the real-valued input vectors
# (ORIGINAL_DIM), the width of the compressed output (REDUCED_DIM) and the
# dtype name used for the compressed representation (COMPRESSION).
# Nothing is trained here; fitting happens in a separate step.
compressor = Compressor(
    original_dimensions=ORIGINAL_DIM,
    reduced_dimensions=REDUCED_DIM,
    compression=COMPRESSION,
)

In [5]:
# Fit the compressor to the real-valued vectors for 10 epochs.
# In the recorded run the loss falls quickly through epoch 5 and is almost
# flat (about 435) from epoch 6 onward.
# NOTE: `vectors` must still be the real-valued data here. A later cell
# rebinds `vectors` to the compressed vectors, so re-running this cell out of
# order would fit on the wrong data -- re-run the loading cell first.
compressor.fit(vectors, epochs=10)

Epoch:  0		Loss:  4156.976
Epoch:  1		Loss:  2285.002
Epoch:  2		Loss:  1316.982
Epoch:  3		Loss:  817.099
Epoch:  4		Loss:  573.012
Epoch:  5		Loss:  466.862
Epoch:  6		Loss:  436.262
Epoch:  7		Loss:  435.91
Epoch:  8		Loss:  435.609
Epoch:  9		Loss:  434.806


In [6]:
# Transform and save the original vectors.
# The compressor reads the vectors from PATH itself (n_vectors=30_000 of
# them) rather than taking the in-memory `vectors`; with save=True the
# recorded log shows them being encoded and exported.
# The return value is kept as `output_dir`; the following cell uses it as the
# location of the exported `vectors.txtc` file.
output_dir = compressor.transform(PATH, save=True, n_vectors=30_000)

100%|██████████████████████████████████████████████████████████████████████| 30000/30000 [00:04<00:00, 6214.70it/s]


Encoding vectors ...
Exporting compressed vectors ...


In [7]:
# Load the compressed vectors that compressor.transform exported.
# The path is built with os.path.join so the platform's own separator is
# used; the previous hard-coded '\\' only worked on Windows.
# NOTE: this rebinds `vectors` and `words`, replacing the real-valued data
# loaded earlier. Reload that data before re-running compressor.fit.
vectors, words = load_vectors(
    path=os.path.join(output_dir, 'vectors.txtc'),
    size=30_000,  # same count as n_vectors passed to compressor.transform
    expected_dimensions=REDUCED_DIM,
    expected_dtype=COMPRESSION,
)
# Convert to dict (presumably keyed by word -- confirm); this is the object
# passed to build_partitions_lookup_tables.
vectors_compressed = convert_vectors_to_dict(vectors, words)

100%|█████████████████████████████████████████████████████████████████████| 30000/30000 [00:01<00:00, 28299.85it/s]


In [8]:
# Build and save the lookup tables from the compressed vectors.
# This is the slow step of the notebook: the recorded run reports about 540 s
# for the partitionings plus about 242 s for the lookup tables.
#   I=11             -- the recorded log says "partitionings of size 2048",
#                       and 2048 == 2**11. NOTE(review): presumably I sets
#                       the partition size as a power of two -- confirm.
#   real_value_path  -- path of the real-valued vectors; the log shows raw
#                       vectors being loaded and cosine distances computed.
#   vector_dim       -- width of the compressed vectors (REDUCED_DIM).
# The return value is kept as `model_path` and later passed to BWMD.
model_path = build_partitions_lookup_tables(
    vectors_compressed,
    I=11,
    real_value_path=PATH,
    vector_dim=REDUCED_DIM,
)

Making 100 partitionings of size 2048


100%|████████████████████████████████████████████████████████████████████████████| 100/100 [09:00<00:00,  5.40s/it]


Time to compute partitionings:  540.234
Loading partitionings ...


100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  7.08it/s]


Organizing associated words for all tokens ...


100%|███████████████████████████████████████████████████████████████████████| 28117/28117 [01:48<00:00, 258.06it/s]


Loading raw vectors ...


100%|██████████████████████████████████████████████████████████████████████| 28117/28117 [00:05<00:00, 5410.12it/s]


Loading wordlists ...


100%|███████████████████████████████████████████████████████████████████████| 28116/28116 [01:32<00:00, 303.46it/s]


Computing cosine distances for each token ...


100%|██████████████████████████████████████████████████████████████████████| 28116/28116 [00:13<00:00, 2096.15it/s]


Time to compute lookup tables:  241.579


'crawl-300d-2M'

In [12]:
# Instantiate the BWMD distance object from the model built above.
#   model_path -- value returned by build_partitions_lookup_tables.
#   dim        -- width of the compressed vectors (REDUCED_DIM).
#   size_vocab -- 30_000, the same count used when the vectors were
#                 transformed and reloaded. NOTE(review): presumably these
#                 must agree -- confirm.
#   language   -- "english". NOTE(review): presumably selects
#                 language-specific preprocessing -- confirm.
bwmd = BWMD(
    model_path=model_path,
    dim=REDUCED_DIM,
    size_vocab=30_000,
    language="english",
)

100%|█████████████████████████████████████████████████████████████████████| 30000/30000 [00:00<00:00, 30597.10it/s]


In [13]:
# Example documents: two sentences describing the same kind of event in
# different words, plus one sentence that is unrelated to both.
raw_texts = (
    'Obama speaks to the media in Illinois',
    'The President greets the press in Chicago',
    'This sentence is unrelated',
)

# The distance measures expect every document as a list of string tokens,
# so split each text on whitespace before BWMD's own preprocessing.
tokenized_texts = list(map(str.split, raw_texts))

# `corpus` is bound once, to the final preprocessed form used for distances.
corpus = bwmd.preprocess_corpus(tokenized_texts)

In [14]:
# Get the pairwise distances between all documents in the corpus.
# Left as the last expression so the notebook displays the result. In the
# recorded run this is a 3x3 symmetric array with zeros on the diagonal,
# where the first two documents are the closest pair (about 0.257).
bwmd.pairwise(corpus)

100%|███████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 187.63it/s]


array([[0.        , 0.25683594, 0.29711914],
       [0.25683594, 0.        , 0.27783203],
       [0.29711914, 0.27783203, 0.        ]])