In [1]:
!pip install faiss-cpu torch sentence-transformers ipywidgets

You should consider upgrading via the '/Users/kalyan/.pyenv/versions/3.8.7/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
import pandas as pd
import faiss
import torch
import gzip
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm
from IPython.display import display, Markdown

import time

# Relax pandas' display limits so result tables are not truncated:
# up to 500 rows, every column, and full (unclipped) cell contents.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

## 1. Import data and load models

In [2]:
def load_embeddings(filename):
    """Load a gzip-compressed pickle of embeddings.

    Args:
        filename: path to a ``.pkl.gz`` file.

    Returns:
        The unpickled object stored in the file.

    The file is decompressed and unpickled exactly once, and the handle is
    closed even if unpickling fails.
    """
    # NOTE: unpickling can execute arbitrary code - only load files from a
    # trusted source.
    with gzip.open(filename, 'rb') as emb_f:
        return pickle.load(emb_f)


In [3]:
# One entry per embedding model: the sentence-transformers model name, the
# gzip-pickled embedding vectors, and the gzip-pickled mapping table.
datasets = [
    {
        "model_name": "multi-qa-MiniLM-L6-cos-v1",
        "vectors_path": "../../s3-buckets/cpr-datasets/policy_text_embeddings_multi-qa-MiniLM-L6-cos-v1.pkl.gz",
        "mappings_path": "../../s3-buckets/cpr-datasets/policy_text_embeddings_mapping_multi-qa-MiniLM-L6-cos-v1.pkl.gz",
    },
    {
        "model_name": "msmarco-distilbert-dot-v5",
        "vectors_path": "../../s3-buckets/cpr-datasets/policy_text_embeddings_msmarco-distilbert-dot-v5.pkl.gz",
        "mappings_path": "../../s3-buckets/cpr-datasets/policy_text_embeddings_mapping_msmarco-distilbert-dot-v5.pkl.gz",
    },
]

# because of RAM limitations we can only run one dataset at a time
# Only this selected entry gets "mapping", "vectors" and "model" keys added
# below; the other entry in `datasets` keeps just its name and paths.
dset = datasets[1]

# NOTE(review): both loaders unpickle their input - only use trusted files.
dset["mapping"] = pd.read_pickle(dset['mappings_path'], compression="gzip")
dset["vectors"] = load_embeddings(dset['vectors_path'])

# Encoder used later to embed the search queries with the same model.
dset["model"] = SentenceTransformer(dset["model_name"])

# Sanity check: vectors and mapping should have the same number of rows.
print(f"{dset['model_name']}: vectors {dset['vectors'].shape}; mapping {dset['mapping'].shape}")

msmarco-distilbert-dot-v5: vectors torch.Size([1666918, 768]); mapping (1666918, 4)


### 1a. Process data

In [4]:
# Remove short phrases
# Rows of the mapping table and rows of the vectors tensor are aligned by
# position, so both are filtered with the same positional boolean mask. The
# mapping index is then reset so that row labels equal row positions again.
# This keeps the cell idempotent: re-running it on already-filtered data
# selects every row instead of indexing the vectors with stale labels.
min_no_words = 4

dset['mapping']['text_no_words'] = dset['mapping']['text'].apply(lambda i: len(i.split(" ")))

keep_rows = (dset['mapping']['text_no_words'] >= min_no_words).to_numpy()
assert len(keep_rows) == dset['vectors'].shape[0], "mapping and vectors have different numbers of rows"

dset['vectors'] = dset['vectors'][torch.from_numpy(keep_rows)]
dset['mapping'] = dset['mapping'][keep_rows].reset_index(drop=True)

## 2. Create faiss indices

In [5]:
# Models with "-cos-" in their name get their vectors L2-normalised first, so
# that the inner product computed by the index equals cosine similarity.
# Other models are indexed with their raw vectors (plain dot product).
if "-cos-" in dset["model_name"]:
    dset["vectors"] = torch.nn.functional.normalize(
            dset["vectors"], p=2, dim=1
        )

# Flat inner-product index whose dimension is the embedding width.
dset['faiss'] = faiss.IndexFlatIP(dset['vectors'].shape[1])
# NOTE(review): .numpy() presumably requires the tensor to live on the CPU - confirm.
dset['faiss'].add(dset['vectors'].numpy())

print(f"{dset['model_name']} created")
        

msmarco-distilbert-dot-v5 created


In [6]:
def query_index(query: str, dset: dict, n_res: int = 20):
    """Embed `query` and return its nearest neighbours from the faiss index.

    Args:
        query: free-text search query.
        dset: dataset dict providing 'model' (encoder with `.encode`),
            'model_name', 'faiss' (index with `.search`) and 'mapping'
            (DataFrame whose row positions match the indexed vectors).
        n_res: maximum number of results to return.

    Returns:
        DataFrame with the columns of `dset['mapping']` plus a `similarity`
        column holding the score returned by the index, in the order the
        index returned them, with a fresh 0..k-1 index. Fewer than `n_res`
        rows are returned when the index holds fewer vectors than requested.

    Prints the elapsed time for encoding plus search as a side effect.
    """
    start = time.perf_counter()
    query_emb = dset['model'].encode([query])
    if "-cos-" in dset["model_name"]:
        # cosine models: normalise the query the same way the corpus vectors were
        query_emb = query_emb / np.linalg.norm(query_emb, ord=2, axis=1, keepdims=True)

    distances, idxs = dset['faiss'].search(query_emb, n_res)

    # faiss pads the result with label -1 when it finds fewer than n_res hits;
    # passing -1 to .iloc would silently return the last mapping row instead.
    found = idxs[0] >= 0

    # Select all hits in one positional lookup instead of row-by-row.
    results = (
        dset['mapping']
        .iloc[idxs[0][found]]
        .assign(similarity=distances[0][found])
        .reset_index(drop=True)
    )
    end = time.perf_counter()
    print(f"took {round(end-start, 1)}s")

    return results

### 3.1 Run the example queries against the selected model

In [15]:
# Example search queries; each one is run against the index by the search loop below.
queries = [
    "transport levies in europe",
    "renewable energy",
    "deforestation in Brazil",
    "carbon tax",
    "net zero target",
    "electric vehicle charging",
    "incentives for innovation",
    "research and development investment",
    "nature-based solutions",
]

In [16]:
# Number of results to retrieve per query.
N = 30

# One results frame per query, each tagged with the query that produced it.
search_results = []

for query in queries:
    # Show the query as a heading so the timing line printed by query_index
    # can be attributed to it.
    display(Markdown(f"### query: *{query}*"))

    res = query_index(query, dset=dset, n_res=N).assign(query=query)
    search_results.append(res)


### query: *transport levies in europe*

took 0.3s


### query: *renewable energy*

took 0.1s


### query: *deforestation in Brazil*

took 0.1s


### query: *carbon tax*

took 0.1s


### query: *net zero target*

took 0.1s


### query: *electric vehicle charging*

took 0.2s


### query: *incentives for innovation*

took 0.1s


### query: *research and development investment*

took 0.1s


### query: *nature-based solutions*

took 0.1s


In [17]:
# Stack the per-query result frames, keep the columns of interest, and write
# them to a CSV named after the model.
(
    pd.concat(search_results)
    .loc[:, ['query', 'text', 'text_id', 'similarity', 'policy_id', 'page_id']]
    .to_csv(f"results_{dset['model_name']}.csv")
)

## Appendix 1: measuring query time
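This needs the imports plus the data loading and processing cells (sections 1 and 1a), because the sample queries are drawn from the loaded mapping table and its `text_no_words` column. The faiss index and query cells are not required.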

In [27]:
# Sample short texts from the corpus to use as stand-in queries for timing.
query_length_range = (1, 6) # inclusive
no_samples = 500

model_names = [d['model_name'] for d in datasets]

# Only the selected dataset (`dset`) has its mapping loaded, so read the texts
# from it; indexing `datasets[0]` directly raises KeyError whenever another
# entry is the selected one.
mapping_data = dset['mapping']

# Series.between is inclusive on both ends by default, matching the range above.
text_data = mapping_data.loc[
    mapping_data['text_no_words'].between(*query_length_range),
    'text'
].sample(no_samples, random_state=42).tolist()

In [30]:
def measure_time(model_name, text_samples):
    """Time how long a model takes to encode each text, one call per text.

    The model is loaded before the clock starts, so only encoding is timed.
    Prints the total and per-encoding time; returns nothing.
    """
    encoder = SentenceTransformer(model_name)

    started_at = time.time()
    for text in text_samples:
        encoder.encode(text)
    elapsed = time.time() - started_at

    time_taken = round(elapsed, 2)
    second_per_enc = round(elapsed / len(text_samples), 4)

    print(f"{model_name} completed {len(text_samples)} encodings in {time_taken}s ({second_per_enc}s/encoding)")


# Benchmark every configured model on the same sample of texts.
for model_name in model_names:
    measure_time(model_name, text_data)

multi-qa-MiniLM-L6-cos-v1 completed 500 encodings in 8.07s (0.0161s/encoding)
msmarco-distilbert-dot-v5 completed 500 encodings in 25.84s (0.0517s/encoding)


## Appendix 2: reducing dimension of embeddings

In [9]:
from sklearn.decomposition import PCA

In [8]:
# Guard: this appendix is only meant to run on the model designated as best,
# so stop here if a different dataset is currently loaded.
best_model = "msmarco-distilbert-dot-v5"
assert dset['model_name'] == best_model

# Show the size of the vectors that are about to be reduced.
dset['vectors'].shape

torch.Size([1556223, 768])

In [12]:
# create dim reductions
# Fit one PCA per target dimensionality and store the projected vectors in
# `pcas`, keyed by the number of components.
random_state = 42
# Successive halvings of the 768-dimensional embeddings.
target_dims = [384, 192, 96, 48]

pcas = {}

# sklearn
# (CPU alternative, kept for reference; uses the sklearn PCA imported earlier)
# for t in tqdm(target_dims):
#     pcas[t] = PCA(
#         n_components=t,
#         random_state=random_state
#     ).fit_transform(dset['vectors'])

# cuml - needs GPU
# NOTE(review): this import rebinds the name `PCA`, shadowing the sklearn
# class imported earlier in this appendix.
from cuml.decomposition import PCA

# NOTE(review): dset['vectors'] is a torch tensor at this point - confirm that
# cuml accepts it directly. The recorded run of this cell was interrupted.
for t in tqdm(target_dims):
    pca = PCA(n_components=t, random_state=random_state)
    pcas[t] = pca.fit_transform(dset['vectors'])

  0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 