In [1]:
!pip install faiss-cpu torch sentence-transformers ipywidgets

Collecting torch
  Downloading torch-1.8.1-cp38-none-macosx_10_9_x86_64.whl (119.6 MB)
[K     |████████████████████████████████| 119.6 MB 446 kB/s  eta 0:00:01
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.9.1
    Uninstalling torch-1.9.1:
      Successfully uninstalled torch-1.9.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
policy-search 0.1.0 requires torch<2.0.0,>=1.9.1, but you have torch 1.8.1 which is incompatible.
farm-haystack 0.10.0 requires elasticsearch<=7.10,>=7.7, but you have elasticsearch 7.15.1 which is incompatible.[0m
Successfully installed torch-1.8.1
You should consider upgrading via the '/Users/kalyan/Documents/CPR/policy-search/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import faiss
import torch
import gzip
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm
from IPython.display import display, Markdown

import time

# Display settings for result tables: up to 500 rows, every column, untruncated cell text,
# and a 1000-character wide console representation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = 1000

## 1. Import data and load models

In [3]:
def load_embeddings(filename):
    """Load a gzip-compressed pickle of embeddings.

    The file is decompressed and unpickled exactly once, streaming from the gzip
    handle, so the raw decompressed bytes are never held in memory next to the
    unpickled object.

    Args:
        filename: path to the gzip-compressed pickle file.

    Returns:
        The unpickled object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
    with gzip.open(filename, 'rb') as emb_f:
        return pickle.load(emb_f)


In [7]:
# One entry per embedding model: the model name plus the paths of its vectors and mapping files.
datasets = [
    dict(
        model_name="multi-qa-MiniLM-L6-cos-v1",
        vectors_path="../../../s3-buckets/cpr-datasets/policy_text_embeddings_multi-qa-MiniLM-L6-cos-v1.pkl.gz",
        mappings_path="../../../s3-buckets/cpr-datasets/policy_text_embeddings_mapping_multi-qa-MiniLM-L6-cos-v1.pkl.gz",
    ),
    dict(
        model_name="msmarco-distilbert-dot-v5",
        vectors_path="../../../s3-buckets/cpr-datasets/policy_text_embeddings_msmarco-distilbert-dot-v5.pkl.gz",
        mappings_path="../../../s3-buckets/cpr-datasets/policy_text_embeddings_mapping_msmarco-distilbert-dot-v5.pkl.gz",
    ),
]

# because of RAM limitations we can only run one dataset at a time
dset = datasets[1]

# Attach the loaded artefacts to the selected entry: mapping table, embedding vectors, encoder.
dset["mapping"] = pd.read_pickle(dset["mappings_path"], compression="gzip")
dset["vectors"] = load_embeddings(dset["vectors_path"])
dset["model"] = SentenceTransformer(dset["model_name"])

print(f"{dset['model_name']}: vectors {dset['vectors'].shape}; mapping {dset['mapping'].shape}")

msmarco-distilbert-dot-v5: vectors torch.Size([1666918, 768]); mapping (1666918, 4)


### 1a. Process data

In [8]:
# Remove short phrases
min_no_words = 4

dset['mapping']['text_no_words'] = dset['mapping']['text'].apply(lambda i: len(i.split(" ")))
dset['mapping'] = dset['mapping'][dset['mapping']['text_no_words'] >= min_no_words]

dset['vectors'] = dset['vectors'][dset['mapping'].index, :]

## 2. Create faiss indices

In [9]:
# For models with "-cos-" in their name, scale every vector to unit L2 norm so that the
# inner-product index below scores them by cosine similarity.
if "-cos-" in dset["model_name"]:
    dset["vectors"] = torch.nn.functional.normalize(dset["vectors"], p=2, dim=1)

# Flat inner-product index sized to the embedding dimension, filled with all vectors.
dset['faiss'] = faiss.IndexFlatIP(dset['vectors'].size(1))
dset['faiss'].add(dset['vectors'].numpy())

print(f"{dset['model_name']} created")
        

msmarco-distilbert-dot-v5 created


In [37]:
def query_index(query: str, dset: dict, n_res: int = 20):
    """Search the dataset's faiss index for the texts most similar to `query`.

    Args:
        query: free-text search query.
        dset: dataset dict providing 'model' (an encoder with `.encode`), 'model_name',
            'faiss' (the index to search) and 'mapping' (DataFrame whose row positions
            match the positions of the vectors in the index).
        n_res: maximum number of results to return.

    Returns:
        DataFrame with one row per hit, in the order returned by the index: the columns
        of the matching mapping row plus a 'similarity' column. The elapsed time is
        printed as a side effect.
    """
    start = time.time()
    query_emb = dset['model'].encode([query])
    if "-cos-" in dset["model_name"]:
        # the query gets the same L2 normalisation as the indexed vectors of "-cos-" models
        query_emb = query_emb / np.linalg.norm(query_emb, ord=2, axis=1, keepdims=True)

    distances, idxs = dset['faiss'].search(query_emb, n_res)

    results = []
    for idx, similarity in zip(idxs[0], distances[0]):
        # faiss pads the result with label -1 when it finds fewer than n_res hits;
        # `.iloc[-1]` would silently return the last mapping row, so skip those slots.
        if idx < 0:
            continue

        row = dset['mapping'].iloc[idx].to_dict()
        row["similarity"] = similarity
        results.append(row)
    end = time.time()
    print(f"took {round(end-start, 1)}s")

    return pd.DataFrame(results)

### 3.1 Run the queries against the loaded model

In [35]:
# Search queries that the cells below run against the faiss index.
queries = [
    "transport levies in europe",
    "renewable energy",
    "deforestation in Brazil",
    "carbon tax",
    "net zero target",
    "electric vehicle charging",
    "incentives for innovation",
    "research and development investment",
    "nature-based solutions",
]

In [38]:
N = 30  # number of hits requested per query

search_results = []

for query in queries:
    # Heading first, so the timing line printed by query_index appears beneath it.
    display(Markdown(f"### query: *{query}*"))

    res = query_index(query, dset=dset, n_res=N)
    res['query'] = query  # tag every hit with the query that produced it

    search_results.append(res)


### query: *transport levies in europe*

took 11.5s


### query: *renewable energy*

took 1.7s


### query: *deforestation in Brazil*

took 0.4s


### query: *carbon tax*

took 0.4s


### query: *net zero target*

took 0.3s


### query: *electric vehicle charging*

took 0.4s


### query: *incentives for innovation*

took 0.8s


### query: *research and development investment*

took 0.3s


### query: *nature-based solutions*

took 0.3s


In [17]:
# pd.concat(search_results)[['query', 'text', 'text_id', 'similarity', 'policy_id', 'page_id',]].to_csv(f"results_{dset['model_name']}.csv")

## Appendix 1: measuring query time
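This only needs the imports, the `datasets` list from section 1 and a loaded mapping that already has the `text_no_words` column added in section 1a; the faiss index is not required.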

In [27]:
query_length_range = (1, 6) # inclusive
no_samples = 500

model_names = [d['model_name'] for d in datasets]

# Only the selected dataset (`dset`) has its mapping loaded; the other entries of `datasets`
# carry no 'mapping' key, so reading `datasets[0]['mapping']` fails unless dataset 0 is selected.
mapping_data = dset['mapping']

# Draw a reproducible sample of texts whose word count lies in the (inclusive) range;
# these stand in for typical short search queries.
text_data = mapping_data.loc[
    mapping_data['text_no_words'].between(*query_length_range),
    'text'
].sample(no_samples, random_state=42).tolist()

In [30]:
def measure_time(model_name, text_samples):
    """Time how long a model takes to encode each text sample one at a time.

    The encoder is constructed before the clock starts, so only the encode calls
    are measured. Prints the total and per-encoding time; returns nothing.

    Args:
        model_name: name passed to `SentenceTransformer` to build the encoder.
        text_samples: sized collection of texts, each encoded in its own call.
    """
    encoder = SentenceTransformer(model_name)

    started_at = time.time()
    for sample in text_samples:
        encoder.encode(sample)
    elapsed = time.time() - started_at

    second_per_enc = round(elapsed / len(text_samples), 4)
    time_taken = round(elapsed, 2)

    print(f"{model_name} completed {len(text_samples)} encodings in {time_taken}s ({second_per_enc}s/encoding)")


# Time every model listed in `model_names` on the same `text_data` samples.
for model_name in model_names:
    measure_time(model_name, text_data)

multi-qa-MiniLM-L6-cos-v1 completed 500 encodings in 8.07s (0.0161s/encoding)
msmarco-distilbert-dot-v5 completed 500 encodings in 25.84s (0.0517s/encoding)


## Appendix 2: reducing dimension of embeddings with `faiss`

The comparison below shows that reducing the embeddings to even three quarters of their original length (768 → 576 dimensions) leads to vastly different (and worse) results. After PCA, the returned phrases also tend to be short ones again.

In [46]:
# Output dimensions to try for the PCA-reduced index; the smaller sizes are currently commented out.
target_dims = [576, 384,]# 192, 96, 48]

In [47]:
# NOTE(review): this redefinition shadows the earlier `query_index(query, dset, n_res)`;
# cells written for that signature will fail if re-run after this cell has executed.
def query_index(index, query: str, dset: dict, n_res: int = 20):
    """Search an explicitly given faiss index for the texts most similar to `query`.

    Args:
        index: object with a faiss-style `.search(query_emb, n_res)` method.
        query: free-text search query.
        dset: dataset dict providing 'model' (an encoder with `.encode`), 'model_name'
            and 'mapping' (DataFrame whose row positions match the positions of the
            vectors in `index`).
        n_res: maximum number of results to return.

    Returns:
        DataFrame with one row per hit, in the order returned by the index: the columns
        of the matching mapping row plus a 'similarity' column.
    """
    query_emb = dset['model'].encode([query])
    if "-cos-" in dset["model_name"]:
        # the query gets the same L2 normalisation as the indexed vectors of "-cos-" models
        query_emb = query_emb / np.linalg.norm(query_emb, ord=2, axis=1, keepdims=True)

    distances, idxs = index.search(query_emb, n_res)

    results = []
    for idx, similarity in zip(idxs[0], distances[0]):
        # faiss pads the result with label -1 when it finds fewer than n_res hits;
        # `.iloc[-1]` would silently return the last mapping row, so skip those slots.
        if idx < 0:
            continue

        row = dset['mapping'].iloc[idx].to_dict()
        row["similarity"] = similarity
        results.append(row)

    return pd.DataFrame(results)

In [48]:
N = 30

pca_experiment_results = {}

# numpy array of the embedding matrix, used both to train and to fill each reduced index
full_vectors = dset['vectors'].numpy()

for dim in tqdm(target_dims):
    # PCA projection down to `dim` dimensions, chained in front of a flat inner-product index.
    pca = faiss.PCAMatrix(
        d_in=full_vectors.shape[1],
        d_out=dim,
        random_rotation=False
    )
    faiss_small = faiss.IndexFlatIP(dim)
    pipeline_small = faiss.IndexPreTransform(pca, faiss_small)

    pipeline_small.train(full_vectors)
    pipeline_small.add(full_vectors)

    # Run every query against the reduced index and tag the hits with their query.
    temp_results = []
    for query in queries:
        res = query_index(pipeline_small, query, dset=dset, n_res=N)
        res['query'] = query
        temp_results.append(res)

    pca_experiment_results[dim] = pd.concat(temp_results)


  0%|          | 0/2 [00:00<?, ?it/s]

In [55]:
# Rows 30-48 of the concatenated search results from the unreduced index.
pd.concat(search_results).iloc[30:49]

Unnamed: 0,policy_id,page_id,text,text_id,text_no_words,similarity,query
0,1217,0,"Renewable Energy or RE is any energy source generated from natural resources which are naturally regenerative or replenished such as solar, wind, ocean, geothermal, hydro and biomass.",1341542,27,82.803665,renewable energy
1,1175,76,"Renewable energy Is obtained from the continuing or repetitive currents of energy occurring in the natural environment and includes non-carbon technologies such as solar energy, hydropower, wind, tide and waves and geothermal heat, as well as carbon-neutral technologies such as biomass.",1319188,41,82.713837,renewable energy
2,925,24,"Renewable energy is energy that comes from any naturally occurring and replenishable source such as biomass, solar, wind, tidal, wave, flowing water (hydropower) and geothermal heat.",1060794,26,82.621262,renewable energy
3,445,3,"renewable energy means energy generated from natural non-depleting resources including solar energy, wind energy, biomass energy, biological waste energy, hydro energy, geothermal energy and ocean and tidal energy.",548884,28,82.470673,renewable energy
4,1327,3,"renewable energy means energy generated from natural non-depleting resources including solar energy, wind energy, biomass energy, biological waste energy, hydro energy, geothermal energy and ocean and tidal energy.",1422428,28,82.470673,renewable energy
5,1512,8,Renewable energy is a clean energy source that can be replenished naturally and used to produce electricity with minimal or nil greenhouse gas emissions.,1597330,24,82.441666,renewable energy
6,226,16,"Renewable energy refers to energy that comes from resources such as water, wind, sunlight, geothermal heat, tides, waves, biological fuels and other resources that can generate renewable energy.",256700,28,82.333565,renewable energy
7,1337,244,Renewable energy: Energy derived from resources that are regenerative or that cannot be depleted.,1441694,14,82.194717,renewable energy
8,126,177,"Renewable Energy (RE) is any energy source that is naturally replenished, that is, from a source that is not depleted when used, such as solar, wind, geothermal, biomass or hydroelectric generation.",134087,31,82.157097,renewable energy
9,231,283,"RENEWABLE ENERGY.The term renewable energy means energy derived from resources that are regenerative or that cannot be depleted, including solar, wind, ethanol, and biodiesel fuels. )",285809,26,82.132683,renewable energy


In [56]:
# The same row slice from the index reduced to 576 dimensions with PCA, for comparison.
pca_experiment_results[576].iloc[30:49]

Unnamed: 0,policy_id,page_id,text,text_id,text_no_words,similarity,query
0,659,215,Renewable Energy Solar PV,795161,4,17.60936,renewable energy
1,912,54,Renewable energy heating and cooking,1039910,5,17.422514,renewable energy
2,296,30,What is Renewable Energy?,389307,4,17.289688,renewable energy
3,295,30,What is Renewable Energy?,387815,4,17.289688,renewable energy
4,1512,8,What is renewable energy?,1597332,4,17.289688,renewable energy
5,593,1,Renewable Energy shall mean the Energy deriving from Renewable Energy Sources.,701955,11,17.1064,renewable energy
6,878,9,renewable energy means electricity generated or produced from renewable resources.,1008526,10,16.751535,renewable energy
7,659,12,RES-H Renewable energy heat,792689,4,16.705727,renewable energy
8,1337,244,Renewable energy: Energy derived from resources that are regenerative or that cannot be depleted.,1441694,14,16.654448,renewable energy
9,845,154,Renewable energy: Power for a sustainable future.,981722,7,16.560579,renewable energy


In [59]:
def measure_search_results_diff(orig_df, compressed_df):
    """Count, per query, the compressed-index results that the original index did not return.

    Args:
        orig_df: results of the original index, with 'query' and 'text_id' columns.
        compressed_df: results of the compressed index, with the same two columns.

    Returns:
        List with one integer per distinct query in `orig_df` (in groupby order): the
        number of distinct text ids present in the compressed results for that query
        but absent from the original results.
    """
    def _count_new_ids(query, orig_group):
        baseline_ids = set(orig_group['text_id'].tolist())
        same_query = compressed_df['query'] == query
        compressed_ids = set(compressed_df.loc[same_query, 'text_id'].tolist())
        return len(compressed_ids - baseline_ids)

    return [_count_new_ids(query, group) for query, group in orig_df.groupby('query')]
        
        
# Mean, over queries, of the number of results from the 384-dimension PCA index
# that the unreduced index did not return for the same query.
orig_df = pd.concat(search_results)
np.mean(measure_search_results_diff(orig_df, pca_experiment_results[384]))
        

13.555555555555555