In [1]:
!pip install faiss-cpu torch sentence-transformers ipywidgets



In [1]:
import pandas as pd
import faiss
import torch
import gzip
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

import time

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

## 1. Import data

In [2]:
DATASET_PATH = "../../s3-buckets/cpr-datasets/policy_dataset.csv.gz"
VECTORS_PATH = "../../s3-buckets/cpr-datasets/policy_text_embeddings.pkl.gz"

df = pd.read_csv(DATASET_PATH, compression="gzip", index_col=0)
df.shape

(1666918, 4)

In [3]:
df.head()

Unnamed: 0,policy_id,policy_name,page_id,text
0,0,14th Five-Year Plan,0,Outline of the People's Republic of China 14th Five-Year Plan for National Economic Part One Embarking on the new journey to build China into a modernized socialist country in an all-round way .
1,0,14th Five-Year Plan,0,1 Article I Development environment .
2,0,14th Five-Year Plan,0,1 Article II Guiding directives .
3,0,14th Five-Year Plan,0,4 Article III Main goals .
4,0,14th Five-Year Plan,0,"The following document is China's 14th Five-Year Plan, covering the years 2021-2025, as passed by the Chinese parliament, the National People's Congress, in March 2021."


## 2. Try calculating vectors for each sentence

In [6]:
n_sentences = 50000
df_small = df.head(n_sentences)
sentences = df_small["text"].tolist()

def compute_embeddings(sentences, model: SentenceTransformer, l2_normalise=True):
    """Use l2_normalise if the model uses cosine distance, to normalise the vectors for an inner-product faiss index"""

    start = time.time()
    pool = model.start_multi_process_pool()
    emb = model.encode_multi_process(sentences, pool)
    print(f"Embeddings computed. Shape: {emb.shape}")
    model.stop_multi_process_pool(pool)
    end = time.time()
    
    if l2_normalise:
        emb = emb/np.linalg.norm(emb, ord=2, axis=1, keepdims=True)
    print(f"took {int(end-start)}s")
    
    return emb

model_symm = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
model_asymm = SentenceTransformer('msmarco-MiniLM-L-6-v3')

# we can either compute these or load them from disk - see below
# emb_symm = compute_embeddings(sentences, model_symm)
# emb_asymm = compute_embeddings(sentences, model_asymm)

In [22]:
# optionally save to disk
with open("weak sentence classification/emb_asymm.npy", "wb") as f:
    np.save(f, emb_asymm)
    
with open("weak sentence classification/emb_symm.npy", "wb") as f:
    np.save(f, emb_symm)

with open("weak sentence classification/emb_asymm_dot.npy", "wb") as f:
    np.save(f, emb_asymm_dot)    

# or load from disk
# with open("weak sentence classification/emb_asymm.npy", "rb") as f:
#     emb_asymm = np.load(f)
    
# with open("weak sentence classification/emb_symm.npy", "rb") as f:
#     emb_symm = np.load(f)


In [7]:
model_asymm_dot = SentenceTransformer('msmarco-distilbert-dot-v5')
emb_asymm_dot = compute_embeddings(sentences, model_asymm_dot, l2_normalise=False)

# reduce size of minilm matrices to match the 50,000 texts for distilbert
emb_asymm = emb_asymm[:50000,:]
emb_symm = emb_symm[:50000,:]

Embeddings computed. Shape: (50000, 768)
took 1687s


## 3. Create faiss indices

In [9]:
faiss_index_symm = faiss.IndexFlatIP(emb_symm.shape[1])
faiss_index_symm.add(emb_symm)

faiss_index_asymm = faiss.IndexFlatIP(emb_asymm.shape[1])
faiss_index_asymm.add(emb_asymm)

faiss_index_asymm_dot = faiss.IndexFlatIP(emb_asymm_dot.shape[1])
faiss_index_asymm_dot.add(emb_asymm_dot)

In [11]:
def query_index(query: str, faiss_index, model,  n_res: int = 20, normalise: bool = True):
    start = time.time()
    query_emb = model.encode([query])
    if normalise:
        query_emb = query_emb/np.linalg.norm(query_emb, ord=2, axis=1, keepdims=True)
    
    distances, idxs = faiss_index.search(query_emb, n_res)
    
    results = []
    for i in range(len(idxs[0])):
        row = df_small.iloc[idxs[0][i]].to_dict()
        row.update(
            {"similarity": distances[0][i]}
        )
        
        results.append(row)
    end = time.time()
    print(f"took {round(end-start, 1)}s")
    
    return pd.DataFrame(results)

### 3.1 Run a query across all available models

In [21]:
from IPython.display import display, Markdown

query = "electric vehicle charging"

#####################

display(Markdown(f"### query: *{query}*"))

# # asymmetric
# asymm_res = query_index(
#     query,
#     faiss_index = faiss_index_asymm,
#     model = model_asymm,
#     n_res = N,
# )

# display(Markdown("#### asymmetric model"))
# display(asymm_res)

# asymmetric w/ dot product (more flexible to passage length)
asymm_dot_res = query_index(
    query,
    faiss_index = faiss_index_asymm_dot,
    model = model_asymm_dot,
    n_res = N,
)

display(Markdown("#### asymmetric model w/ dot product"))
display(asymm_dot_res)


# symmetric
symm_res = query_index(
    query,
    faiss_index = faiss_index_symm,
    model = model_symm,
    n_res = N,
)

display(Markdown("#### symmetric model"))
display(symm_res)


### query: *electric vehicle charging*

took 0.1s


#### asymmetric model w/ dot product

Unnamed: 0,policy_id,policy_name,page_id,text,similarity
0,6,"Long-Term Climate Strategy, 2021",35,Electric vehicles are mainly used in public urban transport.,8.446144
1,31,A Healthy Environment and a Healthy Economy,40,"ELECTRIC VEHICLES SALES ARE GROWING EVERY YEAR Bloomberg New Energy Finance (BNEF) forecasts that sales of EVs will reach 10% of global passenger vehicle sales by 2025 and 28% by 2030, representing between 20-30 million units sold in 2030.",8.334498
2,6,"Long-Term Climate Strategy, 2021",34,The range of an average battery-powered electric car of over 400 kilometres will be enough for most users.,8.332932
3,6,"Long-Term Climate Strategy, 2021",21,Electric vehicles also enjoy various benefits.,8.315709
4,6,"Long-Term Climate Strategy, 2021",19,The effectiveness of this mechanism is already evident: newly registered electric cars (battery-powered electric vehicles and plug-in hybrids) reached a new record high in Switzerland in 2019 with a share of 5.6 per cent.,8.301947
5,23,Iceland’s Climate Action Plan for 2018-2030 and 2020 update,5,The government has allocated 210 million ISK in the years 20162018 to support the build-up of charging-stations for electric cars.,8.273039
6,31,A Healthy Environment and a Healthy Economy,42,VALUE CHAIN: ELECTRIC VEHICLES EXPLORATION AND EXTRACTION EV SALES and OWNERSHIP RE-USE AND RECYCLING A HEALTHY ENVIRONMENT AND A HEALTHY ECONOMY,8.219527
7,6,"Long-Term Climate Strategy, 2021",21,"The Federal Council approved the report ""Voraussetzungen fr ein Schnellladenetz fr Elektroautos auf Nationalstrassen"" (Requirements for a fast-charging network for electric vehicles on national highways) on 28 June 2017.",8.215454
8,6,"Long-Term Climate Strategy, 2021",18,"To promote electric mobility, contributions for the installation of charging stations in apartment blocks and multi-family buildings and other multi-party developments are provided for.",8.205051
9,6,"Long-Term Climate Strategy, 2021",19,The share of electric vehicles stood at 12.1 per cent in the first ten months with the target of 10 per cent which the industry association 'auto-schweiz' set itself for 2020 already being surpassed.,8.204624


took 0.0s


#### symmetric model

Unnamed: 0,policy_id,policy_name,page_id,text,similarity
0,23,Iceland’s Climate Action Plan for 2018-2030 and 2020 update,6,The plan will also consider charging stations for electric bikes.,0.577427
1,23,Iceland’s Climate Action Plan for 2018-2030 and 2020 update,5,Regulations will be reviewed to ensure that new buildings will be designed allowing for infrastructure for charging electric cars.,0.559627
2,6,"Long-Term Climate Strategy, 2021",35,Electric vehicles are mainly used in public urban transport.,0.552675
3,8,Uganda Green Growth Development Strategy 2017/2018 - 2030-2031,54,utility charges.,0.550492
4,6,"Long-Term Climate Strategy, 2021",21,This aims to increase the share of electric vehicles amongst newly registered cars to 15 per cent by 2022.,0.535633
5,6,"Long-Term Climate Strategy, 2021",21,Electric vehicles also enjoy various benefits.,0.519762
6,31,A Healthy Environment and a Healthy Economy,41,LEADING THE CHARGE IN FUEL-EFFICIENCY,0.517887
7,23,Iceland’s Climate Action Plan for 2018-2030 and 2020 update,5,The government has allocated 210 million ISK in the years 20162018 to support the build-up of charging-stations for electric cars.,0.513011
8,26,Fiji NDC Implementation Roadmap (2018-2030),31,requirements for recycling and disposal policies/plans for electric vehicle batteries.,0.500288
9,31,A Healthy Environment and a Healthy Economy,39,California and Oregon have sold more than 528000 battery electric vehicles through December 2018.,0.494649


---

In [27]:
# this doesn't work
with gzip.open(VECTORS_PATH, "rb") as f:
    vectors = torch.load(f, map_location='cpu')

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.