In [3]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1

In [4]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import pickle

# Apply seaborn's default plotting theme to figures drawn later in the notebook.
sns.set_theme()
# Register tqdm's pandas integration (progress_apply).
# NOTE(review): no progress_apply call is visible in this section — confirm it is still needed.
tqdm.pandas()

# Modular functions for the end-to-end combined pipeline (embed each query, measure the distance to its closest cluster, threshold into in-scope / oos):

In [9]:
# Location of the pickled clustering model, relative to the notebook's working directory.
MODEL_PATH = "../../models/combined_approach/kmeans.pkl"
# Distance cut-off: a query whose closest-cluster distance is strictly greater
# than this value is labelled "oos" by predict_oos.
THRESHOLD = 0.95

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load model files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the bare open() call previously left it open).
with open(MODEL_PATH, "rb") as model_file:
    kmeans = pickle.load(model_file)  # presumably a fitted sklearn KMeans — confirm

# Sentence encoder used to embed the queries (downloaded on first use).
mpnet = SentenceTransformer("all-mpnet-base-v2")

def compute_distance_of_closest_cluster(df, embeddings=None):
    """Store each query's distance to its closest cluster in ``df["min_distances"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text input in the ``query`` column. It is modified
        in place: a ``min_distances`` column is added (or overwritten).
    embeddings : array-like, optional
        Precomputed embeddings for ``df["query"]``, one row per query in the
        same order as ``df``. When omitted, the queries are encoded with the
        module-level ``mpnet`` model, exactly as before this parameter existed.

    Returns
    -------
    None
        The result is written into ``df``.
    """
    if embeddings is None:
        embeddings = mpnet.encode(df["query"].values)
    # One row per query, one column per cluster (assuming ``kmeans`` is a fitted
    # scikit-learn KMeans); the row-wise minimum is the closest-cluster distance.
    cluster_distances = kmeans.transform(embeddings)
    df["min_distances"] = cluster_distances.min(axis=1)


def predict_oos(df):
    """Label every query in ``df`` as ``"oos"`` or ``"in-scope"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text input in the ``query`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (mutated in place) with three added columns:
        ``mpnet_embeddings_sentence`` (one embedding vector per row),
        ``min_distances`` (distance to the closest cluster) and ``pred``
        (``"oos"`` when ``min_distances`` is strictly greater than the
        module-level ``THRESHOLD``, otherwise ``"in-scope"``).
    """
    # Encode once and reuse the matrix below; encoding is the expensive step
    # and was previously repeated inside compute_distance_of_closest_cluster.
    embedding_matrix = mpnet.encode(df["query"].values)

    # Store one embedding vector per row.
    df["mpnet_embeddings_sentence"] = list(embedding_matrix)

    compute_distance_of_closest_cluster(df, embeddings=embedding_matrix)

    is_oos = df["min_distances"] > THRESHOLD
    df["pred"] = is_oos.map({True: "oos", False: "in-scope"})
    return df

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [10]:
# Load the test split; later cells read its "query" column (predict_oos) and
# its "label" column (evaluation).
df_test = pd.read_csv("../../data/original_dataset/original_test_data.csv")

In [11]:
# predict_oos mutates the frame in place and returns it, so this rebinds the
# same object, now carrying "mpnet_embeddings_sentence", "min_distances" and "pred".
df_test = predict_oos(df_test)

In [12]:
# Collapse the gold labels to the binary task: "oos" stays "oos", every other
# label becomes "in-scope".
df_test["binary_label"] = df_test["label"].eq("oos").map({True: "oos", False: "in-scope"})

# Per-class precision / recall / F1 as a table, one row per class or average.
pd.DataFrame(
    classification_report(
        y_true=df_test["binary_label"],
        y_pred=df_test["pred"],
        output_dict=True,
    )
).transpose()

Unnamed: 0,precision,recall,f1-score,support
in-scope,0.950548,0.982444,0.966233,4500.0
oos,0.906949,0.77,0.832883,1000.0
accuracy,0.943818,0.943818,0.943818,0.943818
macro avg,0.928749,0.876222,0.899558,5500.0
weighted avg,0.942621,0.943818,0.941988,5500.0
