## Setup

### Install required libraries

The libraries should already be installed in the terminal after running 
```
pip install -r requirements.txt
```
in the root directory, but this will be useful if the notebook is used in other environments.

In [1]:
%%capture
!pip install InstructorEmbedding==1.0.1
!pip install scikit-learn==1.6.1
!pip install pandas==2.2.3
!pip install numpy==2.2.6
!pip install sentence-transformers==2.2.2
!pip install requests==2.32.3
!pip install transformers==4.37.2
!pip install huggingface-hub==0.25.2

### Import required libraries

In [2]:
import pandas as pd
import numpy as np
from InstructorEmbedding import INSTRUCTOR
from sklearn.metrics.pairwise import cosine_similarity
import requests
from huggingface_hub import configure_http_backend
import urllib3

pd.set_option('display.max_colwidth', 100)

  from tqdm.autonotebook import trange


### Other configurations

In [3]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# [OPTIONAL] Use if there is SSL certificate verification issues
def backend_factory() -> requests.Session:
    session = requests.Session()
    session.verify = False
    return session


configure_http_backend(backend_factory=backend_factory)

### Load the Instructor model

In [4]:
model = INSTRUCTOR('hkunlp/instructor-large')

load INSTRUCTOR_Transformer
max_seq_length  512


In [5]:
cv_csv_file = f'../asr/cv-valid-dev-with-generated-text.csv'
df = pd.read_csv(cv_csv_file)
df.head(5)

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,generated_text
0,cv-valid-dev/sample-000000.mp3,be careful with your prognostications said the stranger,1,0,,,,,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE STRANGER
1,cv-valid-dev/sample-000001.mp3,then why should they be surprised when they see one,2,0,,,,,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SEE ONE
2,cv-valid-dev/sample-000002.mp3,a young arab also loaded down with baggage entered and greeted the englishman,2,0,,,,,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENTERED AND GREETED THE ENGLISHMAN
3,cv-valid-dev/sample-000003.mp3,i thought that everything i owned would be destroyed,3,0,,,,,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED
4,cv-valid-dev/sample-000004.mp3,he moved about invisible but everyone could hear him,1,0,fourties,female,england,,HE MOVED ABOUT INVISIBLE BUT EVERY ONE COULD HEAR HIM


In [6]:
hot_words = ['BE CAREFUL', 'DESTROY', 'STRANGER']

In [7]:
instruction = "Represent the warning concept:"

In [9]:
hot_word_embeddings = model.encode([[instruction, hw] for hw in hot_words])

In [10]:
# sentences = df['generated_text'].tolist()
# sentence_embeddings = model.encode([[instruction, s] for s in sentences])

AttributeError: 'float' object has no attribute 'strip'

In [None]:
threshold = 0.45  # adjust based on desired sensitivity
similarity_flags = []

for emb in sentence_embeddings:
    sims = cosine_similarity([emb], hotword_embeddings)[0]
    is_similar = any(sim > threshold for sim in sims)
    similarity_flags.append(is_similar)

In [None]:
# Compute similarities
similarities = cosine_similarity(vectors)

# Turn into a dataframe
pd.DataFrame(similarities,
            index=sentences,
            columns=sentences) \
            .style \
            .background_gradient(axis=None)