In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [None]:
# Read the dev-split CSV; a later cell scores its 'predicted_transcription' column.
df = pd.read_csv(filepath_or_buffer='../asr-train/cv-valid-dev_with_predicted_transcriptions.csv')

In [None]:
# Load the tokenizer and the model from the same checkpoint identifier so the
# two always match.
instructor_checkpoint = "hkunlp/instructor-large"
tokenizer = AutoTokenizer.from_pretrained(instructor_checkpoint)
model = AutoModel.from_pretrained(instructor_checkpoint)

In [None]:
# Function to extract words/phrases from a sentence
def extract_phrases(sentence, ngram_range=(1, 2)):
    """Return the distinct word n-grams found in ``sentence``.

    Args:
        sentence: Text to split into phrases. Anything that is not a
            non-blank string (e.g. ``None`` or the NaN pandas produces for an
            empty CSV cell) yields no phrases instead of raising.
        ngram_range: ``(min_n, max_n)`` passed straight to ``CountVectorizer``;
            the default extracts unigrams and bigrams.

    Returns:
        A NumPy array of phrase strings as produced by
        ``CountVectorizer.get_feature_names_out()``; empty when the sentence
        contains no usable tokens.
    """
    no_phrases = np.array([], dtype=object)

    # Guard before touching the vectorizer: it expects a string and would fail
    # on NaN/None, and a blank string can never produce a vocabulary.
    if not isinstance(sentence, str) or not sentence.strip():
        return no_phrases

    # CountVectorizer to extract 1-gram and 2-gram phrases (by default)
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    try:
        # Only the learned vocabulary is needed, so the count matrix is not kept.
        vectorizer.fit([sentence])
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary; ..." when no token survives
        # tokenization (e.g. the sentence only has single-character words).
        # Treat that as "no phrases"; re-raise any other ValueError untouched.
        if "empty vocabulary" not in str(exc):
            raise
        return no_phrases

    return vectorizer.get_feature_names_out()


# Helper: average token embeddings while ignoring padding positions
def _masked_mean_pool(hidden_states, attention_mask):
    """Mean-pool ``hidden_states`` over the sequence axis using ``attention_mask``.

    Positions where the mask is 0 (padding) contribute nothing to the average,
    so an item's embedding does not depend on how much padding its batch added.
    """
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    # clamp avoids a division by zero if a row were entirely padding
    token_counts = weights.sum(dim=1).clamp(min=1.0)
    return (hidden_states * weights).sum(dim=1) / token_counts


# Function to compute similarity between a word/phrase and hot words
def compute_similarity(phrase, hot_words, threshold=0.7):
    """Tell whether ``phrase`` is semantically close to any of ``hot_words``.

    Both sides are embedded with the notebook-level ``tokenizer`` and
    ``model.encoder`` (mean-pooled last hidden state) and compared with cosine
    similarity.

    Args:
        phrase: The word or phrase to test.
        hot_words: A hot word string or a sequence of hot word strings.
        threshold: Similarity above which the phrase counts as a match.
            Defaults to 0.7, the value previously hard-coded here.

    Returns:
        ``True`` if the highest cosine similarity exceeds ``threshold``,
        ``False`` otherwise (including when ``hot_words`` is empty).
    """
    if isinstance(hot_words, str):
        hot_words = [hot_words]
    hot_words = list(hot_words)
    if not hot_words:
        # Nothing to compare against; np.max on an empty array would raise.
        return False

    # Follow the model's own placement instead of relying on a global `device`
    # that is only defined in a later cell.
    target_device = model.device

    # Encode the phrase and hot words using the tokenizer and model
    phrase_tokens = tokenizer([phrase], return_tensors="pt", padding=True, truncation=True).to(target_device)
    hotword_tokens = tokenizer(hot_words, return_tensors="pt", padding=True, truncation=True).to(target_device)

    # embed the phrase and hot words using the model
    with torch.no_grad():
        phrase_hidden = model.encoder(**phrase_tokens).last_hidden_state
        hotword_hidden = model.encoder(**hotword_tokens).last_hidden_state

    # Hot words of different lengths are padded to a common length; a plain
    # mean(dim=1) would average those padding vectors into the embedding.
    phrase_embedding = _masked_mean_pool(phrase_hidden, phrase_tokens["attention_mask"]).cpu().numpy()
    hotword_embeddings = _masked_mean_pool(hotword_hidden, hotword_tokens["attention_mask"]).cpu().numpy()

    # Compute cosine similarity between the phrase and each hot word
    similarities = cosine_similarity(phrase_embedding, hotword_embeddings)

    # Return True if the maximum similarity is above the similarity threshold
    return bool(np.max(similarities) > threshold)


# Flag a sentence when any of its phrases resembles a hot word
def check_similarity_in_sentence(sentence, hot_words):
    """Return True if at least one phrase of ``sentence`` matches ``hot_words``.

    The sentence is split with ``extract_phrases`` and every resulting phrase
    is tested with ``compute_similarity``; the result is False when no phrase
    matches.
    """
    candidate_phrases = extract_phrases(sentence)

    # any() stops at the first matching phrase, so later phrases are not embedded.
    return any(compute_similarity(candidate, hot_words) for candidate in candidate_phrases)

In [None]:
# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: ", device)
model = model.to(device)

hotwords = ['be careful', 'destroy', 'stranger']

# Validate the destination BEFORE the expensive per-row model pass: to_csv does
# not create missing directories, so a bad path (e.g. Drive not mounted) would
# otherwise only surface after all the work is done and discard the results.
output_file_path = '/content/drive/My Drive/cv-valid-dev-with-similarity.csv'  # Output file name
output_dir = os.path.dirname(output_file_path)
if output_dir and not os.path.isdir(output_dir):
    raise FileNotFoundError(
        f"Output directory does not exist: {output_dir!r}. "
        "Create it (or mount the drive) before running the similarity pass."
    )

# Registers DataFrame/Series.progress_apply so the long pass shows a progress bar.
tqdm.pandas()

# Apply the similarity check for each sentence in the DataFrame
df['similarity'] = df['predicted_transcription'].progress_apply(lambda x: check_similarity_in_sentence(x, hotwords))

# Save the updated DataFrame
df.to_csv(output_file_path, index=False)

print(f"Updated dataset saved to {output_file_path}")
