In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import tqdm

In [None]:
# Sentence-embedding model shared by every similarity computation below.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
)

In [None]:
# Topic sentence to compare against, and a handful of sample video titles
# used to try the similarity function out by hand.
filter_sent = "Politics"
list_of_videos = [
    "Who'se Really Supporting Russia",
    "The Perfect Hillary Clinton Analogy",
    "The Evolution of Alex Jones",
    "Patrick Bet David on The Breakfast Club",
    "The Truth About The 2020 Election",
    "Kobe Bryant's Last Great Interview",
]

In [None]:
def filter_out_embed(model, user_english_query: str, ground_truth_english_options: list, top_k: int = 3) -> list:
    r"""
    Rank candidate sentences by cosine similarity to a query sentence and
    return the `top_k` most similar ones together with their scores.

    Parameters
    ----------
    model : sentence_transformers.SentenceTransformer.SentenceTransformer
        The sentence-embedding model used to encode the text. Any object
        exposing a compatible ``encode(sentences, convert_to_numpy=True)``
        method works.
    user_english_query : str
        The natural-language query every candidate is compared against.
    ground_truth_english_options : list
        The candidate sentences to encode and score.
    top_k : int, default=3
        How many of the best-scoring candidates to return. Values less than
        or equal to zero yield an empty list.

    Returns
    -------
    results : list of (str, float)
        ``(sentence, cosine_similarity)`` pairs for the `top_k` most similar
        candidates, sorted by ascending similarity (the best match is last).
        Fewer than `top_k` pairs are returned when there are fewer
        candidates; an empty candidate list returns ``[]``.
    """
    options = list(ground_truth_english_options)
    # `results[-0:]` would be the *whole* list, so a non-positive `top_k`
    # has to be handled explicitly.
    if not options or top_k <= 0:
        return []

    # Encode all candidates in one batched call; encoding them one at a time
    # is dramatically slower on large candidate lists.
    query_vec = np.asarray(model.encode(user_english_query, convert_to_numpy=True)).reshape(-1)
    option_mat = np.asarray(model.encode(options, convert_to_numpy=True)).reshape(len(options), -1)

    # Cosine similarity = dot product divided by the product of the norms.
    # Norms are clamped away from zero so an all-zero embedding scores 0
    # instead of producing NaN.
    eps = 1e-12
    query_norm = max(float(np.linalg.norm(query_vec)), eps)
    option_norms = np.maximum(np.linalg.norm(option_mat, axis=1), eps)
    scores = (option_mat @ query_vec) / (option_norms * query_norm)

    # Stable ascending sort keeps ties in input order, then take the tail.
    results = sorted(zip(options, scores.tolist()), key=lambda pair: pair[1])
    return results[-top_k:]

In [None]:
# Quick smoke test: score the sample video titles against a throwaway query.
print(
    filter_out_embed(model, 'hello', list_of_videos, 3)
)

In [None]:
# Machine-specific absolute path to the project's machine_learning directory.
path = "/Users/gabrielalon/Desktop/clonetown/youtube_llm/machine_learning/"

In [None]:
# Machine-specific absolute path to the English/Spanish parquet file.
ground_truth_path = "/Users/gabrielalon/Downloads/english_to_spanish.parquet"

In [None]:
# Read the parquet file and pull the two language columns out as plain lists.
df = pd.read_parquet(ground_truth_path)
english_sentences, spanish_sentences = (
    df[column].tolist() for column in ('english', 'spanish')
)

In [None]:
# Map each English sentence to its Spanish counterpart. When an English
# sentence occurs more than once, only the last Spanish entry paired with it
# is kept (later dict keys overwrite earlier ones).
english_to_spanish = dict(zip(english_sentences, spanish_sentences))

# Show the size and a short preview instead of printing every value, which
# floods the cell output on the full dataset.
print(len(english_to_spanish), "unique English sentences")
print(list(english_to_spanish.items())[:5])

In [None]:
# Peek at the first five English sentences.
print(english_sentences[0:5])

In [None]:
# Number of English sentences loaded. (`print(len)` only printed the builtin
# function object; the list being measured is assumed to be
# `english_sentences` — adjust if another list was intended.)
print(len(english_sentences))

In [None]:
# Work on a copy so the append below never mutates `english_sentences`, and
# so re-running this cell does not keep growing the list.
# To experiment on a subset, slice here instead, e.g. english_sentences[:1000].
test_english_sample = list(english_sentences)

# Print the size and a short preview rather than the entire list.
print(len(test_english_sample), test_english_sample[:5])

# Make sure the sentence at index 109226 is part of the sample even when a
# smaller slice is used above; skip the append if it is already present so
# it cannot show up twice in the ranking.
specific_ground_truth = english_sentences[109226]
if specific_ground_truth not in test_english_sample:
    test_english_sample.append(specific_ground_truth)

In [None]:
#print(test_english_sample)

In [None]:
# Free-text query that the ground-truth English sentences are ranked against.
user_english_query = "you need to run"

Timing on the full set of ~139k ground-truth English sentences:

In [None]:
# Score every sampled sentence against the query and print the three
# highest-scoring (sentence, similarity) pairs.
print(
    filter_out_embed(model, user_english_query, test_english_sample, 3)
)


Consider these results from the ground-truth data, which have the highest cosine similarity to the query "you need to run" (listed in ascending order of similarity):
[('Start running.', 0.6700537204742432), ('I must run.', 0.7214442491531372), ('I have to run.', 0.7606613636016846)]
The first result ('Start running.') could be the best match given the second-person context of 'you', while the other two are still "interesting" to show the user from an educational perspective.
