In [18]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import tqdm

In [19]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [20]:
filter_sent = "Politics"
list_of_videos = ["Who'se Really Supporting Russia","The Perfect Hillary Clinton Analogy","The Evolution of Alex Jones",\
                    "Patrick Bet David on The Breakfast Club","The Truth About The 2020 Election","Kobe Bryant's Last Great Interview"]

In [21]:
def filter_out_embed(model, user_english_query_1 : str, ground_truth_english_options : list,top_k=3):
    r"""
    Takes in a filter sentence and a list of video string titles and returns
    video titles that are less than 0.19 cosine similarity.
    
    Parameters
    ----------
    model : sentence_transformers.SentenceTransformer.SentenceTransformer
        The HuggingFace Senctence Transformer language model to perform
        encoding of the text.
    user_english_query_1 : str
        The filter sentence written in natural language to remove from the youtube
        search query.
    ground_truth_english_options : list
        The list of titles to be encoded by the `model`.
    threshold : int, default=0.19
        The threshold to filter videos by cosine similarity comparison.

    Returns
    -------
    results : list
        The list of titles left after the filter pass.
    """

    results = []
    #Compute embedding for both lists
    embedding_filter= model.encode(user_english_query_1, convert_to_tensor=True)
    for sent in ground_truth_english_options:
        embedding_uniq_vid = model.encode(sent, convert_to_tensor=True)
        result = util.pytorch_cos_sim(embedding_filter, embedding_uniq_vid)
        results.append((sent,result.item()))
    results.sort(key=lambda x: x[1])

    return results[-top_k:]

In [22]:
print(filter_out_embed(model,'hello', list_of_videos,3))

[('The Truth About The 2020 Election', 0.11621567606925964), ('The Evolution of Alex Jones', 0.12610024213790894), ("Kobe Bryant's Last Great Interview", 0.21592138707637787)]


In [23]:
path = '/Users/gabrielalon/Desktop/clonetown/youtube_llm/machine_learning/'

In [24]:
ground_truth_path = '/Users/gabrielalon/Downloads/english_to_spanish.parquet'

In [25]:
df = pd.read_parquet(ground_truth_path)
english_sentences = df['english'].tolist()
spanish_sentences = df['spanish'].tolist()

In [26]:
english_to_spanish = dict(zip(english_sentences, spanish_sentences))

In [27]:
print(english_sentences[:5])

['Go.', 'Go.', 'Go.', 'Go.', 'Hi.']


In [28]:
print(len)

<built-in function len>


In [29]:
test_english_sample = english_sentences[:1000] #109226
print(test_english_sample)
specific_ground_truth = english_sentences[109226]
test_english_sample.append(specific_ground_truth)

['Go.', 'Go.', 'Go.', 'Go.', 'Hi.', 'Hi.', 'Run!', 'Run!', 'Run!', 'Run!', 'Run.', 'Who?', 'Wow!', 'Duck!', 'Fire!', 'Fire!', 'Fire!', 'Help!', 'Help!', 'Help!', 'Hide.', 'Jump!', 'Jump.', 'Jump.', 'Stay.', 'Stop!', 'Stop!', 'Stop!', 'Wait!', 'Wait!', 'Wait.', 'Wait.', 'Begin.', 'Go on.', 'Go on.', 'Hello!', 'Hurry!', 'Hurry!', 'Hurry!', 'I hid.', 'I hid.', 'I hid.', 'I hid.', 'I ran.', 'I ran.', 'I try.', 'I won!', 'Oh no!', 'Relax.', 'Shoot!', 'Shoot!', 'Shoot!', 'Shoot!', 'Shoot!', 'Shoot!', 'Shoot!', 'Smile.', 'Sorry?', 'Attack!', 'Attack!', 'Attack!', 'Attack!', 'Attack!', 'Buy it.', 'Eat it.', 'Eat it.', 'Eat it.', 'Eat it.', 'Get up.', 'Go now.', 'Go now.', 'Go now.', 'Go now.', 'Go now.', 'Go now.', 'Go now.', 'Go now.', 'Got it!', 'Got it?', 'Got it?', 'He ran.', 'He ran.', 'Hop in.', 'Hug me.', 'I care.', 'I fell.', 'I fled.', 'I fled.', 'I fled.', 'I fled.', 'I know.', 'I know.', 'I left.', 'I lied.', 'I lost.', 'I paid.', 'I quit.', 'I quit.', 'I quit.', 'I sang.', 'I wept.

In [30]:
#print(test_english_sample)

In [38]:
user_english_query_1 = 'you need to run'
#user_english_query_1 : str, ground_truth_english_options

Time on 139k ground truth english sentences: 

In [39]:
test_case_1 = (filter_out_embed(model,user_english_query_1,test_english_sample,3))
print(test_case_1)
#def filter_out_embed(model, user_english_query_1 : str, ground_truth_english_options : list, threshold : int = 0.19):
#[('I can run.', 0.6228374242782593), ('I can run.', 0.6228374242782593), ('You run.', 0.651080846786499)]
#notice data duplication error

[('I can run.', 0.6228374242782593), ('I can run.', 0.6228374242782593), ('You run.', 0.651080846786499)]


In [37]:
print(f"User input: {user_english_query_1}")
for case in test_case_1:
    print(f"{case[0]} -- Spanish: {english_to_spanish[case[0]]}")
#Note google translate corre and it says 'runs' but a legit translation is also you run in a formal context which is a lost
#possibilitiy because google translate gives only one prediction

User input: you need to run
I can run. -- Spanish: Sé correr.
I can run. -- Spanish: Sé correr.
You run. -- Spanish: Corre.


Consider these results in the ground truth data for high similarity to "you need to run"

[('Start running.', 0.6700537204742432), ('I must run.', 0.7214442491531372), ('I have to run.', 0.7606613636016846)]

The first could be the best given the context of 'you', while the others are "interesting" from an educational perspective 
for the user

