In [None]:
import polars as pl
import numpy as np
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the per-video table and keep only rows whose `country` column is non-null.
video_country = (
    pl.read_csv('edu-data/video_with_channelcountry.csv')
    .filter(pl.col("country").is_not_null())
)

In [None]:
# Load the JSON object stored in filtered_stem_lectures.json; its top-level
# keys are used as the anchor phrases that tags are compared against.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8 by specification).
with open('.././data/filtered_stem_lectures.json', 'r', encoding='utf-8') as file:
    keywords = json.load(file)
anchors = list(keywords.keys())

In [None]:
def encode_tags(tags, model_name='all-MiniLM-L6-v2'):
    """Embed ``tags`` with a SentenceTransformer model.

    A new ``SentenceTransformer(model_name)`` is constructed on every call.
    Encoding runs on the CPU, in batches of 128, with a progress bar shown.

    Args:
        tags: Input handed unchanged to ``model.encode`` (presumably a list
            of strings -- confirm against callers).
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        Whatever ``model.encode`` returns for ``tags``.
    """
    encoder = SentenceTransformer(model_name)
    encode_options = dict(device='cpu', batch_size=128, show_progress_bar=True)
    return encoder.encode(tags, **encode_options)

def get_stem_anchors(anchors: list):
    """Pair the anchor list with its embeddings.

    Args:
        anchors: Anchor phrases to embed via ``encode_tags`` (default model).

    Returns:
        A 2-tuple ``(anchors, anchor_vectors)`` where ``anchors`` is the
        input object itself and ``anchor_vectors`` is the result of
        ``encode_tags(anchors)``.
    """
    anchor_vectors = encode_tags(anchors)
    return anchors, anchor_vectors
    
def detect_stem_tags(tag_vectors, anchor_vectors, threshold=0.5):
    """Score each tag by its best cosine similarity to any anchor.

    Args:
        tag_vectors: Embeddings of the tags, one row per tag.
        anchor_vectors: Embeddings of the anchors, one row per anchor.
        threshold: Accepted for interface compatibility but NOT used here;
            the function returns raw scores and leaves thresholding to the
            caller.

    Returns:
        For every row of ``tag_vectors``, the maximum cosine similarity
        across all rows of ``anchor_vectors``.
    """
    # Pairwise similarity matrix: rows follow tags, columns follow anchors.
    pairwise = cosine_similarity(tag_vectors, anchor_vectors)
    # Collapse the anchor axis, keeping only the closest anchor per tag.
    return np.max(pairwise, axis=1)

def get_unique_tags(df: pl.DataFrame, tag_column: str = 'tags') -> list:
    """Collect the distinct individual tags found in a comma-separated column.

    Rows whose ``tag_column`` is null are dropped; the remaining strings are
    split on ``','``, exploded to one tag per row, stripped with
    ``str.strip_chars()`` and de-duplicated.

    Args:
        df: Frame containing the tag column.
        tag_column: Name of the column holding comma-separated tags.

    Returns:
        A Python list of the distinct stripped tags. Empty strings produced
        by the split (e.g. from consecutive commas) are not removed.
        NOTE(review): no ordering is requested from ``unique()`` -- confirm
        whether callers rely on a stable order across runs.
    """
    raw_tags = pl.col(tag_column)
    one_tag_per_row = (
        df.lazy()
        .select(raw_tags)
        .filter(raw_tags.is_not_null())
        .select(raw_tags.str.split(',').alias('split_tags'))
        .explode('split_tags')
        .select(pl.col('split_tags').str.strip_chars())
    )
    distinct = one_tag_per_row.unique().collect()
    return distinct.to_series().to_list()

def process_tags_in_batches(tags, anchors, batch_size=1000, checkpoint=True):
    """Compute, batch by batch, each tag's best cosine similarity to the anchors.

    Tags are embedded in slices of ``batch_size`` with the
    ``'all-MiniLM-L6-v2'`` SentenceTransformer on the CPU and scored with
    ``detect_stem_tags``. The same model instance is used for anchors and
    tags, so the model is only loaded once.

    Args:
        tags: Sequence of tags to score (sliced with ``tags[i:i + batch_size]``).
        anchors: Anchor phrases the tags are compared against.
        batch_size: Number of tags embedded per iteration; must be positive.
        checkpoint: When truthy, the scores accumulated so far are written to
            ``'tags_cos_sim.npy'`` after every batch (the file is overwritten
            each time).

    Returns:
        A 1-D NumPy array with one score per tag, in the order of ``tags``.
        An empty array is returned when ``tags`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Nothing to score: return early instead of failing on an unbound result
    # (and avoid loading the model for no work).
    if len(tags) == 0:
        return np.array([])

    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Encode the anchors with the model that is already loaded, using the
    # same settings as encode_tags, rather than instantiating it a second time.
    anchor_vectors = model.encode(
        anchors, device='cpu', batch_size=128, show_progress_bar=True
    )

    # One 1-D score array per processed batch; concatenated on demand.
    batch_scores = []

    for start in range(0, len(tags), batch_size):
        batch = tags[start:start + batch_size]
        batch_vectors = model.encode(batch, device='cpu', show_progress_bar=True)
        batch_scores.append(detect_stem_tags(batch_vectors, anchor_vectors))

        if checkpoint:
            # Persist everything scored so far so a crash loses at most one batch.
            np.save('tags_cos_sim.npy', np.concatenate(batch_scores))
        print(f"Processed {start + len(batch)} / {len(tags)} tags")

    return np.concatenate(batch_scores)

In [None]:
# Distinct individual tags across all videos that have a known country.
unique_tags = get_unique_tags(video_country, tag_column='tags')

In [None]:
# Score every unique tag against the anchors, 10000 tags per batch
# (checkpointing is left at its default).
scores = process_tags_in_batches(unique_tags, anchors, batch_size=10000)

In [None]:
# Boolean mask of tags whose best anchor similarity exceeds 0.5,
# then the matching tags picked out in the same order as `unique_tags`.
res = np.greater(scores, 0.5)
stem_tags = np.array(unique_tags)[res]

In [None]:
# Write the selected tags to disk as a JSON object with a single "tags" key.
tags_dict = dict(tags=list(stem_tags))
with open(".././data/stem_tags.json", 'w') as file:
    file.write(json.dumps(tags_dict))