# A Tiny Project Meant to Get Familiar with HuggingFace

- Have a local .env file created, with `CMS_URL` and `ADMIN_APIKEY` defined inside.
- Data are blog data are stored as `.data/blogs.jsonl`, along with embeddings.

In [1]:
from dotenv import load_dotenv
import os
from typing import Any, Iterable

load_dotenv(".env.local")
CMS_URL = os.environ["CMS_URL"]
API_KEY = os.environ["ADMIN_APIKEY"]

print(f"will fetch contents from {CMS_URL}")

will fetch contents from https://zane-n-kiyo-admin.kiyo-n-zane.com


### Fetch data from remote to local

In [2]:
import json
import requests

def graphql_fetch(query: str, variables: dict[str, Any] = {}) -> dict:
    graphql_url = f"{CMS_URL}/api/graphql"

    response = requests.post(
        graphql_url,
        data=json.dumps({
            "query": query,
            "variables": variables
        }),
        headers={
            'Content-Type': 'application/json',
            "Accept": 'application/json',
            "Authorization": f"users API-Key {API_KEY}",
        }
    )

    response_obj = json.loads(response.content)["data"]
    return response_obj

In [3]:
from pathlib import Path

get_all_blogs_gql = """
query {
    ZaneDevBlogs {
        docs {
            link
        }
    }
}"""

get_blog_detail_gql = """
query ZaneDevBlogByLink($link: String!) {
    ZaneDevBlogs (
        where: {
            link: {
                equals: $link
            }
        }
    ) {
    docs {
        title
        tags
        link
        createdDate
        description
        content
    }
  }
}
"""

DATA_DIR = Path(".data_dir")
BLOG_DATA_PATH = DATA_DIR.joinpath(".blogs.jsonl")


def fetch_blogs_content_to_local() -> None:
    blog_links: list[str] = [doc["link"] for doc in graphql_fetch(get_all_blogs_gql)["ZaneDevBlogs"]["docs"]]
    print(blog_links)

    if not os.path.exists(DATA_DIR):
        os.mkdir(DATA_DIR)
    if os.path.exists(BLOG_DATA_PATH):
        os.remove(BLOG_DATA_PATH)

    with open(BLOG_DATA_PATH, 'a') as j_file:
        for link in blog_links:
            j_file.write(
                json.dumps(
                    graphql_fetch(get_blog_detail_gql, {"link": link})["ZaneDevBlogs"]["docs"][0]
                )
            )
            j_file.write("\n")

def read_blogs_from_local() -> Iterable[dict]:
    with open(BLOG_DATA_PATH, 'r') as j_file:
        while line := j_file.readline():
            yield json.loads(line)


Execute following only when data is not local available.

In [None]:
# fetch_blogs_content_to_local

['back_to_css_after_everything', '5_wtf_moments_in_python', 'decorator_design_pattern', 'cross_component_styling_with_react_compound_pattern', 'from_monolithic_to_react_compound_pattern', 'solutions_to_bring_asynchronism_into_pytest']


### Generate Embeddings for each blog

Embeddings are stored in a seperate json file, just for ease of this tiny project.

In [43]:
import transformers
import torch 

# checkpoint = "distilbert-base-uncased"
# model: transformers.DistilBertModel = transformers.DistilBertModel.from_pretrained(checkpoint)
# tokenizer: transformers.DistilBertTokenizer = transformers.DistilBertTokenizer.from_pretrained(checkpoint)
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
model: transformers.AutoModel = transformers.AutoModel.from_pretrained(checkpoint)
tokenizer: transformers.AutoTokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)


def tokenize_text_with_chunks(text: str, max_chunk_size: int=500, stride: int=50):
    all_tokens: torch.Tensor = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")[0]

    start_idx = 0
    while start_idx < len(all_tokens):
        end_idx = start_idx + max_chunk_size
        chunk = all_tokens[start_idx: end_idx]
        yield chunk.reshape(1, len(chunk))
        start_idx = end_idx - stride

def generate_embeddings(text: str) -> torch.Tensor:
    chunk_embeddings = [
        model(input_ids=ids).last_hidden_state.mean(dim=1, keepdim=True) 
        for ids in tokenize_text_with_chunks(text)
    ]
    embedding = torch.cat(chunk_embeddings, dim=1).mean(dim=1)

    return embedding.reshape(-1)



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [44]:
BLOG_EMBEDDING_PATH = DATA_DIR.joinpath(".embeddings.jsonl")

def write_embeddings_for_blogs_to_local():
    if os.path.exists(BLOG_EMBEDDING_PATH):
        os.remove(BLOG_EMBEDDING_PATH)

    with open(BLOG_EMBEDDING_PATH, "a") as j_file:
        for blog in read_blogs_from_local():
            embedding = generate_embeddings(blog["content"])
            j_file.write(
                json.dumps(
                    {"link": blog["link"], "embeddings": embedding.tolist()}
                )
            )
            j_file.write("\n")

def read_embeddings_from_local():
    with open(BLOG_EMBEDDING_PATH, 'r') as j_file:
        while line := j_file.readline():
            yield json.loads(line)

Execute following only when embeddings is not local available.

In [45]:
# write_embeddings_for_blogs_to_local()

Token indices sequence length is longer than the specified maximum sequence length for this model (2379 > 512). Running this sequence through the model will result in indexing errors


### Get a string and find a blog most relevant to!

Yes I am excited!

In [29]:
# GPT give me this, I should cycle back to the math.
def cosine_similarity_torch(a, b):
    a = torch.nn.functional.normalize(a, p=2, dim=-1)
    b = torch.nn.functional.normalize(b, p=2, dim=-1)
    return torch.matmul(a, b.T)

In [46]:
def find_closest_blog(text: str) -> tuple[str, float]:
    target_emd = generate_embeddings(text)
    largest_cosine = 0
    blog_link = ""

    for blog in read_embeddings_from_local():
        blog_emd = torch.Tensor(blog["embeddings"])
        cosine = cosine_similarity_torch(blog_emd, target_emd)
        if cosine > largest_cosine:
            largest_cosine = cosine
            blog_link = blog["link"]
            
    return blog_link, largest_cosine

In [48]:
sentence = "pytest have its plugin to handle asyncio"
find_closest_blog(sentence)

('solutions_to_bring_asynchronism_into_pytest',
 tensor(0.4952, grad_fn=<DotBackward0>))