In [None]:
# Install the Hugging Face client and tokenizer libraries into the kernel's environment.
# NOTE(review): versions are unpinned, and pandas / tqdm are imported later in this
# notebook but not installed here — presumably preinstalled on the runtime; confirm.
%pip install huggingface_hub transformers

In [None]:
from huggingface_hub import AsyncInferenceClient
from transformers import LlamaTokenizerFast

# Number of permits given to the semaphore that gates generate(), i.e. the
# maximum number of generation requests allowed in flight at the same time.
BATCH_SIZE = 1024
# Hub repository whose tokenizer (and chat template) is used to render prompts.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"

# Async client bound to a fixed inference endpoint, with a request timeout of 120.
# NOTE(review): the endpoint address is hard-coded; presumably it serves MODEL_ID,
# otherwise the chat template applied below would not match the model — confirm.
client = AsyncInferenceClient(model="http://204.12.201.57:5278", timeout=120)
# Tokenizer is used only for apply_chat_template(tokenize=False) in this notebook.
tokenizer = LlamaTokenizerFast.from_pretrained(MODEL_ID)


In [None]:
def get_prompt_1(search_string):
    """Build the first-turn user message for a single search query.

    Args:
        search_string: Query text; it is interpolated verbatim between
            double quotes (no escaping is applied).

    Returns:
        The plain question string (no chat template applied) asking what the
        query refers to and what the user's intent is.
    """
    return (
        f'Here is a recent Google search: "{search_string}". '
        "What could the user be referring to with this query? What is their intent?"
    )
def get_templated_prompt_1(search_string):
    """Render the stage-1 conversation (a single user turn) with the chat template.

    Args:
        search_string: Query text forwarded to ``get_prompt_1``.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the one-message
        conversation with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    conversation = [{"role": "user", "content": get_prompt_1(search_string)}]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Second-turn user message: lists the four search-intent categories with
# examples and asks which one applies. The pieces are joined with a single
# space, so the result is one line of text.
prompt_2 = " ".join(
    [
        "A google search can be classified in one of the following 4 categories:",
        '1. Informational: The user seeks information. Examples include "What is the capital of France?", "How to bake a chocolate cake?", or "Symptoms of the flu"',
        '2. Navigational: The user wants to navigate to a specific website or online platform. Examples are "Facebook login", "OpenAI website", or "YouTube Taylor Swift".',
        '3. Transactional: The user has the intention to buy or conduct a transaction. Searches like "Buy iPhone 13 online", "Pizza delivery near me", or "Book a flight to New York" fall into this category.',
        '4. Commercial Investigation: The user wants to compare and research products or services before a purchase. "iPhone vs. Samsung comparison", "Best DSLR cameras 2023", or "Top running shoes brands" are examples.',
        "Which one is the appropriate category in this case?",
    ]
)
def get_templated_prompt_2(search_string, intent):
    """Render the stage-2 conversation: stage-1 exchange plus the category question.

    Args:
        search_string: Query text forwarded to ``get_prompt_1``.
        intent: Text used as the assistant's reply to the first user turn
            (the stage-1 model output).

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the three-message
        conversation with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    conversation = [
        {"role": "user", "content": get_prompt_1(search_string)},
        {"role": "assistant", "content": intent},
        {"role": "user", "content": prompt_2},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
# Third-turn user message: asks the model to reduce its previous answer to the
# bare category name.
prompt_3 = "Based on the previous answer, only reply with the relevant category if any. DO NOT output any other text. ONLY output the category"


def get_templated_prompt_3(search_string, intent, category_raw):
    """Render the stage-3 conversation: both earlier exchanges plus ``prompt_3``.

    Args:
        search_string: Query text forwarded to ``get_prompt_1``.
        intent: Text used as the assistant's reply to the first user turn.
        category_raw: Text used as the assistant's reply to ``prompt_2``.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the five-message
        conversation with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    conversation = [
        {"role": "user", "content": get_prompt_1(search_string)},
        {"role": "assistant", "content": intent},
        {"role": "user", "content": prompt_2},
        {"role": "assistant", "content": category_raw},
        {"role": "user", "content": prompt_3},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
import pandas as pd


# Load the activity export (presumably a Google "My Activity" export — confirm).
# NOTE(review): the file lives under /content/drive, but the cell that mounts
# Drive appears later in this notebook; on a fresh runtime run that cell first.
df = pd.read_json('/content/drive/MyDrive/Colab Notebooks/data/MyActivity.json')

# Rows without a title cannot be turned into a prompt, and NaN titles would make
# the boolean mask below contain NaN, which pandas refuses to index with.
df = df.dropna(subset=['title'])

# Drop non-search events. The event type is the FIRST word of the title, so the
# pattern is anchored at the start: an unanchored match would also discard real
# searches that merely contain one of these words (e.g. "Searched for Used cars").
non_search_event = r'^(?:Visited|Viewed|Defined|Used)\b'
df = df[~df['title'].str.contains(non_search_event, regex=True)]

# Strip the "Searched for " prefix so only the query text remains. `.assign`
# returns a new frame rather than writing into the filtered slice, which avoids
# pandas' SettingWithCopyWarning.
df = df.assign(title=df['title'].str.replace('^Searched for ', '', regex=True))

In [None]:
# Mount Google Drive at /content/drive (only available on a Colab runtime).
# NOTE(review): the data-loading cell reads from /content/drive but appears
# BEFORE this cell, so "Restart & Run All" fails on a fresh runtime — this cell
# should be run (or moved) ahead of the load cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from tqdm.asyncio import tqdm
from asyncio import Semaphore

# Shared gate acquired by generate(): at most BATCH_SIZE calls hold a permit at once.
semaphore = Semaphore(BATCH_SIZE)

In [None]:
async def generate(prompt):
    """Send one prompt to the inference endpoint, gated by the shared semaphore.

    Args:
        prompt: Fully rendered prompt text passed to ``client.text_generation``.

    Returns:
        The value returned by ``client.text_generation``, or the sentinel string
        ``"ERROR"`` if the request raised. Failures are reported on stdout and
        never propagate, so one bad request does not abort a gathered batch;
        callers must be prepared to find ``"ERROR"`` among the results.
    """
    # `async with` takes a permit and releases it on every exit path, replacing
    # the manual acquire / try / finally / release sequence.
    async with semaphore:
        try:
            return await client.text_generation(
                prompt,
                temperature=0.1,
                top_p=0.95,
                repetition_penalty=1.2,
                top_k=50,
                max_new_tokens=512,
            )
        except Exception as e:
            # str(e) is empty for some exceptions (timeouts in particular), so
            # include the exception type to keep the log line meaningful.
            print(f"{type(e).__name__}: {e}")
            return "ERROR"

In [None]:
prompts = list(map(get_templated_prompt_1, df["title"].values))
tasks = list(map(generate, prompts))

outputs = []
for result in await tqdm.gather(*tasks, total=len(tasks), smoothing=0):
    outputs.append(result)

df["intent"] = outputs

In [None]:
prompts = list(map(get_templated_prompt_2, df["title"].values, df["intent"].values))
tasks = list(map(generate, prompts))

outputs = []
for result in await tqdm.gather(*tasks, total=len(tasks), smoothing=0):
    outputs.append(result)

df["category_raw"] = outputs

In [None]:
prompts = list(
    map(
        get_templated_prompt_3,
        df["title"].values,
        df["intent"].values,
        df["category_raw"].values,
    )
)

tasks = list(map(generate, prompts))

outputs = []
for result in await tqdm.gather(*tasks, total=len(tasks), smoothing=0):
    outputs.append(result)

df["category"] = outputs

In [None]:
# Persist the query text and the three model outputs; the DataFrame index is
# written too, as the first (unnamed) CSV column.
df[["title", "intent", "category_raw", "category"]].to_csv("seo_categories.csv")

In [None]:
# Reload the saved results. The file's first column is the index that to_csv
# wrote; reading it back as the index restores the original row labels instead
# of leaving a stray "Unnamed: 0" data column in the frame.
df = pd.read_csv("seo_categories.csv", index_col=0)

In [None]:
# Normalise the model's final answer in two steps: trim surrounding whitespace,
# then remove any run of dots at the very end of the text.
category_trimmed = df['category'].str.strip()
df['category'] = category_trimmed.str.replace(r'\.+$', '', regex=True)

In [None]:
# Show how many rows fall under each distinct cleaned category string.
df['category'].value_counts()

In [None]:
# Display the raw stage-2 answers for the rows whose cleaned category is exactly
# 'Informational' (row mask and column are selected in a single .loc lookup).
df.loc[df['category'].eq('Informational'), 'category_raw'].values