In [1]:
import pandas as pd

import mlx.core as mx
from outlines import generate, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the tweets CSV (path is relative to the working directory) into a DataFrame.
tweets_with_topics = pd.read_csv(filepath_or_buffer="topic_modeled_tweets.csv")

In [3]:
def get_prompt(topic_df):
    """Build the LLM prompt asking for a search query that summarises a topic.

    Args:
        topic_df: DataFrame with a ``Document`` column holding the text of
            each item in the topic. Missing values are skipped and any
            non-string value is converted with ``str``.

    Returns:
        str: The prompt text: every document on its own line, followed by the
        task description, the expected JSON structure and the output rules.
    """
    # A NaN/None cell (e.g. an empty CSV field) would make str.join raise
    # TypeError, so drop missing entries and coerce the rest to text.
    documents = topic_df['Document'].dropna().astype(str)
    all_text = "\n".join(documents)

    # Backticks need no escaping inside a triple-quoted string; a backslash
    # before them is an invalid escape sequence and would end up in the prompt.
    return f"""
    You are an assistant who generates search queries for an embedding search database to help a user find content of interest.

    Below is content that has been on the user's screen recently:

    {all_text}

    Task:
        - Analyze the content above.
        - Identify the main themes and topics.
        - Generate a single, concise sentence that summarizes the user's interests.
        - Include as many relevant keywords and concepts from the content as possible.
        - The sentence should be suitable as a search query for finding similar content. 

    Return a JSON object with the following structure:
    {{
        "query": "sentence for querying the embedding search database",
        "highlights": "Quick list of highlights used to generate the query",
        "quality": "float between 0 and 1 estimating the quality of the generated query for finding content of interest"
    }}

    Rules:
    - Do not add backticks to the JSON eg ```json``` is WRONG
    - DO NOT RETURN ANYTHING BUT JSON. NO COMMENTS BELOW THE JSON.
    """

In [4]:
# JSON Schema describing the object the model must return. All three fields are
# listed under "required": without that keyword JSON Schema treats every
# property as optional, so a schema-constrained generator could legally omit a
# key (such as "quality") that later cells read.
json_schema = """
{
    "type": "object",
    "properties": {
        "query": {"type": "string"},
        "highlights": {"type": "string"},
        "quality": {"type": "number"}
    },
    "required": ["query", "highlights", "quality"]
}
"""

In [5]:
# Model identifier passed to `models.mlxlm` when the model is loaded.
LLM_MODEL_NAME = "mlx-community/Llama-3.2-3B-Instruct"

In [6]:
# Load the model named by LLM_MODEL_NAME through Outlines' mlxlm loader.
mlx_model = models.mlxlm(LLM_MODEL_NAME)
# Build a generator from that model and json_schema via `generate.json`.
generator = generate.json(mlx_model, json_schema)

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 98855.65it/s]


In [7]:
# Boolean mask selecting the rows whose Topic label is 1.
is_topic_1 = tweets_with_topics["Topic"] == 1
topic_df = tweets_with_topics.loc[is_topic_1]

In [8]:
# Build the prompt text from the documents of the selected topic.
prompt = get_prompt(topic_df)

In [9]:
# print() renders the newlines; a bare `prompt` expression would show the
# escaped repr of the string instead.
print(prompt)


    You are an assistant who generates search queries for an embedding search database to help a user find content of interest.

    Below is content that has been on the user's screen recently:

    She needs to leave his ass FAST
how tf did this man have angelina jolie acting like this (1)
how tf did this man have angelina jolie acting like this t
Taylor Swift emits 8,205 tons of CO2 in a year. An average American emits 16. This is who lectures us on the climate.
taylor was in that ugly ass blazer getting down
no way katy perry and doechii just scissored each other on stage
"I have a husband"
"I have a husband" 
You can't date her unless you are okay with waking up next to a little boyfriend, a little fem girlfriend, or an androgynous partner... depending on the day. We have a mental illness epidemic in this country.
Groom sees his best friend in wedding dress
She doesn't like nice guys
Mila Kunis has been cast in 'KNIVES OUT 3' (Source: Deadline)
When did you realize that your part

In [10]:
# Run the generator built with `generate.json` on the prompt.
response = generator(prompt)

In [13]:
# The model's own 0-to-1 estimate of the query's quality, as requested in the prompt.
response['quality']

0.82106268

In [13]:
# Number of non-null documents per Topic label.
tweets_with_topics.groupby(["Topic"])["Document"].count()

Topic
-1     1205
 0      429
 1      103
 2       85
 3       59
 4       56
 5       56
 6       52
 7       50
 8       45
 9       39
 10      34
 11      34
 12      34
 13      31
 14      29
 15      27
 16      26
 17      25
 18      24
 19      24
 20      23
 21      23
 22      21
 23      21
 24      20
 25      19
 26      17
 27      17
 28      17
 29      16
 30      16
 31      16
 32      15
 33      14
 34      14
 35      14
 36      14
 37      13
 38      13
 39      13
 40      13
 41      12
 42      12
 43      12
 44      11
 45      10
 46      10
 47      10
Name: Document, dtype: int64