In [1]:
import importlib
from pathlib import Path

import pandas as pd
from cuery import Field, Prompt, ResponseModel, Task, cost, pprint, utils

# Shared "Solutions" Google Drive folder, with "~" expanded to the current user's home.
GDRIVE = Path("~/Google Drive/Shared drives/Solutions").expanduser()

# Re-import the cuery helper modules so that edits to their source files are picked up
# without restarting the kernel. The second call is the cell's last expression, so the
# reloaded `utils` module object is what the cell displays.
importlib.reload(cost)
importlib.reload(utils)

<module 'cuery.utils' from '/Users/thomas/code/cuery/src/cuery/utils.py'>

In [17]:
# System message: task description, three few-shot examples and step-by-step instructions
# for extracting generalizable top-level topics from a single document.
system = """
You will receive a tweet text from a larger dataset of tweets about electric vehicles, and a list of top-level topics in markdown format.
Your task is to identify new generalizable topics within the document that can act as top-level topics in the hierarchy.
If any topic mentioned is similar enough to an existing topic (paraphrasing it), return the existing one instead.
I.e. avoid duplicating topics with similar meanings, but different phrasing.

# Examples

## Example 1 (new topics, returning "Safety issues" and "Pricing"):
### Existing topics
- Charging infrastructure
### Document
I don't want my car to explode on contact, the cars are expensive enough hahaha
### Your response
[Safety issues, Pricing]

## Example 2 (no identifiable topic, returning an empty list):
### Existing topics
- Charging infrastructure
- Safety issues
### Document
You don't say. D'uh!
### Your response
[]

## Example 3 (different dataset context, returning an existing topic):
### Existing topics
- For sale
### Document
A friend of mine would like to sell his 850 MB SCSI drive for $800 + S/H.It is a full-height drive,
and has been used for about one and a half years.If anyone is interested, please e-mail me.
### Your response
[For sale]


# Instructions
Step 1: Determine topics mentioned in the document.
- The topic labels must be as GENERALIZABLE as possible. They must NOT be document-specific.
- The topics must reflect a SINGLE topic instead of a combination of topics.
- The topics must be broad enough to accommodate future subtopics.

Step 2: Perform ONE of the following operations:
- If there are already very similar, duplicate or relevant topics in the hierarchy, output those topics and stop here.
- If the document contains no topic, return an empty list ([]).
- Otherwise, output the new topic(s) as a list.
"""


# User message template. `{{text}}` is the document placeholder (listed in `required`
# below). The "### Existing topics" and "### Document" headings are load-bearing:
# `split_prompt_topics` splits the message on exactly these strings, so they must not be
# reworded. The single bullet is the seed topic list.
user = """
Extract new topics from the text below if they are not already amongst existing topics.
### Existing topics
- Charging infrastructure
### Document
{{text}}
### Your response

"""

# Two-message prompt: system instructions first, user template second. `update_prompt`
# relies on this order because it edits `messages[1]`.
prompt = Prompt(
    messages=[
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ],
    required=["text"],
)

pprint(prompt)

In [18]:
# Structured response schema for the topic-extraction task: the model's answer for one
# document is a list of topic labels. `update_prompt` reads this `names` attribute.
class Topics(ResponseModel):
    # Required field (`...` means no default); an empty list is a valid value and
    # signals "no topic found", as spelled out in the field description.
    names: list[str] = Field(
        ..., description="(Possibly empty) list of top-level, generalizable topics."
    )


# Bundle the prompt, the `Topics` response schema and the model name into a cuery Task,
# then pretty-print it for inspection.
topics = Task(
    prompt=prompt,
    response=Topics,
    model="gpt-4.1-mini",
)
pprint(topics)

In [19]:
def split_prompt_topics(text: str) -> tuple[list, str, str]:
    """Split a prompt message around its "Existing topics" bullet list.

    The message is expected to contain a "### Existing topics" heading followed by one
    "- topic" bullet per line and then a "### Document" heading.

    Args:
        text: Full message text containing both headings, in that order.

    Returns:
        A tuple ``(topics, pre, post)`` where ``topics`` is the list of bullet labels
        (without the leading "- "), ``pre`` is everything up to and including the
        "### Existing topics" heading plus a newline, and ``post`` is a newline plus
        everything from the "### Document" heading onwards. Joining
        ``pre + "\\n".join(f"- {t}" for t in topics) + post`` rebuilds the message.

    Raises:
        ValueError: If either heading is missing from ``text``.
    """
    pre, marker, rest = text.partition("### Existing topics")
    if not marker:
        raise ValueError('prompt text has no "### Existing topics" section')
    block, marker, post = rest.partition("### Document")
    if not marker:
        raise ValueError('prompt text has no "### Document" section after the topic list')

    topics = []
    for line in block.split("\n"):
        entry = line.strip()
        # Remove only the leading bullet marker. A blanket replace("- ", "") would also
        # eat "- " inside a label (e.g. "Cost - benefit analysis") and corrupt it a bit
        # more on every round trip through the prompt.
        if entry == "-" or entry.startswith("- "):
            entry = entry[1:].strip()
        if entry:
            topics.append(entry)

    return topics, pre + "### Existing topics\n", "\n### Document" + post


def update_prompt(response: ResponseModel, prompt: Prompt, context: dict | None = None) -> None:
    """Merge the topics of a response into the prompt's "Existing topics" list.

    Reads ``response.names``; when it is non-empty, the bullet list inside the content of
    ``prompt.messages[1]`` is replaced by the sorted union of the topics already listed
    there and the new ones. The prompt is modified in place and nothing is returned.
    ``context`` is accepted but not used.
    """
    additions = response.names
    if not additions:
        # Nothing new was reported, so the prompt stays untouched.
        return

    known, head, tail = split_prompt_topics(prompt.messages[1].content)
    merged = sorted({*known, *additions})
    bullets = "\n".join(f"- {name}" for name in merged)
    prompt.messages[1].content = head + bullets + tail


# Electric vehicles

In [2]:
# Local directory with the raw text datasets (machine-specific absolute path).
DATA_DIR = Path("/Users/thomas/data/text")

# Load the electric-vehicle tweets export and show it as the cell's output.
# NOTE(review): the saved cell output echoes this read_csv line, which looks like pandas
# emitted a warning here — confirm, and set explicit dtypes if so.
veh = pd.read_csv(DATA_DIR.joinpath("Electric Vehicles Tweets.csv"))
veh

  veh = pd.read_csv(DATA_DIR / "Electric Vehicles Tweets.csv")


Unnamed: 0,createdAt,text,url,viewCount,retweetCount,replyCount,likeCount,quoteCount,author.type,author.userName,...,extendedEntities,expanded_url_0,author.profile_bio.entities.url.urls,author.profile_bio.entities.description.user_mentions,author.profile_bio.entities.description.hashtags,expanded_url_1,expanded_url_2,author.profile_bio.entities.description.urls,expanded_url_3,author.profile_bio.entities.description.symbols
0,Mon Apr 08 23:43:31 +0000 2024,🇨🇳 La fábrica de coches eléctricos de Xiaomi e...,https://x.com/rosa_9900/status/177748243528389...,23,0.0,0,0,0,user,rosa_9900,...,[object Object],https://twitter.com/rosa_9900/status/177748243...,,,,,,,,
1,Mon Apr 08 23:38:10 +0000 2024,@McapitalC Mira con tanta energía que no se pu...,https://x.com/ioniqelectrico/status/1777481089...,20,0.0,1,0,0,user,ioniqelectrico,...,[object Object],,"[{""display_url"":""ts.la/ivanno40347"",""expanded_...",,,,,,,
2,Mon Apr 08 23:15:38 +0000 2024,"@HdLG94 @Davidmartin341 ""Además eliminar mucho...",https://x.com/casiopeaexpres/status/1777475419...,42,0.0,1,1,0,user,casiopeaexpres,...,[object Object],,,,,,,,,
3,Mon Apr 08 23:15:09 +0000 2024,@Ma_WuKong Estos drones son la verdadera revol...,https://x.com/PedroJobRom/status/1777475297299...,188,0.0,1,1,0,user,PedroJobRom,...,[object Object],,,,,,,,,
4,Mon Apr 08 23:12:10 +0000 2024,🇨🇳Buenísimo! La fábrica de coches eléctricos d...,https://x.com/david_ordaz/status/1777474547421...,60,0.0,0,0,0,user,david_ordaz,...,[object Object],https://twitter.com/david_ordaz/status/1777474...,"[{""display_url"":""davidordaznoticias.wordpress....","[{""id_str"":""0"",""indices"":[47,50],""name"":"""",""sc...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22651,Mon Apr 07 21:34:11 +0000 2025,Si estáis pensando adquirir productos tecnológ...,https://x.com/Nico_Lalin/status/19093590706219...,26,0.0,0,0,0,user,Nico_Lalin,...,[object Object],,,,,,,,,
22652,Mon Apr 07 21:26:53 +0000 2025,@cmrda_torres Un minero con pin 2030 es como u...,https://x.com/P84508149/status/190935723446468...,7,0.0,0,0,0,user,P84508149,...,[object Object],,,,,,,,,
22653,Mon Apr 07 21:25:51 +0000 2025,Los coches eléctricos son bombas inestables ba...,https://x.com/AbelDomiR/status/190935697534477...,22,0.0,0,1,0,user,AbelDomiR,...,[object Object],,,,"[{""indices"":[72,83],""text"":""malahostia""}]",,,,,
22654,Mon Apr 07 21:25:27 +0000 2025,@Mara74063949221 @mencabrono noruega es el que...,https://x.com/Bok3Ron84/status/190935687547005...,6,0.0,1,0,0,user,Bok3Ron84,...,[object Object],,,,,,,"[{""display_url"":""youtube.com/Bok3RonVR"",""expan...",,


In [23]:
res = await topics.iter(veh[:100], callback=update_prompt)
res.to_pandas()

Iterating context:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,text,names
0,🇨🇳 La fábrica de coches eléctricos de Xiaomi e...,Factory automation
1,@McapitalC Mira con tanta energía que no se pu...,Energy consumption awareness
2,"@HdLG94 @Davidmartin341 ""Además eliminar mucho...",Urban infrastructure redesign
3,@Ma_WuKong Estos drones son la verdadera revol...,Drones as transportation
4,@Ma_WuKong Estos drones son la verdadera revol...,Road infrastructure removal
...,...,...
141,Enfocados en un futuro más limpio y sostenible...,Corporate electric vehicle fleets
142,@patrims Que si quiero tirar el dinero en gili...,Consumer purchasing behavior
143,@patrims Que si quiero tirar el dinero en gili...,Financial attitudes towards electric vehicles
144,Acabo de salir a correr y me he cruzado con es...,Electric vehicle brand recognition


In [24]:
# Alphabetically sorted distinct values of the `names` column in the result table.
sorted(res.to_pandas()["names"].unique())

['Advertising representation in electric vehicle marketing',
 'Affordable clean technologies',
 'Air pollution and public health',
 'Automaker production strategy',
 'Automated manufacturing',
 'Automated passenger transport',
 'Battery costs',
 'Battery degradation',
 'Brand competition',
 'Charging power limitations',
 'Charging station congestion',
 'Charging station queueing',
 'Charging time and habits',
 "China's role in electric vehicle development",
 'Clean energy',
 'Consumer perceptions',
 'Consumer purchasing behavior',
 'Consumer skepticism',
 'Corporate electric vehicle fleets',
 'Corporate investment',
 'Corporate partnerships',
 'Corporate sustainability initiatives',
 'Drones as transportation',
 'Economic self-sabotage',
 'Electric autonomous vehicles',
 "Electric utility companies' role",
 'Electric vehicle adoption rates',
 'Electric vehicle affordability',
 'Electric vehicle bans',
 'Electric vehicle brand recognition',
 'Electric vehicle charging challenges',
 'Ele

# All in one

In [31]:
# Pick up any edits to the cuery helper modules before using them.
importlib.reload(cost)
importlib.reload(utils)

# Build a single text from (at most) the first 1000 tweet texts.
# NOTE(review): presumably `concat_up_to` stops adding texts once the estimated cost for
# the given model would exceed `max_dollars` — verify in cuery.utils.
tweets = utils.concat_up_to(
    veh["text"][:1000],
    model="o4-mini",
    max_dollars=1,
)
print(tweets)

🇨🇳 La fábrica de coches eléctricos de Xiaomi en China.Apenas quedan ya humanos haciendo nada; todo está automatizado. https://t.co/MpySMQ6g4H
@McapitalC Mira con tanta energía que no se puede almacenar , aún así y con 15000 vehículos eléctricos sin contar bicis, patinetes , quads, motos , cuatriciclos , segwey etc , ni cortacésped. Sábes qué es lo que más consume energía en el mundo...La ignorancia. Los #VE son la solución. #V2H
@HdLG94 @Davidmartin341 "Además eliminar muchos aparcamientos y transformarlos en carriles bici y espacios peatonales." Entonces que le den al vehículo eléctrico, ¿no?
@Ma_WuKong Estos drones son la verdadera revolución en transporte, no los coches eléctricos.  Ya es hora de irnos librando de las carreteras.
🇨🇳Buenísimo! La fábrica de coches eléctricos de Xiaomi en China tiene todo automatizado. https://t.co/R6EH4b2qjY
Si crees que el coche eléctrico es cosa del futuro, debes leer esto¿Sabías que en 1884 se presentó el primer coche eléctrico estándar de producc

In [None]:
# Short description of the dataset.
# NOTE(review): presumably supplied as the `{{meta_topic}}` template variable when the
# task is run — the call is not shown in this part of the notebook.
meta_topic = "Tweets about electric vehicles"

# One-shot prompt template: all texts are passed at once via `{{texts}}` and the model is
# asked for a two-level topic hierarchy (at most 10 top-level topics, 5 subtopics each).
p = """From the list of texts below (separated by line breaks), extract a two-level nested markdown list of topics.
The top-level should not contain more than 10 topics, and each top-level topic should not contain more than 5 subtopics.
The texts come from a dataset of {{meta_topic}}, so the topics should be relevant to that domain. Make sure
top-level topics are generalizable and not too specific, so they can be used as a hierarchy for the subtopics. Make sure
also that subtopics are not redundant (no similar ones within the same top-level topic).

# Texts

{{texts}}
"""

# NOTE: this rebinds the notebook-level name `prompt`, which an earlier cell used for the
# per-tweet topic prompt. Re-running that earlier Task cell after this one would pick up
# this prompt instead.
prompt = Prompt.from_string(utils.dedent(p))
pprint(prompt)


# Structured response schema for the all-in-one task: the whole topic hierarchy comes
# back as one markdown string rather than a parsed structure.
class MarkdownList(ResponseModel):
    # Required field (`...` means no default) holding the markdown text.
    topics: str = Field(..., description="A two-level nested markdown list of topics.")


# Bundle the all-in-one prompt, the `MarkdownList` response schema and the model name
# into a cuery Task.
task = Task(
    prompt=prompt,
    response=MarkdownList,
    model="gpt-4.1-mini",
)

# Newsgroups

In [15]:
# Local directory with the raw text datasets (machine-specific absolute path).
DATA_DIR = Path("/Users/thomas/data/text")
news = pd.read_csv(DATA_DIR / "newsgroups.csv")
# Show how many rows each newsgroup label has.
display(news.label.value_counts())

# Draw a 500-row random sample with a fixed seed, so the sample (and everything derived
# from it) is identical on every Restart & Run All. Without `random_state`, each run
# drew a different subset.
sample = news.sample(n=500, random_state=42)
sample

label
rec.sport.hockey            999
soc.religion.christian      997
rec.motorcycles             996
rec.sport.baseball          994
sci.crypt                   991
sci.med                     990
rec.autos                   990
comp.windows.x              988
sci.space                   987
comp.os.ms-windows.misc     985
sci.electronics             984
comp.sys.ibm.pc.hardware    982
misc.forsale                975
comp.graphics               973
comp.sys.mac.hardware       963
talk.politics.mideast       940
talk.politics.guns          910
alt.atheism                 799
talk.politics.misc          775
talk.religion.misc          628
Name: count, dtype: int64

Unnamed: 0,text,label
9795,"When I use telix (or kermit) in WIN 3.1, or us...",comp.os.ms-windows.misc
8948,We are interested in purchasing a grayscale pr...,comp.graphics
17982,"I remember seeing it several months ago, and i...",sci.crypt
12213,Items for sale.....Nishika 3D camera\tIt takes...,misc.forsale
45,I have one complaint for the cameramen doing t...,rec.sport.hockey
...,...,...
4959,For Sale:David Clark H10-40 Aviation HeadsetEx...,misc.forsale
9540,The pricing of parts reminds me of something a...,sci.electronics
17284,When did the BATF say this? Everything I've se...,talk.politics.misc
11779,Article #61214 (61317 is last):From: arnolm2@a...,comp.sys.ibm.pc.hardware
