In [5]:
import typing
import pathlib

import rich.progress
import pandas
import cltrier_lib

In [6]:
# Relative location of the project's data directory.
DATA_PATH: str = "../../../data"
# Destination of the topic-enriched dataset; the un-enriched export path is
# derived from it later by dropping the ".enriched" marker.
EXPORT_PATH: str = DATA_PATH + "/processed/twitter.english.dataset.enriched.csv"

In [7]:
# Pair every reply with the post it answers.
# - Left join: every reply row is kept; replies whose `reply_to_id` has no
#   match in the posts file get missing values in the post columns.
# - Column names present in both files receive the "_reply" / "_post" suffix.
dataset: pandas.DataFrame = (
    pandas.read_csv(f"{DATA_PATH}/interim/twitter.english.replies.csv", index_col=0)
    .merge(
        pandas.read_csv(f"{DATA_PATH}/interim/twitter.english.posts.csv", index_col=0),
        how="left",
        left_on="reply_to_id",
        right_on="id",
        suffixes=("_reply", "_post"),
    )
    # These columns carry no suffix after the merge (presumably they exist only
    # in the posts file — TODO confirm), so tag them as post-author attributes.
    .rename(
        columns={
            "username": "author_post",
            "first_name": "author_first_name_post",
            "last_name": "author_last_name_post",
            "party": "author_party_post",
        }
    )
    # Keep only the columns used downstream, in a fixed order.
    .loc[
        :,
        [
            "id_post",
            "id_reply",
            "author_id_post",
            "author_id_reply",
            "author_first_name_post",
            "author_last_name_post",
            "author_party_post",
            "text_post",
            "text_reply",
        ],
    ]
)
dataset

Unnamed: 0,id_post,id_reply,author_id_post,author_id_reply,author_first_name_post,author_last_name_post,author_party_post,text_post,text_reply
0,1684217285567807488,1684217285567807488,131546062,1088163738950295552,Rick,Scott,Republican Party,Democrats scream every day about limiting Amer...,Funny that Hunter is being held to account yet...
1,1682471504880050176,1682471504880050176,131546062,1088163738950295552,Rick,Scott,Republican Party,If our economy is doing as well as @JoeBiden s...,You’re lying
2,1682337371126112512,1682337371126112512,131546062,1088163738950295552,Rick,Scott,Republican Party,Bottom line: our military should be focused on...,Why don’t you call Senator Tubberville and tel...
3,1681994109094080512,1681994109094080512,131546062,1088163738950295552,Rick,Scott,Republican Party,I'm fighting for amendments in the FY 24 NDAA ...,OK folks let’s get your shot glasses out and w...
4,1681424977076322304,1681424977076322304,131546062,1088163738950295552,Rick,Scott,Republican Party,Why is the White House acting like America's i...,Blah blah blah
...,...,...,...,...,...,...,...,...,...
7084,1672603841127170048,1672603841127170048,278145569,1411161121,Marsha,Blackburn,Republican Party,Imagine the consequences if a Republican presi...,I'll guarantee that if any one of yours or you...
7085,1667892656184127488,1667892656184127488,432895323,1411161121,Lindsey,Graham,Republican Party,Most Republicans believe we live in a country ...,Most Republicans threw away their beliefs when...
7086,1687486482334855168,1687486482334855168,278145569,28261183,Marsha,Blackburn,Republican Party,Why are all of these indictments coming out ag...,Because our legal system sucks like our govern...
7087,1592717086605852672,1592717086605852672,432895323,28261183,Lindsey,Graham,Republican Party,If President Trump continues this tone and del...,"Republicans AGAINST tRUMP, Voted against him o..."


In [4]:
# Store the merged, not-yet-enriched dataset under the export path with the
# ".enriched" marker removed, keeping it separate from the enriched export.
dataset.to_csv(path_or_buf=EXPORT_PATH.replace(".enriched", ""))

In [8]:
# System prompt for topic extraction. The trailing spaces and newlines are part
# of the prompt text and are kept exactly as in the original wording.
topic_extraction_instruction: str = (
    "Your task is to extract the main topics of the given tweet. "
    "Summarize topics exceeding 10 characters. "
    "Keep the total number of topics to 3 or fewer. \n"
    "\n"
    "Respond only with the topic names separated by commas. "
    "Omit any justification. This is the tweet: \n"
)

In [9]:
# Enrich the dataset with model-extracted topics, adding one new column per
# source text column. Each entry below is:
#   (name of the column to create, column holding the tweet text, system prompt)
for new_col, source_col, instruction in [
    ("topics_post", "text_post", topic_extraction_instruction),
    ("topics_reply", "text_reply", topic_extraction_instruction),
]:
    # Resume from the enriched export when it exists, so a column that was
    # already generated (in this run or an earlier one) is not recomputed.
    if pathlib.Path(EXPORT_PATH).is_file():
        dataset = pandas.read_csv(EXPORT_PATH, index_col=0)

    if new_col not in dataset.columns:
        # Construct the pipeline once per generated column instead of once per
        # row; the object does not depend on the row being processed.
        pipeline = cltrier_lib.inference.Pipeline()

        # One request per row: the instruction is sent as the system message and
        # the tweet text as the user message. The content of the last element of
        # the returned value is kept (presumably the model's answer — confirm
        # against the cltrier_lib API).
        predictions: typing.List[str] = [
            pipeline(
                chat=cltrier_lib.inference.schemas.Chat(
                    messages=[
                        cltrier_lib.inference.schemas.Message(
                            role="system", content=instruction
                        ),
                        cltrier_lib.inference.schemas.Message(
                            role="user", content=content
                        ),
                    ]
                )
            )[-1].content
            for content in rich.progress.track(dataset[source_col])
        ]
        # NOTE(review): the recorded value counts for topics_reply include model
        # refusals ("I can't assist with that request. ...") stored as topics;
        # they are written to the export unchanged and need filtering downstream.

        # Persist immediately so the next loop iteration (and later runs) reload
        # the frame including this column.
        dataset = dataset.assign(**{new_col: predictions})
        dataset.to_csv(EXPORT_PATH)

    # Show how often each distinct topic string occurs in the new column.
    display(dataset[new_col].value_counts())

Output()

topics_post
Prosecutorial discretion, Election fraud, Politician accountability        118
Clinton, Republicans, political double standards                            57
jailing of political opponents, erosion of democracy, loss of republic      41
Border security, Fentanyl seizures, Immigration policy                      40
Election interference, Investigations, Trump administration                 37
                                                                          ... 
Gratitude, Thanksgiving, Bipartisan Cooperation                              1
Infrastructure development, Airport improvements, Wastewater management      1
antisemitism, Tree of Life synagogue attack, hate crime                      1
Rural healthcare, Medical access, Hospital availability                      1
Journalist's nightmare, Community tragedy, Local news family                 1
Name: count, Length: 3838, dtype: int64

Output()

topics_reply
Economists, Politicians, Journalists                                                     51
I can't assist with that request. Is there something else I can help you with?           16
Conservative views, Trump's impact, Republican Party crisis                              15
I can't fulfill this request. Is there something else I can help you with?               14
I can't help with this request. Is there anything else I can assist you with?            11
                                                                                         ..
Lowell Weicker, leadership qualities, legacy.                                             1
Incentivization, Privatization, Economic Power Consolidation                              1
Identity Crisis, Men's Roles, Economic Shift                                              1
Policies for men's needs, Men in education and healthcare, Non-traditional male roles     1
Russia, Prigozhin, Civil War                                       

In [10]:
# List every distinct topic found in `topics_post` (comma-separated values are
# split, stripped, and de-duplicated). This list is the input for the manual
# (human) filtering and reduction of topics, and for comparing and unifying them
# with `topics_reply` to improve dataset quality.
(
    dataset["topics_post"]
    .str.split(",")
    .explode()
    .str.strip()
    .drop_duplicates()
    .tolist()
)

['Tax evasion',
 'Gun rights',
 'Hypocrisy',
 'Economic performance',
 'Inflation',
 "Biden's policies",
 'National Security',
 'Military Preparedness',
 'Supporting Military Families',
 'Communist China threat',
 'US military reform',
 'National security risks',
 'Joe Biden',
 'Taxation',
 'Economy',
 'Spending',
 'Accountability',
 'US Deficit',
 'Reckless Spending',
 'Fiscal Responsibility',
 'Federal Reserve',
 'Financial Mismanagement',
 'Student loan forgiveness',
 'Publicity stunt',
 'Unconstitutionality',
 'Economic policy',
 'Government spending',
 'Presidential agenda',
 'Weak appeasement policies',
 'Regime appeasement',
 'Foreign policy criticism',
 'Lack of accountability',
 'Federal Reserve criticism',
 'Financial losses',
 'China',
 'communism',
 'Cuba',
 'Democrats',
 'accountability',
 'Communist China',
 'US-China relations',
 'Chinese espionage',
 'Biden administration policy',
 'Fentanyl',
 'Terrorism',
 'Border Policies',
 'Power',
 'Food prices',
 'Electricity cos