In [30]:
import typing
import pathlib

import rich.progress
import pandas
import cltrier_lib

In [31]:
# Target CSV for the dataset once the model-generated columns have been added.
# The path of the plain (non-enriched) export is derived from this value by
# removing the ".enriched" part.
EXPORT_PATH: str = "../data/interim/twitter.german.dataset.enriched.csv"

In [32]:
# Load both interim tables; the first CSV column is used as the row index.
replies_raw: pandas.DataFrame = pandas.read_csv(
    "../data/interim/twitter.german.replies.csv", index_col=0
)
posts_raw: pandas.DataFrame = pandas.read_csv(
    "../data/interim/twitter.german.posts.csv", index_col=0
)

# Columns kept in the final frame, in display order.
selected_columns: typing.List[str] = [
    "id_post",
    "id_reply",
    "author_id_post",
    "author_id_reply",
    "author_post",
    "text_post",
    "text_reply",
]

# Attach to every reply the post whose `id` equals the reply's
# `conversation_id`. The left join keeps every reply row; column names shared
# by both tables get the "_reply" / "_post" suffix.
replies_with_posts: pandas.DataFrame = replies_raw.merge(
    posts_raw,
    how="left",
    left_on="conversation_id",
    right_on="id",
    suffixes=("_reply", "_post"),
)

dataset: pandas.DataFrame = (
    replies_with_posts
    .rename(columns={"username": "author_post"})
    [selected_columns]
)
dataset

Unnamed: 0,id_post,id_reply,author_id_post,author_id_reply,author_post,text_post,text_reply
0,1631341994235633669,1631783632519016449,17535941,15891726,christianduerr,"Nicht der #Verbrenner schadet dem #Klima, sond...",Sie haben wirklich keine Ahnung.
1,1646766528895897600,1646771345164828672,713361366858481664,15891726,StBrandner,Wo waren die ganzen plötzlichen #Kernkraftbefü...,Nö. Die liegt permanent falsch.
2,1637145132397916161,1637168335048646658,18189342,15891726,f_schaeffler,Die FDP Ostwestfalen-Lippe spricht sich für di...,Dann bauen wir doch da ein KKW hin.
3,1616701560431542272,1617115491868512256,42698498,15891726,RKiesewetter,"Aus meinem Wahlkreis heute eine Bürgerfrage: ""...",Ok. Stimmt. Sie sind beschämend.
4,1634905930595991554,1635196482180780033,712941191,15891726,katjadler,"Kohle statt Gas, Kernkraft als Gegner, eFuels ...",Ja. Aber Grundsätzlich verstehen Sie mal wiede...
...,...,...,...,...,...,...,...
5770,1650041581120942084,1650046072499195904,22364234,1633533791900663809,MarcusFaber,"Liebe Freunde, über 600 einsatzbereite Transp...",Wir? Was hast Du denn erreicht? Couch Rambo!
5771,1633382997389983744,1633739862342881280,3002271760,1633533791900663809,nyke_slawik,"Trans Frauen sind Frauen! Tut weh, Selbstverst...",Madame Slawik. Frauentag = Frauentag und nicht...
5772,1647368824985886722,1647545875923607553,17752770,1633533791900663809,KonstantinNotz,An alle die AKW in 🇩🇪 befürworten: 1.Fair enau...,"Polen, Finnland, Belgien, Frankreich, Holland,..."
5773,1634268513849286656,1634333263220076549,22364234,1633533791900663809,MarcusFaber,Die #Schweiz bleibt dabei. Man unterstützt die...,"Die Schweiz....Respekt davor, sich nicht vor d..."


In [33]:
# Store the merged dataset before any model-generated column is added. The
# target is EXPORT_PATH with the ".enriched" part removed.
base_export_path: str = EXPORT_PATH.replace(".enriched", "")
dataset.to_csv(base_export_path)

In [34]:
# Identifier of the model the inference pipeline is created with.
MODEL_NAME: str = "llama3.1:70b-instruct-q6_K"

inference: cltrier_lib.inference.Pipeline = cltrier_lib.inference.Pipeline(
    model=MODEL_NAME
)

# Show the model identifiers declared by the library's `Models` type.
typing.get_args(cltrier_lib.inference.schemas.Models)

('llama3.1:8b-instruct-q6_K',
 'llama2:70b-chat-q6_K',
 'llama3:70b-instruct-q6_K',
 'llama3.1:70b-instruct-q6_K',
 'mistral:7b-instruct-v0.2-q6_K',
 'mixtral:8x22b-instruct-v0.1-q6_K',
 'mixtral:8x7b-instruct-v0.1-q6_K',
 'phi3:14b-medium-128k-instruct-q6_K',
 'gemma:7b-instruct-q6_K',
 'gemma2:27b-instruct-q6_K',
 'qwen:72b-chat-v1.5-q6_K',
 'qwen2:72b-instruct-q6_K')

In [35]:
# System prompt for the political-leaning task: asks the model to answer with
# exactly one of the class names "left", "neutral" or "right" and nothing
# else. The tweet text itself is passed separately as the user message.
leaning_classification_instruction: str = """Your task is to determine the political leaning of the given tweet. Please classify each tweet into one of the following categories:

left: This content contains viewpoints typically associated with left-wing or center-left German politics. This includes advocacy for social welfare programs, environmental protection, progressive social policies, workers' rights, and increased public spending.
neutral: This content is either unpolitical, neutral, balanced, or purely factual. This includes objective news reporting, general information about government processes, or non-partisan public service announcements.
right: This content contains viewpoints typically associated with conservative or center-right German politics. This includes advocacy for fiscal conservatism, traditional values, business-friendly policies, and market liberalization.

Respond only with the class name. Omit any justification. This is the tweet: """


# System prompt for the topic task: asks the model for at most three topic
# names separated by commas and nothing else.
# NOTE(review): unlike the leaning prompt, this string ends with a newline
# after "This is the tweet: " -- confirm whether the difference is intended.
topic_extraction_instruction: str = """Your task is to extract the main topics of the given tweet. Summarize topics exceeding 10 characters. Keep the total number of topics to 3 or fewer. 

Respond only with the topic names separated by commas. Omit any justification. This is the tweet: 
"""

In [51]:
def _predict_column(
    instruction: str,
    texts: typing.Iterable[str],
    label: str,
) -> typing.List[str]:
    """Query the model once per text and collect the cleaned replies.

    Args:
        instruction: Prompt sent as the system message of every chat.
        texts: Tweet texts; each one is sent as the user message of its own chat.
        label: Description shown next to the progress bar.

    Returns:
        The content of the last message returned by ``inference`` for each
        text, in input order, with surrounding whitespace removed.
    """
    return [
        inference(
            chat=cltrier_lib.inference.schemas.Chat(messages=[
                cltrier_lib.inference.schemas.Message(role="system", content=instruction),
                cltrier_lib.inference.schemas.Message(role="user", content=text),
            ])
        )[-1].content.strip()
        # Strip so that a reply such as "left\n" still matches the exact label
        # comparison done later and does not split the value counts.
        # NOTE(review): assumes `.content` is always a str -- confirm.
        for text in rich.progress.track(texts, description=label)
    ]


# Resume from an earlier run: columns already present in the export are not
# predicted again. Reading once is enough because every new column is added to
# the in-memory frame before the frame is written back.
if pathlib.Path(EXPORT_PATH).is_file():
    dataset = pandas.read_csv(EXPORT_PATH, index_col=0)

for new_col, source_col, instruction in [
    ("leaning_post", "text_post", leaning_classification_instruction),
    ("leaning_reply", "text_reply", leaning_classification_instruction),
    ("topics_post", "text_post", topic_extraction_instruction),
    ("topics_reply", "text_reply", topic_extraction_instruction),
]:

    if new_col not in dataset.columns:
        dataset = dataset.assign(
            **{new_col: _predict_column(instruction, dataset[source_col], new_col)}
        )
        # Write after every finished column so a later failure does not
        # discard the columns that were already computed.
        dataset.to_csv(EXPORT_PATH)

    # `display` is the IPython rich-display helper available in notebook cells.
    display(dataset[new_col].value_counts())
    

leaning_post
left       2767
right      1771
neutral    1237
Name: count, dtype: int64

leaning_reply
right      2096
left       1850
neutral    1829
Name: count, dtype: int64

topics_post
WDR, GEZ-Gebühren, Hetze                                                        59
AfD, Democracy, Division                                                        53
Berlinwahl2023, Freie Demokraten, FDP Berlin                                    49
Autos, Klimafreundliche Kraftstoffe, EU-Kommission                              46
Buergerrat, Bundestag, GrueneBundestag                                          42
                                                                                ..
Klimaschutz, Kapitalismus, Verkehrswende                                         1
Verkehrssektor, Klimaschutz, Tempolimit                                          1
Demokratie, Rechtstaat, Reichsbürger-Milieu                                      1
Energiepolitik, Ausstieg aus Öl und Gasheizungen, Kritik an Manuela Schwesig     1
Klimaschutz, Soziale Gerechtigkeit, Kapitalismus                                 1
Name: count, Length: 675, dtype: int64

topics_reply
FDP, Politik                                                                      17
Grünen, Politik                                                                   11
SPD, Politik                                                                      10
Laughter, Amusement                                                                8
Demokratie, Politik                                                                7
                                                                                  ..
Steuerpolitik, Ausgabenpolitik, Finanzlage Deutschlands                            1
Respektierung von Meinungsfreiheit, politische Verantwortung, Selbstbestimmung     1
FDP, Kennzeichen, Online                                                           1
Krieg, Ukraine, Panzer                                                             1
Ukraine Hilfe, Korruption, Wirtschaft                                              1
Name: count, Length: 5457, dtype: int64

In [55]:
# Template for correcting a single label by hand and saving the result;
# replace $index$ with the row label before running:
# dataset.at[$index$, "leaning_reply"] = "neutral"
# dataset.to_csv(EXPORT_PATH)

# Show the rows whose reply label is not one of the three expected classes.
expected_labels: typing.List[str] = ["left", "right", "neutral"]
has_expected_label: pandas.Series = dataset["leaning_reply"].isin(expected_labels)
dataset[~has_expected_label]

Unnamed: 0,id_post,id_reply,author_id_post,author_id_reply,author_post,text_post,text_reply,leaning_post,leaning_reply,topics_post,topics_reply


In [76]:
# Input for the manual (human) filtering and reduction of topics, and for
# comparing and unifying them with topics_reply to improve dataset quality.
# Each comma-separated entry becomes its own row, is trimmed, and is listed
# once in order of first appearance.
topics_per_post: pandas.Series = dataset["topics_post"].str.split(",")
single_topics: pandas.Series = topics_per_post.explode().str.strip()
list(single_topics.drop_duplicates())

['Klima',
 'eFuels',
 'Verbrenner',
 'Kernkraft',
 'Altparteien',
 'AfD',
 'FDP',
 'Kernkraftwerke',
 'Ostwestfalen-Lippe',
 'Waffenkontrolle',
 'Leopard-Bestand',
 'Regierungstransparenz',
 'Klimapolitik',
 'Energiewende',
 'Kollektive Mobilität',
 'Corona',
 'Impfung',
 'Politik',
 'Adolf Hitler',
 'Antisemitismus',
 'Rechtsextremismus',
 'Cannabis',
 'Ampel',
 'Electric cars',
 'Road infrastructure',
 'Transportation policy',
 'Lützerath',
 'Protest',
 'Environment',
 'Flüge und Züge',
 'Streiken',
 'Wirtschaft',
 'Klimawandel',
 'Umwelt',
 'Ampel Politik',
 'Putin',
 'Baerbock',
 'Wende',
 'Urananreicherung',
 'Grüne',
 'Russland',
 'Streikrecht',
 'Staatliches Monopol',
 'Tarifpartnerei',
 'Aufstand der Letzten Generation',
 'Moralische Grenzen',
 'Rechtfertigung von Straftaten',
 'Elektroautos',
 'Deutschland',
 'Kernkraftgegner',
 'Sozialismus',
 'Migrationspolitik',
 'Entschuldigung',
 'Verteidigungsminister Pistorius',
 'Leopard2',
 'Ukraine-Krieg',
 'Nuclear Phase-out',
 'Ger