In [1]:
import pandas
import seaborn

import cltrier_lib

import twon_agents

In [2]:
# Relative path to the data directory; raw and interim files are read from
# (and written to) sub-folders beneath it.
DATA_PATH: str = "../../../data"
# Fixed seed handed to the random sampling step so that it is reproducible.
SEED: int = 1

In [3]:
# Read the interim posts table (first CSV column is the index) and keep only
# the "id" column; it is used to restrict the replies to these posts.
posts_csv: str = f"{DATA_PATH}/interim/twitter.german.posts.csv"
post_ids: pandas.DataFrame = pandas.read_csv(posts_csv, index_col=0).loc[:, ["id"]]
post_ids

Unnamed: 0,id
0,1625772644120334336
1,1624882807259467777
2,1649799899679014915
3,1612788201877282817
4,1647187822753861632
...,...
863,1641705169422303232
864,1616759620143292417
865,1610607783254298624
866,1659447745755721734


In [4]:
# Load the raw replies, run them through the project's tweet filter (replies
# containing mentions are kept: remove_w_mentions=False) and keep only replies
# whose conversation_id matches one of the selected post ids.
replies: pandas.DataFrame = (
    twon_agents.data.filter_tweets(
        pandas.read_csv(
            f"{DATA_PATH}/raw/twitter.german/GermanyReplies2023.csv",
            index_col=0,
        ),
        remove_w_mentions=False,
    )
    # inner join: replies to posts that are not in `post_ids` are dropped
    .merge(post_ids, how="inner", left_on="conversation_id", right_on="id")
    # both frames carry an "id" column, so the merge suffixes them; restore the
    # reply's own id under its original name ("id_y" repeats conversation_id)
    .rename(columns={"id_x": "id"})
)
replies

Unnamed: 0,id,author_id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,public_metrics.bookmark_count,public_metrics.impression_count,created_at,username,in_reply_to_user_id,id_y
0,1626210251342053377,1498678918374735878,1626182469757140993,@n_roettgen Kämpfen für unsere Freiheit? So wi...,0,0,0,0,0,7,2023-02-16T13:21:52.000Z,n_roettgen,1040160799208161280,1626182469757140993
1,1626210339351150592,1585985488577503243,1626182469757140993,@n_roettgen Weil auch andere Länder Rückzieher...,0,0,0,0,0,10,2023-02-16T13:22:13.000Z,n_roettgen,1040160799208161280,1626182469757140993
2,1615837811294507008,1577540347437129728,1615836681713897472,@Einschlag22's account is temporarily unavaila...,0,0,0,0,0,26,2023-01-18T22:25:29.000Z,n_roettgen,1040160799208161280,1615836681713897472
3,1626213768190386178,1495081753022799875,1626182469757140993,@n_roettgen Warum wollen die anderen Länder wo...,1,0,8,0,0,89,2023-02-16T13:35:50.000Z,n_roettgen,1040160799208161280,1626182469757140993
4,1626211962123059200,1174913102728454144,1626182469757140993,@n_roettgen Die Ukraine kämpft für sich selber...,0,0,0,0,0,1,2023-02-16T13:28:39.000Z,n_roettgen,1040160799208161280,1626182469757140993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184638,1657432545997332481,1451289309424803840,1657421695181897728,@AnAudretsch @_FriedrichMerz Sagt ein GRÜNER 🤣,0,0,2,0,0,8,2023-05-13T17:08:07.000Z,AnAudretsch,995519538,1657421695181897728
184639,1657431761331134464,212688037,1657421695181897728,@AnAudretsch @_FriedrichMerz Andreas Du sprich...,1,0,2,0,0,16,2023-05-13T17:05:00.000Z,AnAudretsch,995519538,1657421695181897728
184640,1657491488870105089,3373809729,1657421695181897728,@AnAudretsch @_FriedrichMerz Sie Linkspopulist...,0,0,0,0,0,21,2023-05-13T21:02:20.000Z,AnAudretsch,995519538,1657421695181897728
184641,1657455359198474242,1266617819132841984,1657421695181897728,@AnAudretsch @_FriedrichMerz Das sie aktuell ü...,0,0,1,0,0,44,2023-05-13T18:38:46.000Z,AnAudretsch,995519538,1657421695181897728


In [8]:
# Distribution of user activity: count the replies written by each author and
# summarise those counts, with a fine-grained set of upper percentiles.
posts_per_author: pandas.Series = replies["author_id"].value_counts()
user_activity_dist: pandas.Series = posts_per_author.describe(
    percentiles=[
        0.25, 0.5, 0.75, 0.8, 0.85, 0.9,
        0.95, 0.96, 0.97, 0.98, 0.99, 0.995,
    ]
)
user_activity_dist

count    57093.000000
mean         3.234074
std          4.836542
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
80%          4.000000
85%          5.000000
90%          7.000000
95%         11.000000
96%         13.000000
97%         15.000000
98%         17.000000
99%         24.000000
99.5%       30.000000
max        194.000000
Name: count, dtype: float64

In [9]:
# Row label of `user_activity_dist` used as the activity cut-off. With "99%"
# only authors above the 99th percentile of replies-per-author are kept,
# i.e. roughly the most active 1% of authors.
selected_percentile: str = "99%"

# Number of replies at the chosen percentile. It serves two purposes:
#   * authors with strictly more replies than this value are selected,
#   * its integer part is the number of replies sampled per selected author.
activity_threshold: float = user_activity_dist.loc[selected_percentile]
replies_per_author: int = int(activity_threshold)

# Replies per author, computed once and reused for the selection below.
reply_counts: pandas.Series = replies["author_id"].value_counts()
active_authors: pandas.Index = reply_counts[reply_counts > activity_threshold].index

sampled_replies: pandas.DataFrame = (
    # keep only the replies written by the most active authors
    replies[replies["author_id"].isin(active_authors)]
    .groupby("author_id")
    # draw the same number of replies from every author; each selected author
    # has more than `replies_per_author` replies, so sampling without
    # replacement always has enough rows
    .sample(n=replies_per_author, random_state=SEED)
)

# Balanced sample: every selected author contributes exactly the same number
# of replies.
assert len(sampled_replies) == (
    replies_per_author * sampled_replies["author_id"].nunique()
)

len(sampled_replies), sampled_replies["author_id"].nunique()

(12816, 534)

In [10]:
# Display the sampled replies for a visual check (the rendered table is
# truncated by pandas, only the first and last rows are shown).
sampled_replies

Unnamed: 0,id,author_id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,public_metrics.bookmark_count,public_metrics.impression_count,created_at,username,in_reply_to_user_id,id_y
136983,1647320441390788609,15891726,1646966214948802577,@maxmordhorst Nein. Deine eigene Dummheit.,0,0,1,0,0,4,2023-04-15T19:26:13.000Z,maxmordhorst,2174533082,1646966214948802577
139799,1646023351188738050,15891726,1645769884926717953,@maxmordhorst Traurig aber ehrlich. Wieder mal...,0,0,2,0,0,17,2023-04-12T05:32:03.000Z,maxmordhorst,2174533082,1645769884926717953
137464,1620124448643768340,15891726,1619987792976805888,@maxmordhorst @pmkru So ein Blödsinn.,0,0,0,0,0,4,2023-01-30T18:19:03.000Z,maxmordhorst,2174533082,1619987792976805888
139676,1653411669400231940,15891726,1653142947536199681,@maxmordhorst Haben Sie wirklich so wenig begr...,0,0,0,0,0,4,2023-05-02T14:50:35.000Z,maxmordhorst,2174533082,1653142947536199681
142168,1620740556799328256,15891726,1620720408558436352,@maxmordhorst Mordhorst der Rechenkünstler. Je...,0,0,0,0,0,16,2023-02-01T11:07:15.000Z,maxmordhorst,2174533082,1620720408558436352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35316,1659103494333243395,1644215187908489217,1659098493976477696,@MarcoBuschmann Und jetzt stell Dir vor: Ein G...,0,1,2,0,0,92,2023-05-18T07:47:52.000Z,MarcoBuschmann,106136813,1659098493976477696
148739,1649664766049148929,1644215187908489217,1649661814106640385,@GoeringEckardt Das würdet ihr mit UMWELTSCHUT...,0,0,0,0,0,8,2023-04-22T06:41:44.000Z,GoeringEckardt,626287930,1649661814106640385
39792,1649663050171949056,1644215187908489217,1649384828859449344,@MarcoBuschmann @fdp @c_lindner Wo kann man si...,0,0,0,0,0,3,2023-04-22T06:34:55.000Z,MarcoBuschmann,106136813,1649384828859449344
147830,1651095556435894273,1644215187908489217,1650870107025833986,@GoeringEckardt DU BIST EINE KÜCHENHILFE. Spr...,0,0,0,0,0,2,2023-04-26T05:27:11.000Z,GoeringEckardt,626287930,1650870107025833986


In [8]:
# Matches the whole block of @-mentions at the start of a reply, including the
# whitespace after each handle (handles are 1-15 word characters). Whitespace
# is allowed after *every* handle, so "@a @b @c text" is reduced to "text";
# allowing it only after the first handle would leave " @c text" behind.
leading_mentions: str = r"^(?:@\w{1,15}\s*)+"

# Write the cleaned replies to the interim data folder. Only the CSV receives
# the cleaned text; `sampled_replies` itself is left unchanged.
(
    sampled_replies
    # remove linebreaks and mentions at beginning
    .assign(
        text=(
            sampled_replies["text"]
            # line breaks first, so that mentions separated by a newline are
            # also part of the leading block matched below
            .replace(r"\n", " ", regex=True)
            .replace(leading_mentions, "", regex=True)
        )
    )[["id", "author_id", "conversation_id", "created_at", "text"]].to_csv(
        f"{DATA_PATH}/interim/twitter.german.replies.csv"
    )
)

In [9]:
# Run every sampled reply through the cltrier_lib classification pipeline and
# flatten the dumped result objects into one row per reply (nested keys become
# dotted column names such as "results.sentiment.negative").
# NOTE(review): the pipeline receives the raw reply text, i.e. including the
# leading @-mentions - confirm that this is intended.
# NOTE(review): threshold=0.0 presumably keeps the scores of all labels -
# verify against the cltrier_lib documentation.
classifier = cltrier_lib.classify.Pipeline()
predictions = classifier(sampled_replies["text"].tolist(), threshold=0.0)

classified_data: pandas.DataFrame = pandas.json_normalize(
    [prediction.model_dump() for prediction in predictions]
)
classified_data

Unnamed: 0,sample,results.topics.arts_&_culture,results.topics.business_&_entrepreneurs,results.topics.celebrity_&_pop_culture,results.topics.diaries_&_daily_life,results.topics.family,results.topics.fashion_&_style,results.topics.film_tv_&_video,results.topics.fitness_&_health,results.topics.food_&_dining,...,results.emotions.trust,results.sentiment.negative,results.sentiment.neutral,results.sentiment.positive,results.irony.non_irony,results.irony.irony,results.offensive.non-offensive,results.offensive.offensive,results.hate.NOT-HATE,results.hate.HATE
0,@RKiesewetter Wussten das die CDU Verteidigung...,0.010403,0.004250,0.017381,0.068277,0.001605,0.000734,0.011765,0.001554,0.000840,...,0.030379,0.270784,0.773874,0.374926,0.810347,0.285108,0.710429,0.309836,0.931027,0.080776
1,@StBrandner Da es keinen Staatsfunk gibt: Nein.,0.022527,0.006241,0.175455,0.079630,0.001431,0.005069,0.109173,0.001740,0.000917,...,0.011219,0.262889,0.737343,0.396053,0.827131,0.316757,0.713729,0.307276,0.920231,0.082769
2,"@f_schaeffler Steht da auch drinne, dass die D...",0.096684,0.008564,0.033465,0.211034,0.004652,0.003375,0.010797,0.003691,0.001490,...,0.026807,0.238850,0.750003,0.450912,0.885061,0.194130,0.739730,0.279449,0.948221,0.059228
3,@katjadler Leute die Begriffe wie „kulturelle ...,0.043447,0.003945,0.159271,0.141028,0.011590,0.001960,0.105020,0.004714,0.001554,...,0.008653,0.399478,0.719105,0.293288,0.793308,0.310786,0.665345,0.350036,0.886828,0.108899
4,@StBrandner Oh doch. Zum Beispiel könnte eine ...,0.055064,0.006693,0.132551,0.016529,0.002345,0.005889,0.022203,0.004730,0.001238,...,0.039536,0.264005,0.747439,0.410508,0.726433,0.366130,0.688613,0.331351,0.945741,0.061119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5215,@nyke_slawik Nein herrlich! Eventuell wählt di...,0.013116,0.009059,0.028200,0.042101,0.001510,0.001874,0.012149,0.001188,0.000672,...,0.058914,0.162306,0.727314,0.622772,0.816428,0.267064,0.744944,0.269698,0.955697,0.044004
5216,@Tino_Chrupalla @RusBotschaft Das ZDF verstrah...,0.077125,0.015609,0.017731,0.078705,0.001460,0.004026,0.027874,0.001110,0.001389,...,0.028390,0.108666,0.594224,0.832509,0.680777,0.416570,0.710064,0.295652,0.908470,0.098297
5217,@GoeringEckardt @hartaberfair @DasErste Mit so...,0.038423,0.008604,0.027372,0.292372,0.002446,0.003985,0.009152,0.002418,0.003118,...,0.045368,0.175892,0.761234,0.533312,0.820887,0.244813,0.748947,0.271437,0.950803,0.055427
5218,@Tino_Chrupalla Was zum Teufel sind eigentlich...,0.102737,0.144527,0.012784,0.213372,0.004211,0.008295,0.020930,0.001312,0.012716,...,0.025128,0.277564,0.864931,0.253881,0.860845,0.226521,0.744517,0.269264,0.941371,0.063817
