In [None]:
import typing

import pandas
import seaborn

import cltrier_lib

import twon_agents

In [6]:
SEED: int = 1
DATA_PATH: str = "../../../data"

METRICS: typing.List[str] = [
    "public_metrics.impression_count",
    "public_metrics.like_count",
    "public_metrics.retweet_count",
    "public_metrics.reply_count",
]

In [9]:
posts: pandas.DataFrame = twon_agents.data.filter_tweets(
    pandas.read_csv(
        f"{DATA_PATH}/raw/twitter.german/GermanyMdBTweets_2023.csv", index_col=0
    )
    .merge(
        right=pandas.read_json(f"{DATA_PATH}/raw/twitter.german/mdb_meta.json"),
        left_on="username",
        right_on="twitter_username",
    )
)
posts.to_csv(f"{DATA_PATH}/interim/twitter.german.posts.csv")
posts

Unnamed: 0,id,author_id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,public_metrics.impression_count,public_metrics.bookmark_count,created_at,username,twitter_username,first_name,last_name,party,role
3,1645675247595581440,1040160799208161280,1645675245141884928,Wenn wir Männern wie Xi &amp; Putin signalisie...,48,30,356,6,17359,,2023-04-11T06:28:48.000Z,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
4,1621500636197003265,1040160799208161280,1621500633403604992,Als Opposition ist es unsere demokratische Auf...,71,18,402,1,7569,,2023-02-03T13:27:32.000Z,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
7,1613459277477302273,1040160799208161280,1613459266932703232,"5/6 Der Idealfall wäre, wenn Deutschland jetzt...",68,15,649,2,10753,,2023-01-12T08:54:03.000Z,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
8,1613459271672266752,1040160799208161280,1613459266932703232,"3/6 Immer mehr Staaten begreifen, dass der Kri...",47,8,522,0,8803,,2023-01-12T08:54:01.000Z,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
9,1645389288551358465,1040160799208161280,1645389286978600964,"Selbiger 🇫🇷 Präsident, der vor einiger Zeit di...",61,24,792,4,25793,,2023-04-10T11:32:30.000Z,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37208,1610326064605310976,97530140,1610326064605310976,Wieviele Silvesternächte mit Straftaten &amp; ...,31,20,196,1,3127,,2023-01-03T17:23:46.000Z,MalteKaufmann,MalteKaufmann,Malte,Kaufmann,AfD,MdB
37209,1610129842099585024,97530140,1610129842099585024,Klar ist: Wer hier als Ausländer unseren Staat...,145,45,1073,6,16841,,2023-01-03T04:24:03.000Z,MalteKaufmann,MalteKaufmann,Malte,Kaufmann,AfD,MdB
37210,1609821802993442816,97530140,1609821802993442816,Die Lösung für die Angriffe auf Rettungskräfte...,56,32,398,4,9542,,2023-01-02T08:00:01.000Z,MalteKaufmann,MalteKaufmann,Malte,Kaufmann,AfD,MdB
37258,1663108743758790657,97530140,1663108743758790657,#Freiheit statt Verbote und Bevormundung. Imme...,51,7,212,0,2087,0.0,2023-05-29T09:03:18.000Z,MalteKaufmann,MalteKaufmann,Malte,Kaufmann,AfD,MdB


In [95]:
# calculate the distribution of replies
post_reply_dist: pandas.Series = (
    posts["public_metrics.reply_count"]
    .value_counts()
    .describe(percentiles=[0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95])
)
post_reply_dist

count    797.000000
mean       7.764115
std       34.252016
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
80%        5.000000
85%        7.000000
90%       12.000000
95%       20.200000
max      680.000000
Name: count, dtype: float64

In [96]:
# generate a subset containing only the 90% most commented tweets
posts = posts[posts["public_metrics.reply_count"] > post_reply_dist.loc["90%"]]
len(posts)

3432

In [97]:
# calculate the distribution of user activity (number of posts)
user_activity_dist: pandas.Series = (
    posts["username"]
    .value_counts()
    .describe(percentiles=[0.25, 0.4, 0.5, 0.6, 0.75, 0.8, 0.85, 0.9, 0.95])
)
user_activity_dist

count     61.000000
mean      56.262295
std       56.101367
min       16.000000
25%       24.000000
40%       33.000000
50%       38.000000
60%       47.000000
75%       62.000000
80%       63.000000
85%       89.000000
90%      108.000000
95%      161.000000
max      327.000000
Name: count, dtype: float64

In [98]:
selected_percentile: str = "75%"

sampled_posts: pandas.DataFrame = (
    posts[
        # select only the 80% most active users
        posts["username"].isin(
            list(
                (
                    posts["username"].value_counts()
                    > user_activity_dist.loc[selected_percentile]
                )
                .pipe(lambda _s: _s[_s])
                .index
            )
        )
    ]
    .groupby("username")
    # sample by number of least active 80%' user posts
    .sample(n=int(user_activity_dist.loc[selected_percentile]), random_state=SEED)
)

len(sampled_posts), len(sampled_posts["username"].unique())

(868, 14)

In [99]:
sampled_posts

Unnamed: 0,id,author_id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,public_metrics.impression_count,public_metrics.bookmark_count,created_at,username,twitter_username,first_name,last_name,party,role
30977,1625772644120334336,805308596,1625772644120334336,Beim Bunkerbesuch in 🇫🇮 fängt unsere Außenmini...,315,391,3328,13,75171,,2023-02-15 08:22:58+00:00,Beatrix_vStorch,Beatrix_vStorch,Beatrix,von Storch,AfD,MdB
30972,1624882807259467777,805308596,1624882807259467777,AfD-stabil mit Zugewinnen in einer Stadt im li...,108,230,1528,5,42833,,2023-02-12 21:27:04+00:00,Beatrix_vStorch,Beatrix_vStorch,Beatrix,von Storch,AfD,MdB
30956,1649799899679014915,805308596,1649799899679014915,"Der heutige #EarthDay2023 unter dem Motto ""Woh...",124,154,969,3,21361,,2023-04-22 15:38:42+00:00,Beatrix_vStorch,Beatrix_vStorch,Beatrix,von Storch,AfD,MdB
30856,1612788201877282817,805308596,1612788201877282817,Linksgrüner #Staatsfunk schürt #Klimapanik. Li...,551,627,3254,23,87306,,2023-01-10 12:27:26+00:00,Beatrix_vStorch,Beatrix_vStorch,Beatrix,von Storch,AfD,MdB
30865,1647187822753861632,805308596,1647187822753861632,Das heutige Ende der #Kernkraft ist auch ein A...,116,178,756,4,15604,,2023-04-15 10:39:14+00:00,Beatrix_vStorch,Beatrix_vStorch,Beatrix,von Storch,AfD,MdB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,1641705169422303232,1040160799208161280,1641705166930866177,"Auf den Bürger zu verweisen, der beim #Klimasc...",28,14,192,0,5002,,2023-03-31 07:33:08+00:00,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
194,1616759620143292417,1040160799208161280,1616759620143292417,Dass der Kanzler in #Ramstein nicht geliefert ...,1159,1070,5369,77,136574,,2023-01-21 11:28:26+00:00,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
414,1610607783254298624,1040160799208161280,1610607774643224577,Das Auswärtige Amt täuscht die Öffentlichkeit ...,994,96,2597,17,29047,,2023-01-04 12:03:13+00:00,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB
412,1659447745755721734,1040160799208161280,1659447740458319872,"Wenn wir der #Ukraine nicht mit Waffen helfen,...",26,27,215,0,7267,0.0,2023-05-19 06:35:48+00:00,n_roettgen,n_roettgen,Norbert,Röttgen,CDU/CSU,MdB


In [None]:
(
    sampled_posts.reset_index()[
        [
            "id",
            "author_id",
            "conversation_id",
            "first_name",
            "last_name",
            "username",
            "party",
            "created_at",
            "text",
        ]
    ].to_csv(f"{DATA_PATH}/interim/twitter.german.posts.sampled.csv")
)