In [1]:
import calendar
import csv
import glob
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy

from utils import build_df, get_users

## Data processing notebook

This notebook describes how to get from the tweet IDs provided by [Banda et al.](https://github.com/thepanacealab/covid19_twitter) to the network presented in our paper.

We hydrated the tweet IDs using `hydrator.py` with the command below for every month of 2021:

```sh
python hydrator.py -m <month> -y <year> -t <BEARER TOKEN> -c  
```
```sh
  -h   show this help message and exit
  -m   month (number) for which the data should be hydrated
  -y   year (number) for which the data should be hydrated
  -c   whether or not to hydrate clean data (without retweets)
  -t   Twitter API Bearer Token
```

The hydrated data would then be found under `data/clean/de/hydrated/csv`.

In [2]:
# Assemble the tweets dataframe from the hydrated CSV files.

tweets_df = build_df(
    "data/clean/de/hydrated/csv",
    "csv",
)

In [3]:
# Remove accounts whose tweets are in languages other than German from the dataset.
# NOTE(review): this indexes tweets_df["username"] before the user-metadata merge further
# down, so it assumes build_df already provides a username column — TODO confirm.

tweets_df = tweets_df[~tweets_df["username"].isin(["benshapiro", "mir_ocall"])]

In [4]:
# Hydrate the user objects for every distinct tweet author.

# `bearer_token` was never defined in this notebook, so it is read from the environment here
# instead of being hard-coded.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
if not bearer_token:
    raise RuntimeError(
        "Set the TWITTER_BEARER_TOKEN environment variable to your Twitter API bearer token."
    )

all_user_ids = list(tweets_df["author_id"].drop_duplicates())
users = get_users(all_user_ids, bearer_token)

In [5]:
# Turn the hydrated user records into a pandas dataframe.

users_df = pd.DataFrame(users)

In [6]:
# Add month (abbreviated name) and year columns to the dataframe.

created_at = pd.DatetimeIndex(tweets_df["created_at"])

# `.month` is a pandas Index, which offers `.map` but not `.apply`.
months = created_at.month
tweets_df["month"] = months.map(lambda month_number: calendar.month_abbr[month_number])
tweets_df["year"] = created_at.year

In [7]:
# Add user meta data to the tweets dataframe.

# Only bring in user columns the tweets frame does not already carry: an overlapping column
# (e.g. "username", which is used for filtering earlier) would otherwise be split into
# "<col>_x"/"<col>_y" by the merge and break later lookups by the plain column name.
user_meta_cols = [col for col in ("name", "username") if col not in tweets_df.columns]

tweets_df = (
    tweets_df.merge(
        users_df[["id", *user_meta_cols]],
        left_on="author_id",
        right_on="id",
        how="inner",
    )
    # "id" exists on both sides: id_x is the tweet id, id_y duplicates author_id.
    .drop(columns=["id_y"])
    .rename(columns={"id_x": "tweet_id"})
)

In [8]:
# Compute tweet popularity:
# popularity = like_count + retweet_count + reply_count + quote_count

popularity_cols = ["like_count", "retweet_count", "reply_count", "quote_count"]

# The original read from an undefined `df`; the counts live in tweets_df.
tweets_df["tweet_popularity"] = tweets_df.loc[:, popularity_cols].sum(axis=1)

# Most popular tweets first, with a fresh 0..n-1 index.
tweets_df = tweets_df.sort_values("tweet_popularity", ascending=False).reset_index(
    drop=True
)

In [9]:
# Persist both frames: the users go to users.csv, and the tweets dataframe
# (augmented with user info) goes to tweets.csv.

users_df.to_csv(path_or_buf="data/users.csv", index=False)
tweets_df.to_csv(path_or_buf="data/tweets.csv", index=False)

Read in the saved data

In [10]:
# Reload the tweets and users frames that were written to disk above.
tweets_df, users_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("data/tweets.csv", "data/users.csv")
)

In [11]:
# Take the 1024 most popular tweets (in the recorded run these are the tweets
# with a popularity score of at least 2000, see the summary below).

top_1024_tweets = tweets_df.nlargest(n=1024, columns="tweet_popularity")
top_1024_tweets["tweet_popularity"].describe()

count     1024.000000
mean      4162.578125
std       3108.897249
min       2000.000000
25%       2457.000000
50%       3147.000000
75%       4546.750000
max      29422.000000
Name: tweet_popularity, dtype: float64

In [12]:
# Number of top-1024 tweets per user, largest share first.

# A descriptive name is used instead of `_`: assigning to `_` shadows IPython's
# "last output" variable for the rest of the session.
tweets_per_user = top_1024_tweets.groupby(["username"], as_index=False).size()
top_1024_tweets_usernames = tweets_per_user.sort_values(
    "size", ascending=False
).reset_index(drop=True)

# Only the ten most prolific users are written to disk.
top_1024_tweets_usernames.head(10).to_csv(
    "data/top_1024_tweets_usernames.csv", index=False
)
top_1024_tweets_usernames.head(5)

Unnamed: 0,username,size
0,DrPuerner,136
1,Karl_Lauterbach,133
2,EckerleIsabella,79
3,Flying__Doc,24
4,tagesschau,23


In [13]:
# Attach the user records to the authors of the top tweets (left join on username,
# so every author row is kept).

top_1024_tweets_users = pd.merge(
    top_1024_tweets_usernames,
    users_df,
    how="left",
    on="username",
)
len(top_1024_tweets_users)

372

In [14]:
# Get the accounts each top user follows (their "friends") and save them in the
# followings dictionary, keyed by username.

# The bearer token is read from the TWITTER_BEARER_TOKEN environment variable instead of
# being pasted into the notebook; it falls back to an empty token when the variable is unset.
client = tweepy.Client(
    bearer_token=os.environ.get("TWITTER_BEARER_TOKEN", ""),
    wait_on_rate_limit=True,
)

followings = {}

for _, user in top_1024_tweets_users.iterrows():
    following_pages = tweepy.Paginator(
        client.get_users_following, id=user["id"], max_results=1000
    )
    # At most 1000 followed accounts are collected per user.
    friends = [
        {"id": u.id, "username": u.username}
        for u in following_pages.flatten(limit=1000)
    ]
    followings[user["username"]] = {"id": user["id"], "friends": friends}

In [15]:
# Pickle the followings dictionary to disk so the API calls need not be repeated.

with open("data/objects/followings", "wb") as pickle_file:
    pickle.dump(followings, pickle_file)

In [16]:
# Load the followings object back from disk.

# No pre-initialisation to {}: if the file cannot be opened, an already loaded
# `followings` is left untouched instead of being silently emptied.
# NOTE: pickle.load can execute arbitrary code — only load files written by this notebook.
with open("data/objects/followings", "rb") as openfile:
    followings = pickle.load(openfile)
len(followings)

372

In [17]:
def friend_filter(ar, df):
    """Keep only the friends whose username appears in ``df["username"]``.

    Parameters
    ----------
    ar : iterable of dict
        Friend records; each must have a ``"username"`` key.
    df : mapping / DataFrame
        Object whose ``"username"`` entry is an iterable of the usernames to keep.

    Returns
    -------
    list of dict
        The records of ``ar`` (original order preserved) whose username is in ``df``.
    """
    # Build the lookup once; the original rebuilt a list for every element of `ar`.
    allowed_usernames = set(df["username"])
    return [friend for friend in ar if friend["username"] in allowed_usernames]

In [18]:
# Filtered friends lists: keep only friends who are themselves authors of the
# top 1024 tweets, and only for users that are in the top-users frame.

# Build the membership lookup once instead of rebuilding a list for every dictionary item.
top_usernames = set(top_1024_tweets_users["username"])

followings_filtered = {
    key: {
        "id": value["id"],
        "friends": friend_filter(value["friends"], top_1024_tweets_users),
    }
    for key, value in followings.items()
    if key in top_usernames
}
len(followings_filtered)

372

### Build edge list

In [19]:
def build_graph(edges_path, df, followings_filtered):
    """Write the follow network as a ``source,target`` CSV edge list.

    Parameters
    ----------
    edges_path : str or path-like
        Destination file; it is created or overwritten.
    df : any
        Not used by this function; kept so existing calls keep working.
    followings_filtered : dict
        Maps a username to a dict whose ``"friends"`` entry is a list of
        records with a ``"username"`` key. One edge ``username -> friend`` is
        written per friend record.
    """
    header = ["source", "target"]
    # newline="" lets the csv module control line endings itself; without it,
    # platforms that translate "\n" end up with blank rows between records.
    with open(edges_path, "w", encoding="UTF8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(
            [str(username), str(friend["username"])]
            for username, d in followings_filtered.items()
            for friend in d["friends"]
        )

In [20]:
# Write the filtered follow network to the edge-list CSV one directory up.
edges_path = "../network_edgelist.csv"

build_graph(
    edges_path=edges_path,
    df=top_1024_tweets_users,
    followings_filtered=followings_filtered,
)