In [1]:
import calendar
import csv
import glob
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy

from utils import build_df, get_users

## Data processing notebook

This notebook describes how to get from the tweet IDs provided by [Banda et al.](https://github.com/thepanacealab/covid19_twitter) to the network presented in our paper.

We hydrated the tweet IDs using `hydrator.py` with the command below for every month of 2021:

```sh
python hydrator.py -m <month> -y <year> -t <BEARER TOKEN> -c  
```
```sh
  -h   show this help message and exit
  -m   month (number) for which the data should be hydrated
  -y   year (number) for which the data should be hydrated
  -c   whether or not to hydrate clean data (without retweets)
  -t   Twitter API Bearer Token
```

The hydrated data can then be found under `data/clean/de/hydrated/csv`.

In [2]:
%%script false --no-raise-error
# Build the tweets dataframe from the hydrated data directory using the
# project helper `build_df` (imported from `utils`).
# NOTE(review): the second argument is presumably the file format to read — confirm in utils.

tweets_df = build_df("data/clean/de/hydrated/csv", "csv")

Couldn't find program: 'false'


In [3]:
%%script false --no-raise-error
# Remove users whose tweets are in languages other than German from the dataset.
# NOTE(review): a "username" column is merged into tweets_df only in a later
# cell (the user metadata merge); confirm that build_df already provides it here.

non_german_usernames = ["benshapiro", "mir_ocall"]
tweets_df = tweets_df[~tweets_df["username"].isin(non_german_usernames)]

Couldn't find program: 'false'


In [4]:
%%script false --no-raise-error
# Hydrate the user objects for every distinct tweet author.

# `bearer_token` was not defined anywhere in this notebook, so the call below
# failed on a fresh kernel. Read it from the environment instead of storing a
# credential in the notebook (export TWITTER_BEARER_TOKEN before running).
bearer_token = os.environ["TWITTER_BEARER_TOKEN"]

all_user_ids = list(tweets_df["author_id"].drop_duplicates())
users = get_users(all_user_ids, bearer_token)

Couldn't find program: 'false'


In [5]:
%%script false --no-raise-error
# Create a pandas dataframe from the hydrated `users` returned by `get_users`.

users_df = pd.DataFrame(data=users)

Couldn't find program: 'false'


In [6]:
%%script false --no-raise-error
# Add a month column (abbreviated name taken from `calendar.month_abbr`) and a
# year column, both derived from `created_at`.

# Parse the timestamps once and reuse the result for both columns.
created_at = pd.DatetimeIndex(tweets_df["created_at"])

# `.month` yields a pandas Index, which has no `.apply` method; `.map` is the
# element-wise equivalent on an Index.
months = created_at.month
tweets_df["month"] = months.map(lambda x: calendar.month_abbr[x])
tweets_df["year"] = created_at.year

Couldn't find program: 'false'


In [7]:
%%script false --no-raise-error
# Attach user metadata (name, username) to every tweet by joining on the
# author id. Both frames have an "id" column, so the merge yields "id_x" (the
# tweet's id) and "id_y" (the user's id, redundant with "author_id").

user_meta = users_df[["id", "name", "username"]]
tweets_df = (
    tweets_df.merge(user_meta, left_on="author_id", right_on="id", how="inner")
    .drop(columns=["id_y"])
    .rename(columns={"id_x": "tweet_id"})
)

Couldn't find program: 'false'


In [8]:
%%script false --no-raise-error
# Compute tweet popularity as the sum of all engagement counts:
# popularity = like_count + retweet_count + reply_count + quote_count

# The counts are read from `tweets_df` itself; the previous code read them
# from `df`, which is not defined at this point in the notebook.
engagement_columns = ["like_count", "retweet_count", "reply_count", "quote_count"]
tweets_df["tweet_popularity"] = tweets_df[engagement_columns].sum(axis=1)

# Order tweets from most to least popular and renumber the index from 0.
tweets_df = tweets_df.sort_values("tweet_popularity", ascending=False).reset_index(
    drop=True
)

Couldn't find program: 'false'


In [9]:
%%script false --no-raise-error
# Save the users dataframe to data/users.csv and the tweets dataframe
# (augmented with user info) to data/tweets.csv; the row index is not written.

users_df.to_csv("data/users.csv", index=False)
tweets_df.to_csv("data/tweets.csv", index=False)

Couldn't find program: 'false'


Read in the saved data

In [10]:
# Load the tweets and users tables from the CSV files under data/, so the
# rest of the notebook can run without re-hydrating anything.
tweets_df = pd.read_csv("data/tweets.csv", encoding="utf-8")
users_df = pd.read_csv("data/users.csv", encoding="utf-8")

In [11]:
%%script false --no-raise-error
# For every top-tweet author, fetch the accounts they follow ("friends") and
# store them in the `followings` dictionary, keyed by username.

# The bearer token is read from the environment so that no credential is
# stored in the notebook (export TWITTER_BEARER_TOKEN before running).
# The token that used to be hard-coded here should be treated as leaked and
# revoked.
client = tweepy.Client(
    bearer_token=os.environ["TWITTER_BEARER_TOKEN"],
    wait_on_rate_limit=True,
)

followings = {}

# NOTE(review): `top_tweets_users_1000` is not defined anywhere in this
# notebook; presumably it is `top_tweets_users` for threshold 1000 (the
# biggest network) — confirm and define it before this cell.
for _idx, user in top_tweets_users_1000.iterrows():  # biggest network
    # At most 1000 followed accounts are collected per user.
    friends = [
        {"id": u.id, "username": u.username}
        for u in tweepy.Paginator(
            client.get_users_following, id=user["id"], max_results=1000
        ).flatten(limit=1000)
    ]
    followings[user["username"]] = {"id": user["id"], "friends": friends}

Couldn't find program: 'false'


In [12]:
%%script false --no-raise-error
# Save the `followings` dictionary to disk with pickle so the API calls above
# do not have to be repeated.

with open("data/objects/followings", "wb") as f:
    pickle.dump(followings, f)

Couldn't find program: 'false'


In [13]:
# Restore the pickled `followings` dictionary from disk and display how many
# users it contains.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.

followings = {}
with open("data/objects/followings", "rb") as pickled_followings:
    followings = pickle.load(pickled_followings)
len(followings)

785

In [14]:
def friend_filter(ar, df):
    """Keep only the friends whose username occurs in ``df["username"]``.

    Args:
        ar: Iterable of dicts, each with a ``"username"`` key.
        df: DataFrame (or any mapping) whose ``"username"`` entry is an
            iterable of the usernames to keep.

    Returns:
        A list with the entries of ``ar``, in their original order, whose
        ``"username"`` appears in ``df["username"]``.
    """
    # Build the lookup once; the previous version rebuilt a list of all
    # usernames for every single element of `ar`.
    known_usernames = set(df["username"])
    return [friend for friend in ar if friend["username"] in known_usernames]

### Build edge list

In [15]:
def build_graph(edges_path, df, followings_filtered):
    """Write the follow relations in ``followings_filtered`` as a CSV edge list.

    The file gets a ``source,target`` header followed by one row per
    (user, friend) pair, both given as usernames.

    Args:
        edges_path: Path of the CSV file to create (overwritten if present).
        df: Unused; kept so existing callers keep working.
        followings_filtered: Mapping ``username -> {"friends": [...]}`` where
            every friend is a dict with a ``"username"`` key.
    """
    # newline="" is required by the csv module; without it the writer's
    # "\r\n" line endings are translated again on Windows ("\r\r\n").
    with open(edges_path, "w", encoding="UTF8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["source", "target"])
        writer.writerows(
            [str(username), str(friend["username"])]
            for username, d in followings_filtered.items()
            for friend in d["friends"]
        )

In [58]:
# Build one follower-network edge list per tweet-popularity threshold.
for threshold in [1000, 2000, 2200, 2500, 2800, 3000, 4000]:
    top_tweets = tweets_df.query("tweet_popularity>=@threshold")

    # Number of top tweets per user, most prolific authors first.
    # (A named variable is used instead of `_`, which would shadow IPython's
    # last-output variable.)
    tweets_per_user = top_tweets.groupby(["username"], as_index=False).size()
    top_tweets_usernames = tweets_per_user.sort_values(
        "size", ascending=False
    ).reset_index(drop=True)

    # Get the users associated with these tweets.
    top_tweets_users = top_tweets_usernames.merge(users_df, on="username", how="left")
    print(len(top_tweets))

    # Restrict the network to the authors of the top tweets at this threshold:
    # keep only those authors as nodes, and only friends who are themselves
    # such authors. The username lookup is built once per threshold rather
    # than once per entry of `followings`.
    top_usernames = set(top_tweets_users["username"])
    followings_filtered = {
        key: {
            "id": value["id"],
            "friends": friend_filter(value["friends"], top_tweets_users),
        }
        for key, value in followings.items()
        if key in top_usernames
    }

    edges_path = f"data/networks/network_edgelist_{threshold}.csv"
    build_graph(edges_path, top_tweets_users, followings_filtered)

2511
1024
905
745
627
557
331


In [32]:
# Re-load the edge list written for threshold 1000.
df = pd.read_csv("data/networks/network_edgelist_1000.csv")

In [37]:
# Remove self-loops (rows whose source equals their target) from the
# threshold-1000 edge list and write the result back to the same file.
edges_without_self_loops = df.query("source!=target").reset_index(drop=True)
edges_without_self_loops.to_csv(
    "data/networks/network_edgelist_1000.csv", index=False
)