In [39]:
import os
import json
import datetime
import time

from dateutil import parser
import glob
import json
from dataclasses import dataclass, field
from dacite import from_dict, Config

# --- Notebook configuration -------------------------------------------------
# File holding the OAuth credentials (read via TwitterOAuth.read_file).
CREDENTIALS_FILE = "./creds.txt"
# Directory whose files are loaded line by line into User objects.
USERDIR = "datasets/users"
# JSON-lines files that accumulate the results of the user / tweet lookups.
QUERY_RESULTS = "queried_users.jsonl"
TWEET_QUERY_RESULTS = "queried_users_tweets.jsonl"
MAX_USERS = 5000

# When True, every path above is switched to its "control" counterpart.
CONTROL = True

if CONTROL:
    USERDIR += "-control"
    QUERY_RESULTS = "control_" + QUERY_RESULTS
    TWEET_QUERY_RESULTS = "control_" + TWEET_QUERY_RESULTS
# TT_hongkong.nd json
# First Tweet retrieved Friday, August 9, 2019 8:33:46 PM EST
# Result_type: recent

In [40]:
@dataclass
class Tweet:
    """Typed record for one tweet.

    Instances are created by ``dacite.from_dict`` as elements of
    ``User.tweets`` when a user line is parsed; every field must therefore be
    present in the source JSON with the annotated type.
    """
    id: int  # collected into the `tweets` set and used as the lookup key for API queries
    text: str
    created_at: str  # kept as a raw string; not parsed into a datetime here
    lang: str
    source: str
    retweeted: bool

@dataclass
class User:
    """Typed record for one user, built from a JSON line via ``dacite.from_dict``.

    ``tweets`` defaults to an empty list, so it is the only field that may be
    absent from the source JSON.
    """
    id: int  # key of the `users` dict; records sharing an id have their tweets merged
    screen_name: str
    name: str
    description: str
    location: str
    # Per-instance list (default_factory) so users never share one list object.
    tweets: list[Tweet] = field(default_factory=list)

In [41]:
# Load every file in USERDIR (one JSON object per line) into:
#   users  - dict mapping user id -> User, with tweets merged across duplicate records
#   tweets - set of every tweet id seen in any user record
users = {}
tweets = set()
# sorted() makes the load order (and thus the merged tweet order) reproducible;
# glob.glob() returns matches in arbitrary, filesystem-dependent order.
for filename in sorted(glob.glob(f"{USERDIR}/*")):
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank or trailing empty line would make json.loads raise.
                continue
            userdata = from_dict(data_class=User, data=json.loads(line))
            if userdata.id not in users:
                users[userdata.id] = userdata
            else:
                # Same user seen again: keep the first record, append the new tweets.
                users[userdata.id].tweets += userdata.tweets
            tweets.update(tweet.id for tweet in userdata.tweets)

# Twitter API

In [42]:
# Load Twitter API OAuth and other details

from TwitterAPI import TwitterAPI, TwitterOAuth

# Read the OAuth credentials from CREDENTIALS_FILE and build the client that
# every query cell below uses (requests go to API version "2").
o = TwitterOAuth.read_file(CREDENTIALS_FILE)
api = TwitterAPI(
    o.consumer_key,
    o.consumer_secret,
    o.access_token_key,
    o.access_token_secret,
    api_version="2",
)



def batch(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    ``l`` must support ``len()`` and slicing. The final slice is shorter when
    ``len(l)`` is not a multiple of ``n``; an empty ``l`` yields nothing.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

## Tweet Queries

Here, we batch-query Tweet IDs to see if they still exist. We skip Tweet IDs for which we know the Tweeter either has a protected or non-existent account.

In [49]:
# Make sure we don't repeat queries to IDs we have already queried Twitter's API for!
# (Loop variables deliberately avoid the name `id`: rebinding it at notebook
# scope would shadow the builtin for every later cell in the kernel.)
if os.path.exists(TWEET_QUERY_RESULTS):
    with open(TWEET_QUERY_RESULTS, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines would make json.loads raise.
                continue
            # discard() is a no-op when the id is not (or no longer) pending.
            tweets.discard(json.loads(line)["id"])
# Don't query tweets w/ deleted or protected users
if os.path.exists(QUERY_RESULTS):
    with open(QUERY_RESULTS, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            datum = json.loads(line)
            known_user = users.get(datum["id"])
            if known_user is None:
                # Result belongs to a user that is not in the loaded data set.
                continue
            # Short-circuit matters: "protected" is only read when the user was found.
            if not datum["found"] or datum["protected"]:
                tweets.difference_update(tweet.id for tweet in known_user.tweets)
# Number of tweet ids that still need to be queried.
print(len(tweets))

0


In [44]:
# Look up the pending tweet ids in batches of 100 per request.
# tweet_responses maps tweet id (int) -> result dict; ids the API does not
# return for their batch are recorded as {"found": False, ...}.
tweet_responses = {}
for batch_tweets in batch(list(tweets), 100):
    ids = ",".join(str(twt_id) for twt_id in batch_tweets)
    params = {
        "ids": ids,
        "tweet.fields": "id,author_id,withheld"
    }
    r = api.request("tweets", params)
    for item in r:
        item["queried_time"] = str(datetime.datetime.now())
        item["found"] = True
        tweet_responses[int(item["id"])] = item
    for twt_id in batch_tweets:
        if twt_id not in tweet_responses:
            tweet_responses[twt_id] = {"found": False, "queried_time": str(datetime.datetime.now())}
    remaining = r.get_quota()["remaining"]
    print(remaining)
    # `remaining` is presumably None when the response carries no rate-limit
    # header (TODO confirm against TwitterAPI); guard so the comparison cannot
    # raise TypeError.
    if remaining is not None and remaining < 1:
        print("Ran into quota, exiting for now")
        # Actually stop: the original only printed the message and kept
        # sending requests after the quota was exhausted.
        break


589
588
587
586
585
584
583
582
581
580
579
578
577
576
575
574
573
572
571
570
569
568
567
566
565
564
563
562
561
560


In [46]:
# Append one JSON object per queried tweet to TWEET_QUERY_RESULTS.
# The integer tweet id is stored under "id" (replacing whatever value the
# response dict already held for that key) before the record is serialised.
with open(TWEET_QUERY_RESULTS, "a") as f:
    for tweet_id, record in tweet_responses.items():
        record["id"] = tweet_id
        f.write(json.dumps(record) + "\n")

## User Queries

Here, we batch-query user IDs to see if the accounts still exist. We skip user IDs for which a result has already been saved to the query-results file.

In [7]:
# Make sure we don't repeat queries to IDs we have already queried Twitter's API for!
# Every user id that already has a line in QUERY_RESULTS is dropped from `users`.
if os.path.exists(QUERY_RESULTS):
    with open(QUERY_RESULTS, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines would make json.loads raise.
                continue
            # pop(..., None) is a no-op for ids that are not (or no longer) in
            # `users`; it also avoids rebinding the builtin name `id`.
            users.pop(json.loads(line)["id"], None)

TypeError: 'User' object is not subscriptable

In [32]:
# Disabled: optional random down-sampling of `users` to 1000 entries.
# The commented-out code below is not executed; as written it would also
# rebind `users` to the sampled subset.
# import random
# choose_users = {}
# if len(users) > 1000:
#     chosen = random.sample(list(users.keys()), k=1000)
#     users = {key: users[key] for key in chosen}
# Number of entries currently held in `users`.
print(len(users))

2090


In [34]:

# Send user queries to Twitter in batches of 100.
# user_responses maps user id (int) -> the API's item for that user, or the
# sentinel string "Not found" when the id is missing from its batch response
# (the cell that writes QUERY_RESULTS relies on that sentinel).

userids = list(users.keys())
user_responses = {}

for id_batch in batch(userids, 100):
    # Loop variables avoid the name `id` so the builtin is not shadowed for
    # later cells in the same kernel.
    ids = ",".join(str(user_id) for user_id in id_batch)
    params = {
        "ids": ids,
        "user.fields": "id,description,location,name,protected,verified,withheld,username"
    }
    r = api.request("users", params)
    for item in r:
        user_responses[int(item["id"])] = item
    for user_id in id_batch:
        if user_id not in user_responses:
            user_responses[user_id] = "Not found"
    quota = r.get_quota()
    print(quota)
    # Stop once the rate limit is used up, matching the tweet-query loop.
    # `remaining` may be None (presumably when no rate-limit header was
    # returned - TODO confirm), hence the explicit None check.
    if quota["remaining"] is not None and quota["remaining"] < 1:
        print("Ran into quota, exiting for now")
        break
    # Brief pause between consecutive requests.
    time.sleep(1)


{'remaining': 889, 'limit': None, 'reset': None}
{'remaining': 888, 'limit': None, 'reset': None}
{'remaining': 887, 'limit': None, 'reset': None}
{'remaining': 886, 'limit': None, 'reset': None}
{'remaining': 885, 'limit': None, 'reset': None}
{'remaining': 884, 'limit': None, 'reset': None}
{'remaining': 883, 'limit': None, 'reset': None}
{'remaining': 882, 'limit': None, 'reset': None}
{'remaining': 881, 'limit': None, 'reset': None}
{'remaining': 880, 'limit': None, 'reset': None}
{'remaining': 879, 'limit': None, 'reset': None}
{'remaining': 878, 'limit': None, 'reset': None}
{'remaining': 877, 'limit': None, 'reset': None}
{'remaining': 876, 'limit': None, 'reset': None}
{'remaining': 875, 'limit': None, 'reset': None}
{'remaining': 874, 'limit': None, 'reset': None}
{'remaining': 873, 'limit': None, 'reset': None}
{'remaining': 872, 'limit': None, 'reset': None}
{'remaining': 871, 'limit': None, 'reset': None}
{'remaining': 870, 'limit': None, 'reset': None}
{'remaining': 869, '

In [35]:
# Append any new results to QUERY_RESULTS.
# Each line is one JSON object carrying "found", the integer user "id" and the
# time the record was written; found users also keep every field the API returned.

with open(QUERY_RESULTS, "a") as f:
    for user_id, response in user_responses.items():
        was_found = response != "Not found"
        # Found users reuse the API's dict; missing users start from an empty record.
        record = response if was_found else {}
        record["found"] = was_found
        record["id"] = user_id
        record["queried_time"] = str(datetime.datetime.now())
        f.write(json.dumps(record) + "\n")

'2021-12-20 22:59:14.417289'