In [1]:
import os
import json
import datetime
import time

from dateutil import parser
import glob
import json
from dataclasses import dataclass, field
from dacite import from_dict, Config
from typing import Optional

# --- Notebook configuration -------------------------------------------------
# CONTROL switches every path below to its "control" variant: the user
# directory gains a "-control" suffix and both result files gain a
# "control_" prefix.
CONTROL = True
MAX_USERS = 5000
CREDENTIALS_FILE = "../creds.txt"

USERDIR = "../datasets/users/ia" + ("-control" if CONTROL else "")
QUERY_RESULTS = ("control_" if CONTROL else "") + "queried_users_ia.jsonl"
TWEET_QUERY_RESULTS = ("control_" if CONTROL else "") + "queried_users_tweets_ia.jsonl"
# TT_hongkong.nd json
# First Tweet retrieved Friday, August 9, 2019 8:33:46 PM EST
# Result_type: recent

In [2]:
@dataclass
class Tweet:
    """One tweet record nested under a user in the dataset files.

    Instances are created by dacite's ``from_dict`` while a ``User`` is being
    loaded (they populate ``User.tweets``), so the field names here have to
    match the keys of the stored JSON objects.
    """
    id: int  # tweet ID; these are what the tweet lookup cells query
    text: str
    created_at: str  # kept as a raw string; the timestamp format is not shown in this notebook
    lang: str
    source: str
    retweeted: bool

@dataclass
class User:
    """One user record, built from a JSON line of a dataset file via dacite.

    The loading cell merges records that share the same ``id`` by
    concatenating their ``tweets`` lists.
    """
    id: int  # user ID; used as the key of the ``users`` dict
    screen_name: str
    name: str
    description: Optional[str]
    # NOTE(review): unlike ``description`` this is not Optional — presumably the
    # dataset never stores a null location; confirm against the data.
    location: str
    # A default factory gives each user its own list (no shared mutable default).
    tweets: list[Tweet] = field(default_factory=list)

In [3]:
# Parse every file under USERDIR as JSON lines of User records.
# `users` maps user id -> User (records for the same id have their tweet
# lists concatenated); `tweets` collects every tweet id that was seen.
users = {}
tweets = set()
for path in glob.glob(f"{USERDIR}/*"):
    with open(path, "r") as handle:
        for raw_line in handle:
            record = from_dict(data_class=User, data=json.loads(raw_line.strip()))
            known = users.get(record.id)
            if known is None:
                users[record.id] = record
            else:
                known.tweets += record.tweets
            tweets.update(tweet.id for tweet in record.tweets)

In [4]:
# Number of distinct user ids loaded from USERDIR.
len(users)

11610

# Twitter API

In [8]:
# Build the Twitter API client (API version 2) from the OAuth credentials
# stored in CREDENTIALS_FILE.

from TwitterAPI import TwitterAPI, TwitterOAuth

o = TwitterOAuth.read_file(CREDENTIALS_FILE)
api = TwitterAPI(
    o.consumer_key,
    o.consumer_secret,
    o.access_token_key,
    o.access_token_secret,
    api_version="2",
)


def batch(l, n):
    """Yield consecutive slices of ``l`` that each hold at most ``n`` items.

    Args:
        l: A sliceable sequence with a length (e.g. a list).
        n: Maximum number of items per slice; must be at least 1.

    Yields:
        ``l[0:n]``, ``l[n:2*n]``, ... in order; the last slice may be shorter.
        Nothing is yielded for an empty sequence.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly.
    if n < 1:
        raise ValueError(f"batch size must be a positive integer, got {n!r}")
    for start in range(0, len(l), n):
        yield l[start:start + n]

## Tweet Queries

Here, we batch-query Tweet IDs to see if they still exist. We skip Tweet IDs for which we know the Tweeter either has a protected or non-existent account.

In [11]:
# Drop tweet IDs that already have a line in TWEET_QUERY_RESULTS, so the
# API is never asked about the same tweet twice.
if os.path.exists(TWEET_QUERY_RESULTS):
    with open(TWEET_QUERY_RESULTS, "r") as results_file:
        for raw_line in results_file:
            tweets.discard(json.loads(raw_line.strip())["id"])

# Drop the tweets of users whose account lookup came back as not found or
# protected.
# NOTE(review): this looks the author up in `users`, but the user
# de-duplication cell deletes already-queried ids from `users`. If that cell
# runs first while QUERY_RESULTS exists, those users are skipped here —
# confirm the intended execution order.
if os.path.exists(QUERY_RESULTS):
    with open(QUERY_RESULTS, "r") as results_file:
        for raw_line in results_file:
            user_record = json.loads(raw_line.strip())
            author = users.get(user_record["id"])
            if author is None:
                continue
            # Account exists and is public: keep its tweets in the query set.
            if user_record["found"] and not user_record["protected"]:
                continue
            for authored in author.tweets:
                tweets.discard(authored.id)
print(len(tweets))

43030


In [12]:
# Look up the remaining tweet IDs in batches of 100.
# Every ID in a processed batch ends up in `tweet_responses`: IDs present in
# the response are stored with found=True, all others with found=False.
tweet_responses = {}
for batch_tweets in batch(list(tweets), 100):
    ids = ",".join(str(twt_id) for twt_id in batch_tweets)
    params = {
        "ids": ids,
        "tweet.fields": "id,author_id,withheld"
    }
    r = api.request("tweets", params)
    for item in r:
        item["queried_time"] = str(datetime.datetime.now())
        item["found"] = True
        # Response ids are converted with int() so they match the int tweet ids.
        tweet_responses[int(item["id"])] = item
    for twt_id in batch_tweets:
        if twt_id not in tweet_responses:
            tweet_responses[twt_id] = {"found": False, "queried_time": str(datetime.datetime.now())}
    remaining = r.get_quota()["remaining"]
    print(remaining)
    # get_quota() may report None for "remaining" (see the TwitterAPI docs);
    # guard so the comparison cannot raise TypeError.
    if remaining is not None and remaining < 1:
        print("Ran into quota, exiting for now")
        # Actually stop: batches not reached stay out of `tweet_responses`, so
        # they are not written out and remain eligible for a later run.
        break


899
898
897
896
895
894
893
892
891
890
889
888
887
886
885
884
883
882
881
880
879
878
877
876
875
874
873
872
871
870
869
868
867
866
865
864
863
862
861
860
859
858
857
856
855
854
853
852
851
850
849
848
847
846
845
844
843
842
841
840
839
838
837
836
835
834
833
832
831
830
829
828
827
826
825
824
823
822
821
820
819
818
817
816
815
814
813
812
811
810
809
808
807
806
805
804
803
802
801
800
799
798
797
796
795
794
793
792
791
790
789
788
787
786
785
784
783
782
781
780
779
778
777
776
775
774
773
772
771
770
769
768
767
766
765
764
763
762
761
760
759
758
757
756
755
754
753
752
751
750
749
748
747
746
745
744
743
742
741
740
739
738
737
736
735
734
733
732
731
730
729
728
727
726
725
724
723
722
721
720
719
718
717
716
715
714
713
712
711
710
709
708
707
706
705
704
703
702
701
700
699
698
697
696
695
694
693
692
691
690
689
688
687
686
685
684
683
682
681
680
679
678
677
676
675
674
673
672
671
670
669
668
667
666
665
664
663
662
661
660
659
658
657
656
655
654
653
652
651
650


In [13]:
# Append this run's tweet lookups to TWEET_QUERY_RESULTS, one JSON object per
# line. Re-running the cell appends the same records again.
with open(TWEET_QUERY_RESULTS, "a") as out_file:
    for tweet_id, record in tweet_responses.items():
        # Store the id from the dict key (an int), replacing whatever "id"
        # value the response item carried.
        record["id"] = tweet_id
        out_file.write(json.dumps(record) + "\n")
TWEET_QUERY_RESULTS

'control_queried_users_tweets_ia.jsonl'

## User Queries

Here, we batch-query user IDs to see if the accounts still exist. We skip user IDs whose results were already recorded in the user query results file by an earlier run.

In [5]:
# Remove every user that already has a line in QUERY_RESULTS, so the API is
# not queried twice for the same account.
# NOTE(review): this prunes `users` in place, and the tweet filter cell relies
# on `users` to find the tweets of not-found/protected accounts — confirm the
# intended execution order of the two cells.
if os.path.exists(QUERY_RESULTS):
    with open(QUERY_RESULTS, "r") as results_file:
        for raw_line in results_file:
            users.pop(json.loads(raw_line.strip())["id"], None)

In [6]:
# Optional down-sampling to 1000 randomly chosen users. Currently disabled:
# the block below is commented out, so every remaining user is queried.
# import random
# choose_users = {}
# if len(users) > 1000:
#     chosen = random.sample(list(users.keys()), k=1000)
#     users = {key: users[key] for key in chosen}
# Number of users that the query cell will send to the API.
print(len(users))

11610


In [9]:

# Look up every remaining user id through the "users" endpoint, 100 ids per
# request. `user_responses` maps user id -> response item, or the sentinel
# string "Not found" for ids missing from the response.

userids = list(users.keys())
user_responses = {}

for id_batch in batch(userids, 100):
    ids = ",".join(map(str, id_batch))
    params = {
        "ids": ids,
        "user.fields": "id,description,location,name,protected,verified,withheld,username"
    }
    r = api.request("users", params)
    for item in r:
        # Response ids are converted with int() so they match the int user ids.
        user_responses[int(item["id"])] = item
    for user_id in id_batch:
        user_responses.setdefault(user_id, "Not found")
    print(r.get_quota())
    # Pause for a second between consecutive requests.
    time.sleep(1)


{'remaining': 899, 'limit': None, 'reset': None}
{'remaining': 898, 'limit': None, 'reset': None}
{'remaining': 897, 'limit': None, 'reset': None}
{'remaining': 896, 'limit': None, 'reset': None}
{'remaining': 895, 'limit': None, 'reset': None}
{'remaining': 894, 'limit': None, 'reset': None}
{'remaining': 893, 'limit': None, 'reset': None}
{'remaining': 892, 'limit': None, 'reset': None}
{'remaining': 891, 'limit': None, 'reset': None}
{'remaining': 890, 'limit': None, 'reset': None}
{'remaining': 889, 'limit': None, 'reset': None}
{'remaining': 888, 'limit': None, 'reset': None}
{'remaining': 887, 'limit': None, 'reset': None}
{'remaining': 886, 'limit': None, 'reset': None}
{'remaining': 885, 'limit': None, 'reset': None}
{'remaining': 884, 'limit': None, 'reset': None}
{'remaining': 883, 'limit': None, 'reset': None}
{'remaining': 882, 'limit': None, 'reset': None}
{'remaining': 881, 'limit': None, 'reset': None}
{'remaining': 880, 'limit': None, 'reset': None}
{'remaining': 879, '

In [10]:
# Append this run's user lookups to QUERY_RESULTS, one JSON object per line.
# Note that "queried_time" is stamped here, when the record is written, not
# when the request was made.

with open(QUERY_RESULTS, "a") as out_file:
    for user_id, payload in user_responses.items():
        was_found = payload != "Not found"
        # Found users keep their response item (updated in place); missing
        # users get a fresh record.
        record = payload if was_found else {}
        record["found"] = was_found
        record["id"] = user_id
        record["queried_time"] = str(datetime.datetime.now())
        out_file.write(json.dumps(record) + "\n")

'2021-12-20 22:59:14.417289'