In [1]:
import requests

In [2]:
import json

def dump_jsonl(data, output_path, append=False, progress=False):
    """
    Write an iterable of objects to a JSON lines file.

    Args:
        data: iterable of JSON-serialisable objects; one line is written per
            item. Generators are accepted.
        output_path: path of the file to write.
        append: when True the file is opened in 'a+' mode, otherwise it is
            overwritten.
        progress: when True the iteration is wrapped in a tqdm progress bar.
    """
    if progress:
        # Imported locally so this helper does not depend on a later cell
        # having imported tqdm.
        from tqdm import tqdm
        data = tqdm(data)

    mode = 'a+' if append else 'w'
    n_written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
            n_written += 1
    # Count while writing: len(data) is not defined for generators.
    print('Wrote {} records to {}'.format(n_written, output_path))

def load_jsonl(input_path, verbose=True, progress=False) -> list:
    """
    Read a list of objects from a JSON lines file.

    Args:
        input_path: path of the file to read.
        verbose: when True, print how many records were loaded.
        progress: when True the line iteration is wrapped in a tqdm progress bar.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f
        if progress:
            # Imported locally so this helper does not depend on a later cell
            # having imported tqdm.
            from tqdm import tqdm
            lines = tqdm(f)

        for line in lines:
            # Strip surrounding whitespace only, and skip blank lines, which
            # json.loads would reject.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))

    if verbose:
        print('Loaded {} records from {}'.format(len(data), input_path))

    return data


In [3]:
# !pip install tweepy

In [4]:
import tweepy

# Credentials are read from the environment so real secrets never have to be
# saved in the notebook. The "*" placeholder is kept as the fallback value.
import os

consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "*")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "*")
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "*")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "*")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "*")

In [5]:
# Build a tweepy client from the bearer token alone. `client` is not referenced
# again in the cells visible here.
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

# Load conversations

In [6]:
# Each record is later read as record["data"], a list of tweets that carry a
# "conversation_id" field.
conversations = load_jsonl("twarc2_conversation.jsonl")

Loaded 3391 records from twarc2_conversation.jsonl


In [7]:
def get_tweet_graph(conversation):
    """
    Build a parent -> children adjacency map for one conversation.

    Args:
        conversation: iterable of tweet dicts. Each has an "id" and, optionally,
            a "referenced_tweets" list of dicts with an "id" key.

    Returns:
        A defaultdict(list) mapping each referenced (parent) tweet id to the
        ids of the tweets that reference it, in input order.
    """
    # Imported locally so the function works before the later cell that
    # imports defaultdict has been run.
    from collections import defaultdict

    tweet_graph = defaultdict(list)

    for tweet in conversation:
        # Tweets that reference nothing add no edges.
        # Every reference type is kept as an edge; a filter that skipped
        # "quoted" references was left disabled in the original code.
        for parent_tweet in tweet.get("referenced_tweets") or []:
            tweet_graph[parent_tweet["id"]].append(tweet["id"])

    return tweet_graph

In [8]:
# Plain string literal: the original f-string had no placeholders.
tweets = load_jsonl("conversations/tweets_v2.jsonl")

Loaded 185750 records from conversations/tweets_v2.jsonl


In [9]:
# Index every fetched record by tweet id so paths of ids can be resolved to
# tweet payloads later.
id2tweet = {}
for tweet in tweets:
    # Skip null records and empty containers; they carry no id to index by.
    if not tweet:
        continue

    # A list record is keyed by the "value" field of its first element
    # (presumably an API error payload for an unavailable tweet; TODO confirm).
    if isinstance(tweet, list):
        id2tweet[tweet[0]["value"]] = tweet[0]
        continue

    # Records without an "id" are keyed by their "resource_id" instead.
    if "id" not in tweet:
        id2tweet[tweet["resource_id"]] = tweet
        continue

    id2tweet[tweet["id"]] = tweet
    # Tweets embedded under expansions are indexed too. A missing
    # "expansions" or "tweets" key is treated as "no embedded tweets".
    for expansion_tweet in (tweet.get("expansions") or {}).get("tweets") or []:
        id2tweet[expansion_tweet["id"]] = expansion_tweet


## Build Tweet graph

In [10]:
def get_path(curr, tweet_graph):
    """
    List every path from ``curr`` down to a leaf of ``tweet_graph``.

    Args:
        curr: id of the node to start from.
        tweet_graph: mapping of node id -> list of child node ids.

    Returns:
        A list of strings, one per root-to-leaf path, with the ids joined
        by " > ". A node with no children yields a single one-id path.
    """
    # Membership test first so a defaultdict is not grown by the lookup.
    children = tweet_graph[curr] if curr in tweet_graph else []
    if not children:
        return [f"{curr}"]

    return [
        f"{curr} > {tail}"
        for child in children
        for tail in get_path(child, tweet_graph)
    ]


In [11]:
from collections import defaultdict

# Walk every conversation, enumerate its root-to-leaf reply paths, and keep
# those of at most 10 tweets whose participant count is not exactly one.
n_path = 0
n_skip = 0

path_depth = []
user_count = []
selected_paths = []
for conv in conversations:
    tweet_graph = get_tweet_graph(conv["data"])
    # The conversation id is used as the root node of the paths.
    paths = get_path(conv["data"][0]["conversation_id"], tweet_graph)

    for p in paths:
        n_path += 1
        tweetids = p.split(" > ")

        # Collect the distinct authors on this path. Tweets that are missing
        # or null count as "deleted"; those lacking an author as "unknown".
        user_ids_in_conv = set()
        for twid in tweetids:
            record = id2tweet.get(twid)
            if record is None:
                author = "deleted"
            elif "author_id" not in record:
                author = "unknown"
            else:
                author = record["author_id"]
            user_ids_in_conv.add(author)

        # "unknown" is not counted as a participant, but "deleted" is.
        nuser = len(user_ids_in_conv)
        if "unknown" in user_ids_in_conv:
            nuser -= 1

        # Drop single-participant paths and paths deeper than 10 tweets.
        if nuser == 1 or len(tweetids) > 10:
            n_skip += 1
            continue

        selected_paths.append(tweetids)
        path_depth.append(len(tweetids))
        user_count.append(nuser)

# Export for annotations

In [12]:
import random
import re

def htmlspecialchars(text):
    """
    Decode the HTML entities &quot;, &lt;, &gt; and &amp; in ``text``.

    Despite its name, this function unescapes rather than escapes.

    Args:
        text: string that may contain the four entities above.

    Returns:
        The string with each entity replaced by its literal character.
    """
    return (
        text.replace("&quot;", '"')
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        # "&amp;" is decoded last so that an escaped entity such as
        # "&amp;lt;" becomes "&lt;" instead of being decoded twice to "<".
        .replace("&amp;", "&")
    )

# Matches @mentions and URLs. A raw string keeps the regex escapes (\w, \/, \S)
# from being read as invalid Python string escapes. Compiled once rather than
# once per tweet.
MENTION_OR_URL = re.compile(r"(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)")

rows = []
for path in selected_paths:
    # Only paths with at least three tweets are exported.
    if len(path) <= 2:
        continue

    text = []     # (author_id, tweet text) per tweet, in path order
    userid = {}   # dict used as an insertion-ordered set of author ids
    skip = False

    for twid in path:
        tweet = id2tweet.get(twid)
        # A path is dropped as soon as one tweet is missing, null, or has
        # no author.
        if tweet is None or "author_id" not in tweet:
            skip = True
            break

        userid[tweet["author_id"]] = True
        text.append((tweet["author_id"], tweet["text"]))

    if skip:
        continue

    # Keep dialogues with at most two distinct authors.
    if len(userid) > 2:
        continue

    # Authors are anonymised as "A", "B" in order of first appearance.
    userid2name = {"unknown": "???"}
    for idx, u in enumerate(userid):
        userid2name[u] = chr(ord("A") + idx)

    # Skip paths whose first two tweets come from the same author.
    if text[0][0] == text[1][0]:
        continue

    s = ""
    for t in text:
        # Remove mentions and URLs, collapse whitespace, then decode entities.
        tt = ' '.join(MENTION_OR_URL.sub(" ", t[1]).split())
        tt = htmlspecialchars(tt)
        s += f"{userid2name[t[0]]}: {tt}\n\n"

    rows.append({
        "text": s,
        "label": [],
        "tweet": f"https://twitter.com/i/web/status/{path[-1]}",
        # NOTE: a set is not JSON-serialisable; convert it (e.g. to a sorted
        # list) before passing these rows to json.dumps.
        "user": set([t[0] for t in text])
    })

# Fixed seed so the shuffled order is the same on every run.
random.Random(41).shuffle(rows)

In [13]:
# Number of conversation paths kept for annotation.
len(rows)

2907

In [14]:
# Spot-check one exported row. The output includes real user ids and tweet
# text, so review it before sharing the notebook.
rows[50]

{'text': 'A: ใครอารมณ์ดี มาดูคลิปนี้ ไม่ถึงนาที คิ้วขมวดแบบอยากต่อยกำแพง\n\nB: \n\nB: \n\nB: \n\n',
 'label': [],
 'tweet': 'https://twitter.com/i/web/status/1583442127739973632',
 'user': {'423283880', '999556736854867968'}}

# Fetch Followers 

In [62]:
# Distinct author ids across the first 1050 shuffled rows.
userids = {uid for row in rows[:1050] for uid in row["user"]}

In [63]:
from os import listdir
from os.path import isfile, join

# Users whose followings were already fetched: one "<uid>.json" file each.
path = "./followings/"
users = {
    f[:-len(".json")]: True
    for f in listdir(path)
    if f.endswith(".json") and isfile(join(path, f))
}

In [64]:
# following => accounts that the user follows
# follower  => accounts that follow the user

In [67]:
# Ratio of users with a followings file on disk to users needed for the
# selected rows. `users` is not restricted to ids in `userids`, so this is
# only an approximate coverage figure.
len(users)/len(userids)

0.9985683607730852

In [68]:
import os
from tqdm import tqdm

# Queue the twarc2 commands for every user who has no followings file yet.
# The commands are only collected here, not executed.
cmd = []
for uid in tqdm(userids, total=len(userids)):
    if uid not in users:
        cmd.extend([
            f'twarc2 following --limit 500 {uid} followings/{uid}.json',
            f'twarc2 followers --limit 10 {uid} followers/{uid}.json',
        ])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1397/1397 [00:00<00:00, 687808.74it/s]


In [69]:
# Write the queued commands to a bash script, one command per line.
with open("get_user_info.sh", "w") as script:
    script.write("#!/bin/bash\n")
    script.writelines(c+"\n" for c in cmd)

In [70]:
# Number of users that already have a followings file on disk.
len(users)

1395