In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
from twittercrawler.replies.components import SearchEngine, UserTweetStore
from twittercrawler.replies.query import TweetQuery

from twittercrawler.crawlers import RecursiveCrawler
# Smoke test for UserTweetStore persistence: build a query from a live tweet,
# record it in a store, save it, and check a second store instance loads the same state.
crawler = RecursiveCrawler()
crawler.authenticate("../api_key.json")
# Fetch one tweet in extended mode and wrap it in a TweetQuery.
status = crawler.twitter_api.show_status(id="1347260116932976643", tweet_mode="extended")
q = TweetQuery(status)
print(q)

# "alma" is the name given to the throw-away store used for this check.
store = UserTweetStore("alma")
# NOTE(review): the second argument is an id offset from q.since_id by a fixed
# 10000000 -- presumably a stand-in for "latest tweet id seen"; confirm against
# UserTweetStore.update.
store.update(q, q.since_id+10000000)
store.save()
# Not the last expression of the cell, so this line displays nothing in Jupyter.
store.user_intervals

# Round trip: a second instance with the same name must load identical intervals.
store_tmp = UserTweetStore("alma")
store_tmp.load()
assert store_tmp.user_intervals == store.user_intervals
# NOTE(review): presumably clears the state saved for "alma" -- confirm in UserTweetStore.reset.
store_tmp.reset()

In [None]:
from collections import deque

class ReplyCollector():
    """Collect the reply thread below a seed tweet by repeatedly executing queries.

    The collector keeps a queue of ``TweetQuery`` objects, starting with one
    query for the seed tweet. Each executed query yields replies; every new
    reply becomes a query of its own, so the thread is explored level by level.

    Args:
        engine: Object providing ``get_status(tweet_id)``, ``execute(query)``
            (returning ``(success, new_query, replies)``) and a ``store``
            attribute with ``reset()`` and ``save()``.
        tweet_id: Id of the seed tweet.
        min_engagement: Minimum ``query.engagements`` needed to execute a
            query whose age lies between the postpone and action limits.
        postpone_day_limit: Queries younger than this many days are postponed.
        action_day_limit: Queries at least this many days old are always executed.
        drop_day_limit: A follow-up query is re-queued only while its
            ``elapsed_days`` is below this limit.
        reset: When true, ``engine.store.reset()`` is called before collecting.
        reload_status: When true, tweet metrics are refreshed through
            ``engine.get_status`` before deciding on a query that has
            ``accessed_since_days > -1``.
    """

    def __init__(self, engine, tweet_id, min_engagement=5, postpone_day_limit=3, action_day_limit=5, drop_day_limit=7, reset=False, reload_status=True):
        self.engine = engine
        self.tweet_id = tweet_id
        self.reset = reset
        self.reload_status = reload_status
        self.min_engagement = min_engagement
        self.postpone_day_limit = postpone_day_limit
        self.action_day_limit = action_day_limit
        self.drop_day_limit = drop_day_limit
        if self.reset:
            self.engine.store.reset()
        # The seed tweet is fetched on construction (one API call) and is the
        # first element of the collected thread.
        self.seed_tweet = self.engine.get_status(self.tweet_id)
        self.tweet_thread = [self.seed_tweet]
        print(self.seed_tweet['full_text'])
        self._queue = deque([TweetQuery(self.seed_tweet)])
        # Ids of replies that already have a query in the queue (filled by run()).
        self.active_tweet_ids = []

    @property
    def queue(self):
        """The deque of pending queries."""
        return self._queue

    @property
    def size(self):
        """Number of queries currently in the queue."""
        return len(self.queue)

    @property
    def active_queries(self):
        """Queued queries whose priority is greater than zero."""
        return [q for q in self.queue if q.priority > 0]

    @property
    def status(self):
        """Progress summary as a dict.

        Side effect: the engine's store is saved every time this is read.
        """
        self.engine.store.save()
        return {
            "total_queries": self.size,
            "remaining queries": len(self.active_queries),
            "seed_tweet_id": self.tweet_id,
        }

    def _sort_queries(self):
        """Reorder the queue so that the highest-priority query is popped first."""
        self._queue = deque(sorted(self._queue, key=lambda x: x.priority, reverse=True))

    def _decide_execution(self, query):
        """Return True if ``query`` should be executed now, False to postpone it.

        Too-young queries are postponed, old-enough queries always run, and in
        between the decision depends on ``min_engagement``.
        """
        if query.elapsed_days < self.postpone_day_limit:
            return False
        if query.elapsed_days >= self.action_day_limit:
            # here we try to catch the beginning of each thread
            return True
        return query.engagements >= self.min_engagement

    def run(self, feedback_interval=10, max_requests=1000):
        """Process the queue until it is exhausted or a stop condition is hit.

        The loop stops when the queue is empty, when the query at the head has
        priority 0, when ``engine.execute`` reports failure, or after
        ``max_requests`` loop iterations (postponed queries count as well).
        After every executed query the thread collected so far is written to
        ``"<tweet_id>.pkl"`` in the working directory.

        Args:
            feedback_interval: Print the status every this many iterations.
            max_requests: Upper bound on the number of loop iterations.
        """
        i = 0
        while len(self.queue) > 0:
            query = self.queue.popleft()
            if query.priority == 0:
                # Nothing with positive priority is at the head: put it back and stop.
                self._queue.appendleft(query)
                break
            # NOTE(review): accessed_since_days > -1 presumably means "already
            # accessed before" -- confirm in TweetQuery.
            if self.reload_status and query.accessed_since_days > -1:
                new_status = self.engine.get_status(query.id)
                query.update_metrics(new_status)
            print(query)
            execute_now = self._decide_execution(query)
            if execute_now:
                success, new_query, replies = self.engine.execute(query)
                print(query.id,len(replies))
                self.tweet_thread += replies
                # Persist the partial result so an interrupted run loses nothing.
                df = pd.DataFrame(self.tweet_thread)
                df.to_pickle("%s.pkl" % self.tweet_id)
                for reply in replies:
                    q = TweetQuery(reply)
                    if q.id not in self.active_tweet_ids:
                        # Remember the id so a reply that is returned again by a
                        # later search is not queued a second time.
                        self.active_tweet_ids.append(q.id)
                        self._queue.append(q)
                if new_query.elapsed_days < self.drop_day_limit:
                    self._queue.append(new_query)
                self._sort_queries()
                if not success:
                    break
            else:
                # Postponed: keep the query for a later pass.
                self._queue.append(query)
            i += 1
            if i % feedback_interval == 0:
                print("\n### STATUS ###")
                print(self.status)
                print()
            if i >= max_requests:
                print("Exiting at %i executed queries!" % max_requests)
                # Actually leave the loop; without this the limit was only
                # announced and postponed queries could rotate forever.
                break
        print(self.status)

# TODO

- **export / load functions for store, collector etc.:**
   - store: save user intervals
   - collector: save tweet_thread
- write a few tests...
- include comet logging...

In [None]:
from twittercrawler.crawlers import RecursiveCrawler
# Shared objects for the collection cells below: an authenticated crawler,
# a tweet store named "test", and the search engine that combines the two.
crawler = RecursiveCrawler()
crawler.authenticate("../api_key.json")
store = UserTweetStore("test")
engine = SearchEngine(crawler, store)

In [None]:
# Ids of the seed tweets to collect reply threads for; the URL after each id
# is the author's note of where the tweet was found.
tweet_ids = [
    "1346181687600939009",#https://twitter.com/FogCityMidge/status/1346181687600939009
    "1346188735961038848",#https://twitter.com/ThePlumLineGS/status/1346188735961038848
    "1346241519179718656",#https://twitter.com/CNN/status/1346241519179718656
    "1346199928331251715",#https://twitter.com/jennafarmeruk/status/1346199928331251715
    "1346189795261886472",#https://twitter.com/BBCNews/status/1346189795261886472
    "1346149033279160323",#https://twitter.com/MattHancock/status/1346149033279160323
    "1346242405465530368",#https://twitter.com/TeaPainUSA/status/1346242405465530368 (new rumor)
    "1346106929140150273",#https://twitter.com/nbcwashington/status/1346106929140150273
    "1345927336345137152",#https://twitter.com/chipfranklin/status/1345927336345137152
]

In [None]:
import time
# Collect the reply thread of every seed tweet with the default thresholds.
# Each run is followed by a 300 second pause (presumably to stay within API
# rate limits -- TODO confirm).
for tweet_id in tweet_ids:
    collector = ReplyCollector(engine, tweet_id)
    collector.run()
    time.sleep(300)

# One-off collections with custom thresholds. reset=True makes the collector
# call engine.store.reset() before it starts.
# NOTE(review): the status id in the URL below (…334227259394) differs from the
# id passed to ReplyCollector (…116932976643) -- confirm which tweet is intended.
#https://twitter.com/hadleywickham/status/1347260334227259394
collector = ReplyCollector(engine, "1347260116932976643", reset=True, min_engagement=1, postpone_day_limit=0, action_day_limit=1)
collector.run()

#https://twitter.com/nicvadivelu/status/1347267259266318336/photo/1
collector = ReplyCollector(engine, "1347267259266318336", min_engagement=1, postpone_day_limit=0, action_day_limit=0, reset=True)
collector.run()

# Only this collection keeps the existing store content (reset=False).
#https://twitter.com/Dr2NisreenAlwan/status/1347833203080523776
collector = ReplyCollector(engine, "1347833203080523776", min_engagement=5, postpone_day_limit=0, action_day_limit=5, reset=False)
collector.run()

# NOTE(review): the two collectors below are constructed (which resets the
# store and fetches the seed tweet) but run() is never called -- confirm
# whether that is intentional.
#https://twitter.com/indepdubnrth/status/1347979792839274499
collector = ReplyCollector(engine, "1347979792839274499", min_engagement=5, postpone_day_limit=0, action_day_limit=5, reset=True)

#https://twitter.com/RandPaul/status/1347930653904556036
collector = ReplyCollector(engine, "1347930653904556036", min_engagement=5, postpone_day_limit=0, action_day_limit=5, reset=True)

### TODO: stop after executing a few queries...

In [None]:
# Load the thread of a single seed tweet; switch threads by (un)commenting an id.
# Pickle files execute code on load, so only read files written by this notebook.
tweet_id = "1347260116932976643"
#tweet_id = "1347267259266318336"
#tweet_id = "1347833203080523776"
#tweet_id = "1347979792839274499"
#tweet_id = "1347930653904556036"
thread_df = pd.read_pickle("%s.pkl" % tweet_id)

# Then replace it with the combination of every collected thread in the
# working directory. Match on the file extension (a substring test would also
# pick up names such as "x.pkl.bak") and sort the names, because os.listdir
# returns them in arbitrary order and the row order should be reproducible.
files = sorted(f for f in os.listdir(".") if f.endswith(".pkl"))
thread_df = pd.concat([pd.read_pickle(f) for f in files])
thread_df.shape

In [None]:
# Build two edge lists from the collected tweets:
#   edges    -- (tweet id, id of the tweet it replies to)
#   mentions -- (author user id, id of the user being replied to)
edges = []
mentions = []
for _, tw in thread_df.iterrows():
    parent_id = tw["in_reply_to_status_id_str"]
    # pd.notna rejects None as well as NaN; NaN can appear once frames with
    # differing columns have been concatenated, and "!= None" lets it through.
    if pd.notna(parent_id):
        edges.append((tw["id_str"], parent_id))
        mentions.append((tw["user"]["id_str"], tw["in_reply_to_user_id_str"]))

In [None]:
import networkx as nx
# Reply graph: one directed edge per (tweet, tweet it replies to) pair.
G = nx.DiGraph()
G.add_edges_from(edges)

# Mention graph: a multigraph, so repeated replies between the same two users
# stay as parallel edges. The result of add_edges_from is bound to `_` so it
# is not echoed as the cell output.
H = nx.MultiDiGraph()
_ = H.add_edges_from(mentions)

In [None]:
# Plot the reply graph G (tweet -> tweet it replies to) with small node markers.
nx.draw(G, node_size=10)

In [None]:
# Plot the mention multigraph H (replying user -> replied-to user) with small node markers.
nx.draw(H, node_size=10)

### Twitter search query: `covid vaccine min_replies:100 until:2021-01-05 since:2021-01-04`