In [None]:
! pip install twython
! pip install backoff

In [None]:
import logging
from glob import glob
from pathlib import Path
from time import sleep

import backoff
import pandas as pd
import yaml
from tqdm import tqdm
from twython import Twython, TwythonError, TwythonRateLimitError

In [None]:
# Parse the local credentials file; `safe_load` only builds plain Python
# objects, so nothing in the file can execute code while it is read.
with open('cred.yaml', mode='r') as cred_file:
    twitter_cred = yaml.safe_load(cred_file)

In [None]:
# Step 1: use the application key/secret to obtain an OAuth 2 access token.
app_key = twitter_cred['APP_KEY']
twitter_oauth = Twython(app_key, twitter_cred['APP_SECRET'], oauth_version=2)
ACCESS_TOKEN = twitter_oauth.obtain_access_token()

# Step 2: build the client used for every search request in this notebook.
t = Twython(app_key, access_token=ACCESS_TOKEN)

In [None]:
ids = [f.rstrip('.csv').lstrip('tweets/') for f in glob("tweets/*.csv")]
if ids:
    max_id = min([int(i) for i in ids])
else:
    max_id = None
max_id

In [None]:
# Count the rows already stored on disk: one row per tweet, summed over every
# saved batch file. Evaluates to 0 when no batch has been written yet.
crawled_tweets = sum(len(pd.read_csv(csv_path)) for csv_path in glob("tweets/*.csv"))
crawled_tweets

In [None]:
# Target number of tweets to collect; the crawl loop stops once
# `crawled_tweets` reaches this value.
max_tweets = 1_100_000

In [None]:
def _is_fatal_twitter_error(error):
    """Return True for errors that retrying cannot fix (HTTP 401 / 403)."""
    return getattr(error, 'error_code', None) in (401, 403)


# Retry failed requests with exponential backoff.
# * max_value caps a single wait at 15 minutes (the length of a Twitter
#   rate-limit window) so the delay cannot keep doubling without bound.
# * giveup re-raises immediately on rejected credentials / forbidden access
#   instead of retrying forever on an error that will never go away.
@backoff.on_exception(backoff.expo, (TwythonError, TwythonRateLimitError),
                      max_value=900, giveup=_is_fatal_twitter_error)
def get_tweets(query, max_id = None, lang = 'en', count = 100, tweet_mode='compat'):
    """Run one search request and return the matching tweets as a DataFrame.

    Parameters
    ----------
    query : str
        Search query, passed to the API as ``q``.
    max_id : int, optional
        Passed through as the search ``max_id`` (upper bound on tweet ids);
        ``None`` applies no bound.
    lang : str
        Language filter passed through to the search call.
    count : int
        Number of tweets requested per call.
    tweet_mode : str
        Passed through to the search call unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``'statuses'`` list. The frame is
        empty (and has no columns) when the search matched nothing.

    Raises
    ------
    TwythonError
        When the request fails with HTTP 401 or 403; other Twython errors are
        retried.
    """
    result = t.search(q=query, lang = lang, count=count, 
                      result_type='recent', max_id=max_id, tweet_mode=tweet_mode)

    return pd.DataFrame(result['statuses'])

In [None]:
logging.getLogger('backoff').addHandler(logging.StreamHandler())

In [None]:
# Page backwards through the search results, newest to oldest, writing every
# batch to its own CSV file until the target number of tweets is reached.

# `to_csv` does not create missing directories, so make sure the output
# directory exists before the first batch is written.
Path('tweets').mkdir(exist_ok=True)

# Clamp the progress-bar total at 0 in case the target was already exceeded.
with tqdm(total = max(max_tweets - crawled_tweets, 0)) as pbar:
    # Checking the target up front avoids an extra request when enough tweets
    # are already on disk.
    while crawled_tweets < max_tweets:
        tweets = get_tweets('china -filter:retweets', max_id=max_id, tweet_mode='extended')

        # An empty response has none of the expected columns (selecting them
        # would raise KeyError) and means there is nothing older to fetch.
        if tweets.empty:
            break

        tweets = tweets[['id', 'created_at', 'full_text']]
        # The file is named after the newest (largest) tweet id in the batch.
        tweets.to_csv('tweets/%s.csv' % tweets['id'].max(), index=False)

        crawled_tweets += len(tweets)
        pbar.update(len(tweets))

        # The search API treats max_id as inclusive ("less than or equal to"),
        # so step one below the oldest id just saved; otherwise that tweet
        # would be returned again at the top of the next batch.
        max_id = int(tweets['id'].min()) - 1

In [None]:
# Load every saved batch into memory: a list with one DataFrame per CSV file.
csv_files = list(map(pd.read_csv, glob("tweets/*.csv")))

In [None]:
# Total number of rows (tweets) across all loaded batches.
sum(len(frame) for frame in csv_files)