In [1]:
! pip install twython
! pip install backoff

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import logging
from glob import glob
from pathlib import Path
from time import sleep

import backoff
import pandas as pd
import yaml
from tqdm import tqdm
from twython import Twython, TwythonError, TwythonRateLimitError

In [3]:
# Read the Twitter API credentials from a local YAML file so that no secret
# is hardcoded in the notebook. `safe_load` only builds plain Python objects.
with open('cred.yaml', 'r') as cred_file:
    twitter_cred = yaml.safe_load(cred_file)

In [4]:
# OAuth 2 authentication: use the app key and secret to obtain an access
# token, then build the client `t` from the app key plus that token.
app_key = twitter_cred['APP_KEY']
app_secret = twitter_cred['APP_SECRET']

twitter_oauth = Twython(app_key, app_secret, oauth_version=2)
ACCESS_TOKEN = twitter_oauth.obtain_access_token()

t = Twython(app_key, access_token=ACCESS_TOKEN)

In [5]:
# Work out where a previous crawl stopped. Every saved batch is a CSV whose
# file name (without extension) is a tweet id, so the smallest file name is
# the oldest point reached so far. `Path(...).stem` extracts that name
# reliably; `str.rstrip`/`str.lstrip` strip *sets of characters*, not a
# suffix/prefix, and break as soon as the path separator is not '/'.
# NOTE(review): the crawl cell names each file after the batch's largest id,
# so resuming from this value appears to request the oldest saved batch
# again - confirm whether that overlap is intended.
ids = [int(Path(f).stem) for f in glob("tweets/*.csv")]
# No saved batches yet means a fresh crawl: leave `max_id` unset.
max_id = min(ids) if ids else None
max_id

1588846925738315776

In [6]:
# Count the tweets already on disk by summing the row counts of all saved
# batch files (0 when nothing has been saved yet).
crawled_tweets = sum(len(pd.read_csv(csv_path)) for csv_path in glob("tweets/*.csv"))
crawled_tweets

958589

In [7]:
# Target number of tweets to collect in total; the crawl loop stops once
# `crawled_tweets` reaches this value.
max_tweets = 1_100_000

In [8]:
def _is_permanent_error(exc):
    """Tell backoff whether retrying `exc` is pointless.

    Bad requests and authentication/authorisation failures will fail the same
    way on every attempt, so retrying them would loop forever. Rate-limit
    responses (429) and errors without a status code are not in this set and
    keep being retried.
    """
    # `getattr` with a default keeps this safe for exceptions that carry no
    # HTTP status code.
    return getattr(exc, 'error_code', None) in (400, 401, 403, 404)


@backoff.on_exception(backoff.expo, (TwythonError, TwythonRateLimitError),
                      giveup=_is_permanent_error)
def get_tweets(query, max_id = None, lang = 'en', count = 100, tweet_mode='compat'):
    """Fetch one page of recent tweets matching `query` as a DataFrame.

    The search is issued through the module-level client `t` and retried with
    exponential backoff on Twython errors, except for the permanent client
    errors identified by `_is_permanent_error`, which are re-raised at once.

    Parameters
    ----------
    query : search string, sent as `q`.
    max_id : passed through as the search's `max_id`; None leaves it unset.
    lang : language filter passed to the search.
    count : page size passed to the search.
    tweet_mode : passed through to the search unchanged.

    Returns
    -------
    pandas.DataFrame built from the `statuses` list of the search response.
    """
    result = t.search(q=query, lang = lang, count=count, 
                      result_type='recent', max_id=max_id, tweet_mode=tweet_mode)
    return pd.DataFrame(result['statuses'])

In [9]:
# Show backoff's retry messages in the cell output. The check makes the cell
# idempotent: without it every re-run attaches another StreamHandler and each
# retry message is printed once per attached handler.
backoff_logger = logging.getLogger('backoff')
if not any(type(handler) is logging.StreamHandler for handler in backoff_logger.handlers):
    backoff_logger.addHandler(logging.StreamHandler())

In [10]:
# Page backwards through the search results (newest to oldest), writing each
# page to its own CSV, until `max_tweets` tweets are on disk or the search
# returns nothing more.
# The total is clamped at zero so a quota that is already met does not give
# the progress bar a negative length.
with tqdm(total = max(max_tweets - crawled_tweets, 0)) as pbar:
    # Checking the quota up front avoids fetching a batch that is not needed.
    while crawled_tweets < max_tweets:
        tweets = get_tweets('china', max_id=max_id, tweet_mode='extended')

        # An empty page has none of the expected columns; it means there is
        # nothing older to fetch, so stop instead of failing on the selection.
        if tweets.empty:
            break

        tweets = tweets[['id', 'created_at', 'full_text']]
        # Each batch file is named after the largest tweet id it contains.
        tweets.to_csv('tweets/%s.csv' % tweets['id'].max(), index=False)

        crawled_tweets += len(tweets)
        pbar.update(len(tweets))

        # The search's `max_id` bound is inclusive, so continue from one below
        # the oldest tweet just saved; reusing the minimum itself would fetch
        # that tweet again at the top of every following page.
        max_id = int(tweets['id'].min()) - 1

 32%|███▏      | 45100/141411 [04:10<08:48, 182.16it/s]Backing off get_tweets(...) for 0.9s (twython.exceptions.TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded)
Backing off get_tweets(...) for 2.0s (twython.exceptions.TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded)
Backing off get_tweets(...) for 1.0s (twython.exceptions.TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded)
Backing off get_tweets(...) for 7.3s (twython.exceptions.TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded)
Backing off get_tweets(...) for 5.7s (twython.exceptions.TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded)
Backing off get_tweets(...) for 26.1s (twython.exceptions.TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded)
 32%|███▏      | 45100/141411 [04:30<08:48, 182.16

In [11]:
# Load every saved batch into its own DataFrame.
csv_files = list(map(pd.read_csv, glob("tweets/*.csv")))

In [12]:
# Total number of rows across all loaded batch frames.
sum(len(batch) for batch in csv_files)

1040264