## Import libraries and initial setup

In [2]:
import tweepy
import numpy as np
import pandas as pd
import time
import datetime
from tqdm import tqdm

from app_cred import CONSUMER_KEY, CONSUMER_SECRET #import user specific keys to access twitter
from app_cred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET #import user specific keys to access twitter 

# Build the OAuth handler from the application keys, then attach the
# user-level access token (all four values are imported from app_cred).
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Shared API client used by every scraping function in this notebook.
# The two wait_on_rate_limit* flags ask tweepy to handle rate limiting itself;
# timeout is passed through to tweepy (seconds, per its API documentation).
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    timeout=900,
)

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

## Split dataset into four parts

In [None]:
# Load the handle column from the spreadsheet and cut it into four
# roughly equal parts.
handles = pd.read_excel("Initial network.xlsx")
handles = handles["Twitter Handle (uden @)"]
handle1, handle2, handle3, handle4 = np.array_split(handles, 4)

# Write each part to handle1.csv ... handle4.csv; enumerate supplies the
# 1-based file number instead of a hand-maintained counter.
for i, handle in enumerate([handle1, handle2, handle3, handle4], start=1):
    handle.to_csv(f"handle{i}.csv", index=False)

## Define functions

In [3]:
# Earliest tweet date of interest; read by get_all_tweets.
start_date = datetime.datetime(year=2019, month=5, day=6)

def limit_handled(cursor, max_retries=3):
    """Generator to throttle scraping of a Twitter-user timeline.

    Yields the next item from ``cursor`` and absorbs tweepy errors so that a
    single failed request does not abort the whole scrape.

    Parameters
    ----------
    cursor : iterator
        Iterator producing timeline items (e.g. ``tweepy.Cursor(...).items()``).
    max_retries : int or None, default 3
        Number of *consecutive* non-rate-limit errors that are retried before
        the generator gives up and stops. A successful item resets the count.
        Pass ``None`` to retry without limit.

    Yields
    ------
    Whatever ``cursor`` yields, in the same order.

    Notes
    -----
    Without a bound, an error that never clears (for instance a request that
    is rejected on every attempt) would be retried every 5 seconds forever and
    the calling loop would never move on to the next account.
    """
    consecutive_failures = 0
    while True:
        try:
            item = next(cursor)
        except StopIteration:
            # Timeline exhausted.
            return
        except tweepy.RateLimitError as r:  # If rate limit is reached sleep for 15 minutes
            print(r.reason)
            time.sleep(15 * 60)
        except tweepy.TweepError as e:  # Other error: back off for 5 seconds, then retry
            consecutive_failures += 1
            print(e.reason)
            if max_retries is not None and consecutive_failures > max_retries:
                # The error is not going away; stop and keep what was yielded so far.
                return
            time.sleep(5)
        else:
            consecutive_failures = 0
            yield item

def get_all_tweets(handle):
    """Return a user's timeline tweets created on or after ``start_date``.

    Parameters
    ----------
    handle : str
        Screen name of the account (without the leading ``@``).

    Returns
    -------
    pandas.DataFrame
        One row per tweet, built from the raw JSON of each status. Empty when
        no tweet could be retrieved.

    Notes
    -----
    The user-timeline endpoint has no date parameter, so the cut-off is
    applied here by comparing each status' ``created_at`` with the
    module-level ``start_date``.
    NOTE(review): assumes ``created_at`` is a naive datetime comparable with
    ``start_date`` (tweepy 3.x behaviour) and that the timeline is returned
    newest first -- confirm.
    """
    timeline = tweepy.Cursor(api.user_timeline, screen_name=handle,
                             tweet_mode="extended")
    tweet_list = []
    for status in limit_handled(timeline.items()):
        # Everything after the first too-old tweet is older still, so stop
        # paging instead of downloading the rest of the timeline.
        if status.created_at < start_date:
            break
        tweet_list.append(status._json)
    return pd.DataFrame(tweet_list)

## Scrape timelines

In [None]:
def get_tweets_from_handles(handlefile):
    """Scrape the timelines of every handle listed in a CSV file.

    Parameters
    ----------
    handlefile : str or path-like
        CSV file containing a ``'Twitter Handle (uden @)'`` column.

    Returns
    -------
    pandas.DataFrame
        The tweets of all handles stacked into one frame with a fresh integer
        index; an empty frame when the file lists no handles.
    """
    handles = pd.read_csv(handlefile)
    handles = handles.loc[:, 'Twitter Handle (uden @)'].to_list()
    # Collect one frame per handle and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all previous rows on every pass.
    frames = [get_all_tweets(handle) for handle in tqdm(handles)]
    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
handle9 = pd.read_csv('handle9.csv')
# Keep one frame per account and concatenate once after the loop instead of
# re-copying the growing DataFrame on every iteration. If the loop is
# interrupted, the frames scraped so far are still available in `frames`.
frames = []
for handle in tqdm(handle9.loc[:, 'Twitter Handle (uden @)'].to_list()):
    frames.append(get_all_tweets(handle))
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

 18%|█▊        | 3/17 [04:55<23:09, 99.23s/it] 

## Export DataFrame to .parquet

In [None]:
# Write the scraped tweets to a gzip-compressed Parquet file.
df.to_parquet("df.parquet.gzip", compression="gzip")