## Import libraries and initial setup

In [None]:
import tweepy
import numpy as np
import pandas as pd
import time
import datetime
from tqdm import tqdm

from app_cred import CONSUMER_KEY, CONSUMER_SECRET #import user specific keys to access twitter
from app_cred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET #import user specific keys to access twitter 

# Build the tweepy auth handler from the app credentials imported from
# app_cred, then attach the user's access token and secret.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# Shared API client used by every scraping function below.
# NOTE(review): the wait_on_rate_limit* flags presumably make tweepy sleep
# (and print a notice) when a rate limit is hit; timeout is presumably in
# seconds -- confirm against the installed tweepy version.
api = tweepy.API(auth, wait_on_rate_limit = True, 
                 wait_on_rate_limit_notify = True, 
                 timeout=900)

# Do not truncate the column list when displaying DataFrames.
pd.set_option("display.max_columns", None)

## Split dataset into 16 parts

In [None]:
# handles=pd.read_excel("Initial network.xlsx")
# handles=handles["Twitter Handle (uden @)"]
# handle1, handle2, handle3, handle4 = 
# i = 1
# for handle in np.array_split(handles, 16):
#     handle.to_csv(f"handle{i}.csv", index=False)
#     i += 1

## Define functions

In [None]:
# Date handed to the timeline cursors as their ``since`` argument.
start_date = datetime.datetime(year=2019, month=5, day=6)

# Fields read from each status's "user" object.
list_of_keys_user = [
    "id",
    "name",
    "screen_name",
    "location",
    "description",
    "followers_count",
    "friends_count",
    "statuses_count",
    "created_at",
]

# Fields read from the status (tweet) object itself.
list_of_keys_tweet = [
    "created_at",
    "id",
    "lang",
    "full_text",
    # Retweet information
    "retweeted",
    "retweeted_status",
    "retweet_count",
    # Quote information
    "is_quote_status",
    "quoted_status",
    "quote_count",
    "entities",
]

def limit_handled(cursor, max_retries=3):
    """Yield items from a tweepy cursor iterator, backing off on API errors.

    A rate-limit error pauses for 15 minutes and is retried without limit.
    Any other ``tweepy.TweepError`` pauses for 5 seconds and is retried, but
    only ``max_retries`` times in a row: an error that keeps recurring would
    otherwise keep this generator looping forever.

    Args:
        cursor: Iterator to drain, e.g. ``tweepy.Cursor(...).items()``.
        max_retries: Number of consecutive retries allowed after a
            non-rate-limit ``TweepError`` before iteration is abandoned.
            The counter resets after every successfully fetched item.

    Yields:
        The next item produced by ``cursor``.
    """
    consecutive_failures = 0
    while True:
        try:
            item = next(cursor)
        except StopIteration:
            # Cursor exhausted: end the generator normally.
            return
        except tweepy.RateLimitError as r:
            # Must be caught before TweepError, which the next clause handles.
            print(r.reason)
            time.sleep(15 * 60)
            continue
        except tweepy.TweepError as e:
            print(e.reason)
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                print(f"Giving up after {consecutive_failures} consecutive errors")
                return
            time.sleep(5)
            continue
        consecutive_failures = 0
        # Yield outside the try so exceptions raised by the consumer are not
        # mistaken for API errors.
        yield item

def _flatten_tweet(tweet):
    """Pick the configured user and tweet fields out of one raw status dict.

    Missing fields are stored as ``None``. User fields get a ``user_`` prefix
    and tweet fields a ``tweet_`` prefix.
    """
    user = tweet.get("user", {})
    row = {"user_" + key: user.get(key) for key in list_of_keys_user}
    row.update({"tweet_" + key: tweet.get(key) for key in list_of_keys_tweet})
    return row


def get_all_tweets(handle):
    """Download a user's timeline and return it as a DataFrame.

    Args:
        handle: Screen name of the account, passed to ``api.user_timeline``
            as ``screen_name``.

    Returns:
        A DataFrame with one row per status yielded by the timeline cursor.
        Columns are the entries of ``list_of_keys_user`` prefixed with
        ``user_`` followed by the entries of ``list_of_keys_tweet`` prefixed
        with ``tweet_``. Missing values are filled with NaN.
    """
    # NOTE(review): `since` does not appear to be a documented parameter of
    # the user_timeline endpoint -- confirm that it really restricts the
    # result to tweets after start_date.
    timeline = tweepy.Cursor(api.user_timeline, screen_name=handle,
                             tweet_mode="extended",
                             since=start_date)
    # Work on the raw JSON payload of each status rather than the model object.
    tweet_list = [status._json for status in limit_handled(timeline.items())]
    # Build each row exactly once per tweet.
    tweets = [_flatten_tweet(tweet) for tweet in tweet_list]
    df = pd.DataFrame(tweets)
    df = df.fillna(value=np.nan)
    return df

def get_tweets_from_handles(handlefile,print_handle=False):
    """Scrape the timelines of every handle listed in a CSV file.

    Args:
        handlefile: Path to a CSV file with a column named
            "Twitter Handle (uden @)" holding the handles.
        print_handle: When true, print each handle before it is scraped.

    Returns:
        One DataFrame containing the rows returned by ``get_all_tweets`` for
        every handle, re-indexed from 0. Empty if the file lists no handles.
    """
    handles = pd.read_csv(handlefile)["Twitter Handle (uden @)"].to_list()
    # Collect the per-handle frames and concatenate once at the end;
    # concatenating inside the loop re-copies the accumulated rows each time.
    frames = []
    for handle in tqdm(handles):
        if print_handle:
            print(handle)
        frames.append(get_all_tweets(handle))
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Ad-hoc cell: fetch the timeline of a single account ("COWIdk") with the same
# cursor arguments that get_all_tweets uses.
timeline = tweepy.Cursor(api.user_timeline, screen_name="COWIdk",
                             tweet_mode="extended",
                             since=start_date)
# Keep the raw JSON payload of every status yielded by the throttled cursor.
tweet_list = [status._json for status in limit_handled(timeline.items())]

## Scrape timelines

In [None]:
# Scrape the timelines of all handles listed in handle5.csv into one DataFrame.
handle5=get_tweets_from_handles("handle5.csv")