## Import libraries and initial setup

In [2]:
import tweepy
import numpy as np
import pandas as pd
import time
import datetime
from tqdm import tqdm

from app_cred import CONSUMER_KEY, CONSUMER_SECRET #import user specific keys to access twitter
from app_cred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET #import user specific keys to access twitter 

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit = True, 
                 wait_on_rate_limit_notify = True, 
                 timeout=900)

pd.set_option("display.max_columns", None)

## Split dataset into four parts

In [3]:
handles=pd.read_excel("Initial network.xlsx")
handles=handles["Twitter Handle (uden @)"]
handle1, handle2, handle3, handle4 = np.array_split(handles, 4)

i = 1
for handle in [handle1, handle2, handle3, handle4]:
    handle.to_csv(f"handle{i}.csv", index=False)
    i += 1

## Define functions

In [4]:
start_date = datetime.datetime(2019, 5, 6)

def limit_handled(cursor):
    """Generator to throttle scraping of Twitter-user timeline.
    Yields next tweet in timeline"""
    while True:
        try:
            yield next(cursor)
        except tweepy.RateLimitError as r: # If rate limit is reached sleep for 15 minutes
            print(r.reason) 
            time.sleep(15 * 60)
        except tweepy.TweepError as e: # If other error back off for 5 seconds, then continue
            print(e.reason)
            time.sleep(5)
        except StopIteration:
            break

def get_all_tweets(handle):
    """Function is supposed to return all possible tweets from a user in a df
    Handle is the handle of the account and n is the number of wished tweets"""
    global start_date
    timeline = tweepy.Cursor(api.user_timeline, screen_name=handle,
                             tweet_mode="extended",
                             since=start_date)
    tweet_list = [status._json for status in limit_handled(timeline.items())]
    return pd.DataFrame(tweet_list)

## Scrape timelines

In [5]:
def get_tweets_from_handles(handlefile):
    '''Give this function the csv file with the handle 
    it returns a df including the tweets from all the handles'''
    handles = pd.read_csv(handlefile)
    handles = handles.loc[:, 'Twitter Handle (uden @)'].to_list()
    df = pd.DataFrame()
    for handle in tqdm(handles):
        temp = get_all_tweets(handle)
        df = pd.concat([df, temp], ignore_index = True)
    return df

In [None]:
handle9 = pd.read_csv('handle9.csv')
df = pd.DataFrame()
for handle in tqdm(handle9.loc[:, 'Twitter Handle (uden @)'].to_list()):
    temp = get_all_tweets(handle)
    df = pd.concat([df, temp], ignore_index=True)

 18%|█▊        | 3/17 [04:55<23:09, 99.23s/it] 

In [9]:
handle5=get_tweets_from_handles('handle5.csv')

 35%|█████████████████████████████▎                                                     | 6/17 [04:53<09:14, 50.44s/it]Rate limit reached. Sleeping for: 260
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [17:42<00:00, 62.49s/it]


## Export DataFrame to .parquet

In [10]:
handle5

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,possibly_sensitive,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status
0,Wed May 19 11:29:26 +0000 2021,1394978500902367234,1394978500902367234,RT @lpsoebye: Det er via et stærkt #offentligp...,False,"[0, 140]","{'hashtags': [{'text': 'offentligprivat', 'ind...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 95405710, 'id_str': '95405710', 'name':...",,,,,{'created_at': 'Wed May 19 08:11:39 +0000 2021...,False,1,0,False,False,da,,,,,,
1,Tue May 18 15:27:40 +0000 2021,1394676064794906626,1394676064794906626,RT @Region_Midt: Oprensning af en #generations...,False,"[0, 140]",{'hashtags': [{'text': 'generationsforurening'...,"<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 95405710, 'id_str': '95405710', 'name':...",,,,,{'created_at': 'Mon May 17 05:11:17 +0000 2021...,False,3,0,False,False,da,,,,,,
2,Thu Apr 29 12:30:56 +0000 2021,1387746220974776321,1387746220974776321,RT @lpsoebye: Skal vi lykkes med #grønomstilli...,False,"[0, 140]","{'hashtags': [{'text': 'grønomstilling', 'indi...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 95405710, 'id_str': '95405710', 'name':...",,,,,{'created_at': 'Thu Apr 29 07:46:21 +0000 2021...,False,2,0,False,False,da,,,,,,
3,Tue Apr 27 11:59:53 +0000 2021,1387013631146143747,1387013631146143747,"RT @lpsoebye: Til årsdag i DI Energi, hvor Knu...",False,"[0, 140]","{'hashtags': [{'text': 'sektorkobling', 'indic...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 95405710, 'id_str': '95405710', 'name':...",,,,,{'created_at': 'Tue Apr 27 11:47:01 +0000 2021...,False,4,0,False,False,da,,,,,,
4,Thu Apr 22 12:06:25 +0000 2021,1385203335263817736,1385203335263817736,"Der skal fart på udviklingen af #PtX, hvis vi ...",False,"[0, 277]","{'hashtags': [{'text': 'PtX', 'indices': [32, ...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 95405710, 'id_str': '95405710', 'name':...",,,,,,False,0,3,False,False,da,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24967,Mon Oct 26 20:37:12 +0000 2009,5182153406,5182153406,chili con carne og easy rider i ghettoen... najs!,False,"[0, 49]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,,,,,"{'id': 29490660, 'id_str': '29490660', 'name':...",,,,,,False,0,0,False,False,da,,,,,,
24968,Sun Sep 13 19:11:53 +0000 2009,3959950165,3959950165,can't get eurosport player to work with mac. f...,False,"[0, 104]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,,,,,"{'id': 29490660, 'id_str': '29490660', 'name':...",,,,,,False,0,0,False,False,en,,,,,,
24969,Fri Jun 26 00:03:49 +0000 2009,2335557796,2335557796,R.I.P Michael,False,"[0, 13]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,,,,,"{'id': 29490660, 'id_str': '29490660', 'name':...",,,,,,False,0,0,False,False,en,,,,,,
24970,Sat May 02 14:00:44 +0000 2009,1678535077,1678535077,2-årig pige i MENSA. IQ på 152 - forældrene ha...,False,"[0, 71]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,,,,,"{'id': 29490660, 'id_str': '29490660', 'name':...",,,,,,False,0,0,False,False,da,,,,,,


In [16]:
handle5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24972 entries, 0 to 24971
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   created_at                 24972 non-null  object 
 1   id                         24972 non-null  int64  
 2   id_str                     24972 non-null  object 
 3   full_text                  24972 non-null  object 
 4   truncated                  24972 non-null  bool   
 5   display_text_range         24972 non-null  object 
 6   entities                   24972 non-null  object 
 7   source                     24972 non-null  object 
 8   in_reply_to_status_id      7914 non-null   object 
 9   in_reply_to_status_id_str  7914 non-null   object 
 10  in_reply_to_user_id        8113 non-null   object 
 11  in_reply_to_user_id_str    8113 non-null   object 
 12  in_reply_to_screen_name    8113 non-null   object 
 13  user                       24972 non-null  obj

In [18]:
handle5.to_parquet('handle5.parquet.gzip', engine='fastparquet', compression='gzip')

ImportError: Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
pip install fastparquet