# RetrieveTweets.ipynb

### This notebook can be used to retrieve Tweets from the analyzed accounts using the Twitter API.

Author: Erik Puijk <br>
Date  : February 5, 2022

In [34]:
""" Install required packages. """
!pip install twitter
!pip install requests
!pip install matplotlib



In [35]:
import twitter
import json
import requests
import os
import csv
import time
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import datetime
import random
import re

In [36]:
def read_token():
    """ Read the Twitter API Token from a preset text file.

    Reads ``source/token.txt`` and stores its content, with leading and
    trailing whitespace removed, in the ``TOKEN`` environment variable.

    If the file cannot be read, "I/O error" is printed and ``TOKEN`` is set
    to an empty string.
    """

    token = ""

    try:
        with open("source/token.txt", 'r') as f:
            # Text files usually end with a newline; without stripping it the
            # newline would end up inside the Authorization header value.
            token = f.read().strip()
    except IOError:
        print("I/O error")

    os.environ['TOKEN'] = token

In [37]:
def merge_media_fields(tweets, media):
    """ Merge media object information into corresponding Tweet JSON-object.

    Every Tweet dict is modified in place: the keys ``media_type`` and
    ``media_public_metrics`` are added and the ``attachments`` key, if
    present, is removed.

    Parameters:
        tweets: list of Tweet dicts; a Tweet may carry an ``attachments``
            dict with a ``media_keys`` list.
        media: list of media dicts, each with ``media_key`` and ``type`` and
            optionally ``public_metrics``. ``None`` is treated as an empty list.

    Returns:
        The same ``tweets`` list that was passed in.
    """

    media = media or []

    for tweet in tweets:
        media_type = "none"
        media_public_metrics = "none"

        # Some Tweets contain attachments
        if 'attachments' in tweet:
            # Attachments do not necessarily reference media (the 'media_keys'
            # entry may be absent), so fall back to an empty key set.
            media_keys = set(tweet['attachments'].get('media_keys', []))

            # Keep the order of `media` so the first match is the same one a
            # plain scan over `media` would find.
            matched = [medium for medium in media if medium['media_key'] in media_keys]

            if matched:
                # Media types are the same for each Tweet
                media_type = matched[0]['type']

            # Not every media object carries public_metrics
            public_metrics = [medium['public_metrics'] for medium in matched if 'public_metrics' in medium]
            if public_metrics:
                media_public_metrics = public_metrics

        tweet.update({"media_type": media_type})
        tweet.update({"media_public_metrics": media_public_metrics})
        tweet.pop('attachments', None)

    return tweets

In [38]:
def request_tweets(query, start_time, end_time, max_results, sleep_time):
    """ Create requests for Twitter API to obtain Tweets matching the query and start/end times.

    Pages through the full-archive search endpoint until a response carries
    no ``data`` or no ``next_token``.

    Parameters:
        query: search query string sent as the ``query`` parameter.
        start_time, end_time: sent as the ``start_time`` / ``end_time`` parameters.
        max_results: sent as the ``max_results`` parameter (page size).
        sleep_time: seconds to wait before each follow-up (paginated) request.

    Returns:
        List of Tweet dicts with the media fields merged in by
        ``merge_media_fields``.

    Raises:
        Exception: with ``(status_code, response_text)`` when a response does
            not have status code 200.
    """

    bearer_token = os.getenv('TOKEN')
    headers = {"Authorization": "Bearer {}".format(bearer_token)}

    search_url = "https://api.twitter.com/2/tweets/search/all"

    # Parameters shared by every page of the same search
    base_params = {'query': query,
                   'start_time': start_time,
                   'end_time': end_time,
                   'max_results': max_results,
                   'expansions': 'attachments.media_keys',
                   'tweet.fields': 'public_metrics,created_at,author_id,conversation_id',
                   'media.fields': 'public_metrics'}

    # Id of a Tweet that is not in Dutch and must be left out of the results
    excluded_id = "1370841025829298176"

    next_token = None
    tweets = []

    # Continue until no more data is received
    while True:
        query_params = dict(base_params)

        if next_token:
            query_params['pagination_token'] = next_token

            # Wait before the next request
            time.sleep(sleep_time)

        response = requests.request("GET", search_url, headers = headers, params = query_params)

        # Stop if error occurs
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)

        # Stop if no data found
        res = response.json()
        if 'data' not in res:
            print("Data not found:")
            print(res)
            break

        # A page without any media has no 'includes' / 'media' entry
        media = res.get('includes', {}).get('media', [])
        res_tweets = merge_media_fields(res['data'], media)

        # Append new Tweets to Tweets list
        tweets.extend(tweet for tweet in res_tweets if tweet['id'] != excluded_id)

        # Stop if no next token
        next_token = res.get('meta', {}).get('next_token')
        if not next_token:
            break

    return tweets

In [39]:
def read_users():
    """ Load the user information stored as JSON in ``source/users.txt``.

    Returns the decoded JSON content. When the file cannot be opened or read,
    "I/O error" is printed and an empty string is returned instead.
    """

    try:
        with open("source/users.txt", 'r') as users_file:
            raw_text = users_file.read()
            return json.loads(raw_text)
    except IOError:
        print("I/O error")

    return ""

In [40]:
# Store the API token in the TOKEN environment variable and load the account
# information; `users` is read as a global by the functions defined below.
read_token()
users = read_users()

# Time window of the search; passed to the API as start_time / end_time.
start_time = "2021-02-04T00:00:00.000Z"
end_time = "2021-03-18T00:00:00.000Z"

In [41]:
def remove_collections(tweets):
    """ Remove all collection Tweets (messages that do not fit in one Tweet and are split into a collection).

    A Tweet counts as part of a collection when its text, after removing
    hashtags, links and surrounding whitespace, ends in a counter such as
    ``1/3``, ``2/``, ``/3``, ``[2/5]`` or ``(3/3)``.

    Parameters:
        tweets: list of Tweet dicts, each with a ``text`` key.

    Returns:
        A new list with the Tweets that are not part of a collection; the
        Tweet dicts themselves are not modified.
    """

    # Raw strings keep `\]` and `\)` as regex escapes instead of (invalid)
    # string escapes; the patterns are compiled once, outside the loop.
    hashtag_re = re.compile(r'#[A-Za-z0-9_]+')
    link_re = re.compile(r'http[A-Za-z0-9_:/.]+')
    collection_re = re.compile(
        r'[0-9]*/[0-9]+\]$|[0-9]+/[0-9]*\]$|[0-9]*/[0-9]+\)$|[0-9]+/[0-9]*\)$|[0-9]+/$|/[0-9]+$|[0-9]+/[0-9]+$')

    tweets2 = []

    # To find a collection, we need to temporarily remove some parts of a Tweet
    for tweet in tweets:
        # Remove hashtags
        text = hashtag_re.sub('', tweet['text'])

        # Remove links
        text = link_re.sub('', text)

        # Remove leading/trailing whitespace so the counter is at the very end
        text = text.strip()

        # Keep the Tweet only if no collection counter is found
        if not collection_re.search(text):
            tweets2.append(tweet)

    return tweets2

In [42]:
def calc_total_engagement(tweet, author_handle):
    """ Return the Tweet's summed interaction counts divided by the author's follower count.

    The interactions are the retweet, reply, like and quote counts found in
    ``tweet['public_metrics']``; the follower count is looked up in the
    global ``users`` mapping under ``author_handle``.
    """

    metrics = tweet['public_metrics']
    counted = ('retweet_count', 'reply_count', 'like_count', 'quote_count')

    # Add up all interactions
    total = sum(metrics[name] for name in counted)

    # Dividing by the follower count makes accounts of different size comparable
    return total / users[author_handle]['followers']

In [43]:
def get_tweets(sleep_time=1.1, max_results=500, seed=None):
    """ Prepares parameters for Twitter API request for each user.

    For every handle in the global ``users`` mapping, the Tweets between the
    global ``start_time`` and ``end_time`` are requested, collection Tweets
    are removed, and each remaining Tweet is extended with ``author_handle``,
    ``total_engagement`` and the (empty) category fields ``cat_con`` and
    ``cat_act``. The number of Tweets per handle and in total is printed.

    Parameters:
        sleep_time: seconds to wait between requests.
        max_results: page size passed to ``request_tweets``.
        seed: optional seed for the final shuffle. With the default ``None``
            the global ``random`` state is used, so the order differs
            between runs; pass a value to get a reproducible order.

    Returns:
        List of all Tweet dicts of all users, in shuffled order.
    """

    tweets = []

    for handle in users.keys():
        query = "from:%s lang:nl -is:retweet -is:reply -is:quote" % (handle)
        res = request_tweets(query, start_time, end_time, max_results, sleep_time)
        res_clean = remove_collections(res)

        # Add additional data to Tweets
        for tweet in res_clean:
            tweet.update({"author_handle": handle})
            tweet.update({"total_engagement": calc_total_engagement(tweet, handle)})

            # Category data is empty for now
            tweet.update({"cat_con": ""})
            tweet.update({"cat_act": ""})

        print("%s: %s Tweets" % (handle, len(res_clean)))

        tweets.extend(res_clean)

        # Wait before the next request
        time.sleep(sleep_time)

    print("Total Tweets: %s" % (len(tweets)))

    # Mix the Tweets of all accounts
    if seed is None:
        random.shuffle(tweets)
    else:
        # A private generator keeps the global random state untouched
        random.Random(seed).shuffle(tweets)

    return tweets

In [44]:
# Retrieve the Tweets of every account in `users`; the number of Tweets per
# account and the total are printed while this runs.
tweets = get_tweets()

VVD: 226 Tweets
markrutte: 34 Tweets
D66: 334 Tweets
SigridKaag: 59 Tweets
geertwilderspvv: 187 Tweets
cdavandaag: 148 Tweets
WBHoekstra: 43 Tweets
SPnl: 218 Tweets
MarijnissenL: 120 Tweets
PvdA: 288 Tweets
PloumenLilianne: 89 Tweets
groenlinks: 297 Tweets
jesseklaver: 98 Tweets
fvdemocratie: 241 Tweets
thierrybaudet: 349 Tweets
PartijvdDieren: 337 Tweets
estherouwehand: 48 Tweets
christenunie: 39 Tweets
gertjansegers: 66 Tweets
VoltNederland: 298 Tweets
DassenLaurens: 27 Tweets
JuisteAntwoord: 108 Tweets
Eerdmans: 7 Tweets
SGPnieuws: 124 Tweets
keesvdstaaij: 27 Tweets
DenkNL: 86 Tweets
F_azarkan: 73 Tweets
50pluspartij: 87 Tweets
LianedenHaan: 24 Tweets
BoerBurgerB: 80 Tweets
lientje1967: 219 Tweets
PolitiekBIJ1: 203 Tweets
SylvanaBIJ1: 80 Tweets
Total Tweets: 4664


In [45]:
def write_tweets(path):
    """ Dump the global ``tweets`` list as JSON into the text file at ``path``.

    The file is opened for writing, so existing content is replaced. If the
    file cannot be opened or written, "I/O error" is printed. Nothing is
    returned.
    """

    try:
        with open(path, mode='w') as out_file:
            json.dump(tweets, out_file)
    except IOError:
        print("I/O error")

In [46]:
# Save the retrieved Tweets (the global `tweets` list) as JSON.
write_tweets("source/tweets_all.txt")