In [1]:
import twitter
import json
import requests
import os
import csv
import time
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import datetime
import random
import re

In [17]:
def read_token():
    """ Read the Twitter API Token from a preset text file.

    The contents of ``source/token.txt`` are stored in the ``TOKEN``
    environment variable. Surrounding whitespace (such as the trailing
    newline most editors append) is stripped, since it would otherwise end
    up inside the ``Authorization`` header built from this value.

    If the file cannot be read, "I/O error" is printed and ``TOKEN`` is set
    to the empty string.
    """

    token = ""

    try:
        with open("source/token.txt", 'r') as f:
            # Strip so a trailing newline does not corrupt the bearer token
            token = f.read().strip()
    except IOError:
        print("I/O error")

    os.environ['TOKEN'] = token

In [3]:
def request_tweets(query, start_time, end_time, max_results, sleep_time):
    """Request all Tweets matching a query between a start and end time.

    Result pages are fetched from the Twitter API v2 full-archive search
    endpoint until the response carries no ``next_token``. Before every
    follow-up request a short sleep is inserted to avoid breaking the query
    limits of the Twitter API.

    Parameters:
        query: search query string sent as the ``query`` parameter.
        start_time: value sent as the ``start_time`` parameter.
        end_time: value sent as the ``end_time`` parameter.
        max_results: page size sent as the ``max_results`` parameter.
        sleep_time: seconds to sleep before each follow-up page request.

    Returns:
        A list with the Tweet objects (dicts) of all fetched pages. If a
        response contains no ``data`` key, the response is printed and the
        Tweets gathered so far are returned.

    Raises:
        Exception: with the status code and response text as arguments when
            a response status is not 200.
    """
    bearer_token = os.getenv('TOKEN')
    headers = {"Authorization": "Bearer {}".format(bearer_token)}

    search_url = "https://api.twitter.com/2/tweets/search/all"

    # Parameters shared by every page request
    base_params = {'query': query,
                   'start_time': start_time,
                   'end_time': end_time,
                   'max_results': max_results,
                   'tweet.fields': 'public_metrics,created_at,author_id,conversation_id'}

    tweets = []
    next_token = None

    while True:
        query_params = dict(base_params)

        if next_token is not None:
            # Follow-up page: pass the pagination token and wait before the next request
            query_params['pagination_token'] = next_token
            time.sleep(sleep_time)

        response = requests.request("GET", search_url, headers=headers, params=query_params)

        # Stop if error occurs
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)

        # Parse the body once and reuse it for both the data and the meta lookup
        res = response.json()

        # Stop if no data found
        if 'data' not in res:
            print("Data not found:")
            print(res)
            break

        # Append new Tweets to Tweets list
        tweets.extend(res['data'])

        # Stop if no next token (a missing 'meta' object is treated the same way)
        meta = res.get('meta', {})
        if 'next_token' not in meta:
            break

        next_token = meta['next_token']

    return tweets

In [4]:
def read_users():
    """Read the file that contains user information.

    ``source/users.txt`` is parsed as JSON and the decoded value is
    returned.

    Returns:
        The decoded JSON content. If the file cannot be read, "I/O error"
        is printed and an empty dict is returned, so that callers iterating
        with ``.items()`` get no users instead of an AttributeError on an
        empty string.
    """

    content = {}

    try:
        with open("source/users.txt", 'r') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    return content

In [5]:
def remove_collections(tweets):
    """Remove collection Tweets from a given list of Tweets.

    Collection Tweets are messages that do not fit in one Tweet and are
    therefore numbered. A Tweet counts as part of a collection when its
    text, after removing hashtags, links and surrounding whitespace, ends
    in a counter such as ``1/``, ``1/5``, ``1/)``, ``1/5)``, ``1/]`` or
    ``1/5]``.

    Parameters:
        tweets: iterable of Tweet dicts, each with a ``'text'`` entry.

    Returns:
        A new list with the Tweets that are not part of a collection, in
        their original order. The Tweet dicts themselves are not modified.
    """

    kept = []

    for tweet in tweets:
        # Remove hashtags
        text = re.sub(r'#[A-Za-z0-9_]+', '', tweet['text'])

        # Remove links
        text = re.sub(r'http[A-Za-z0-9_:/.]+', '', text)

        # Remove leading and trailing whitespace
        text = text.strip()

        # Match patterns of collections: digits, a slash, optional digits and
        # an optional closing bracket or parenthesis at the very end
        if not re.search(r'[0-9]+/[0-9]*[\])]?$', text):
            kept.append(tweet)

    return kept

In [18]:
# Time window for which Tweets are requested (timestamps with a "Z" suffix)
start_time = "2021-02-04T00:00:00.000Z"
end_time = "2021-03-18T00:00:00.000Z"

# Put the API token into the environment, then load the user information
read_token()
users = read_users()

In [19]:
def get_tweets(users, start_time, end_time):
    """Collect the Tweets of all given users in one randomly ordered list.

    For every user handle a query is generated and passed to
    request_tweets; collection Tweets are filtered out with
    remove_collections and each remaining Tweet gets an ``author_handle``
    entry. The per-user and total counts are printed, and the combined list
    is shuffled before it is returned.
    """

    pause = 1.1
    page_size = 500
    collected = []

    # Retrieve Tweets for each user
    for handle, _info in users.items():
        search_query = "from:%s lang:nl -is:retweet -is:reply -is:quote" % (handle)
        fetched = request_tweets(search_query, start_time, end_time, page_size, pause)
        kept = remove_collections(fetched)

        # Add author_handle to Tweets
        for tweet in kept:
            tweet["author_handle"] = handle

        print("%s: %s Tweets" % (handle, len(kept)))

        collected += kept

        # Wait before the next request
        time.sleep(pause)

    print("Total Tweets: %s" % (len(collected)))

    # Shuffle order of Tweets
    random.shuffle(collected)

    return collected

In [20]:
# Collect the Tweets of all loaded users within the configured time window.
# This performs network requests with a sleep between them.
tweets = get_tweets(users, start_time, end_time)

VVD: 226 Tweets
markrutte: 34 Tweets
D66: 334 Tweets


KeyboardInterrupt: 

In [9]:
def write_tweets(tweets):
    """Write the Tweets to a local file.

    The Tweets are serialized as JSON into ``source/all_tweets.txt``. If
    the file cannot be written, "I/O error" is printed instead.
    """

    target_path = "source/all_tweets.txt"

    try:
        with open(target_path, 'w') as out_file:
            json.dump(tweets, out_file)
    except IOError:
        print("I/O error")

In [10]:
# Save the collected Tweets as JSON to source/all_tweets.txt.
write_tweets(tweets)