In [1]:
# import libraries
import tweepy
import json
import requests
from kafka import KafkaProducer
from time import sleep

# Stream filter settings: the term to track and the tweet language to keep.
KEYWORD = '#COVID19'
LANGUAGE = 'en'

# Kafka topic that the tweet texts are published to.
KAFKA_TOPIC = 'texts_covid19'

# Twitter authentication credentials (redacted placeholders; supply real values).
ACCESS_TOKEN = "**********"
# The original line ended with a stray extra quote, which is a SyntaxError.
ACCESS_TOKEN_SECRET = "**********"
CONSUMER_KEY = "**********"
CONSUMER_SECRET = "**********"

# Seconds to pause after each produced tweet, to control the production rate.
SLEEP_TIME = 1

# Build the OAuth handler from the consumer credentials, then attach the
# access token pair to it.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Kafka producer for the local broker; every message value is passed through
# the serializer below, which encodes it to UTF-8 bytes.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda text: text.encode('utf-8'),
)

# create a custom StreamListener class that writes tweets to Kafka
class KafkaStreamListener(tweepy.StreamListener):
    """Stream listener that publishes the text of each incoming tweet to Kafka.

    For every raw message, the text is sent to ``KAFKA_TOPIC`` through the
    module-level ``producer``, followed by a pause of ``SLEEP_TIME`` seconds.
    """

    def on_status(self, status):
        """Print the received status object."""
        # NOTE(review): tweepy's default on_data is presumably what dispatches
        # to on_status; since on_data is overridden below, this hook may never
        # be invoked - confirm against the installed tweepy version.
        print(status)

    @staticmethod
    def _select_text_source(json_data):
        """Return ``(container, key)`` locating the text of a decoded message.

        If the message has a ``retweeted_status``, the original tweet is used
        instead of the retweet. If the chosen tweet has an ``extended_tweet``
        (used for tweets longer than 140 characters), the text is expected
        under ``extended_tweet['full_text']``; otherwise under ``'text'``.
        """
        if 'retweeted_status' in json_data:
            tweet = json_data['retweeted_status']
        else:
            tweet = json_data

        if 'extended_tweet' in tweet:
            return tweet['extended_tweet'], 'full_text'
        return tweet, 'text'

    def on_data(self, data):
        """Decode one raw stream message and send its text to Kafka.

        Args:
            data: JSON string of a single message from the stream.

        A message whose expected text key is missing is reported with the
        keys that are present and is skipped without sleeping.
        """
        # turn string into JSON
        json_data = json.loads(data)

        container, key = self._select_text_source(json_data)

        # Only the lookup can legitimately raise KeyError, so keep the try
        # body limited to it.
        try:
            text = container[key]
        except KeyError:
            print('Missing key. Existing keys are: {}'.format(container.keys()))
            return

        producer.send(KAFKA_TOPIC, value=text)
        # Throttle the number of produced tweets.
        sleep(SLEEP_TIME)

# create a start stream function that automatically restarts the stream on failure
def start_stream(stream, **kwargs):
    """Run ``stream.filter(**kwargs)`` and restart it whenever it raises.

    Adapted from: https://github.com/tweepy/tweepy/issues/1053

    Args:
        stream: Object exposing ``filter(**kwargs)`` and ``disconnect()``.
        **kwargs: Passed unchanged to ``stream.filter`` on every attempt.

    Returns:
        None, once ``stream.filter`` returns without raising.

    Exceptions not derived from ``Exception`` (e.g. ``KeyboardInterrupt``)
    and errors raised by ``stream.disconnect()`` propagate to the caller.
    """
    # A loop is used instead of recursion so that a long series of failures
    # cannot exhaust the interpreter's recursion limit.
    while True:
        try:
            stream.filter(**kwargs)
        except Exception as error:
            # Report the cause instead of swallowing it silently.
            print('Stream failed ({!r}); restarting.'.format(error))
            stream.disconnect()
        else:
            return

# create a StreamListener object
# (bound to its own name so the KafkaStreamListener class is not replaced by
# its instance and stays usable afterwards)
kafka_stream_listener = KafkaStreamListener()
stream = tweepy.Stream(auth=auth, listener=kafka_stream_listener)

# start stream, tracking the configured keyword in the configured language
start_stream(stream, track=[KEYWORD], languages=[LANGUAGE])

KeyboardInterrupt: 