In [None]:
!pip install tweepy
!pip install --upgrade google-cloud-storage
!pip install --upgrade google-cloud-pubsub

# Tweet extractor

In [None]:
import json
import os
import time
from random import randint

import tweepy

from google.cloud import pubsub_v1
from google.cloud import storage

### Utility functions

In [None]:
def write_to_bucket(json_object, filename):
    """Upload ``json_object`` as a JSON file to the global ``bucket``.

    Args:
        json_object: Any value accepted by ``json.dumps``.
        filename: Object name without extension; ``'.json'`` is appended.

    Returns:
        A dict with a single ``'response'`` key holding a confirmation text.
    """
    # The destination object is named after the file, with a .json suffix
    target = bucket.blob(filename + '.json')
    target.upload_from_string(
        data=json.dumps(json_object),
        content_type='application/json',
    )
    status = filename + ' upload complete'
    return {'response': status}

In [None]:
def publish_message(message):
    """Publish ``message`` to the Pub/Sub topic at the global ``topic_path``.

    The text is sent UTF-8 encoded through the global ``publisher_client``,
    and the call blocks until the publish future resolves.

    Args:
        message: Text to publish.

    Returns:
        A dict with a single ``'response'`` key: the future's result followed
        by ``' message published'``.
    """
    pending = publisher_client.publish(topic_path, message.encode('utf-8'))
    # Blocks here until the publish future resolves
    outcome = pending.result()
    return {'response': outcome + ' message published'}

### Connecting to Twitter API

In [None]:
# Twitter credentials
# Read from environment variables so that secrets never have to be typed into
# (and saved with) the notebook. Each value falls back to an empty string when
# its variable is not set.
API_KEY = os.environ.get('TWITTER_API_KEY', '')
API_SECRET_KEY = os.environ.get('TWITTER_API_SECRET_KEY', '')

ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '')
ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

In [None]:
# Creating authentication object: the handler is built from the API key pair,
# then the access token pair is attached to it.
auth = tweepy.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

In [None]:
# Connecting to Twitter API: `api` is the client the extraction loop uses to search for tweets
api = tweepy.API(auth)

### Connecting to Google Cloud Storage

In [None]:
# Creating the Storage client from a service-account key file.
# NOTE: 'unbosque.json' is a relative path; the same file is also used to
# create the Pub/Sub publisher client.
storage_client = storage.Client.from_service_account_json('unbosque.json')

In [None]:
# Bucket used to store the tweets
BUCKET_NAME = 'tweets-lake'
# `bucket` is read as a global by write_to_bucket
bucket = storage_client.get_bucket(BUCKET_NAME)

### Connecting to Google Cloud Pub/Sub

In [None]:
# Project and topic names from which the Pub/Sub topic path is built
PROJECT_ID = 'unbosque'
TOPIC = 'new-tweet-notify'

In [None]:
# Creating the Pub/Sub publisher client from the same service-account key file
# used for the Storage client; `publisher_client` is read as a global by publish_message
publisher_client = pubsub_v1.PublisherClient.from_service_account_json('unbosque.json')

In [None]:
# Instantiating the topic path from PROJECT_ID and TOPIC; `topic_path` is the
# destination publish_message sends every message to
topic_path = publisher_client.topic_path(PROJECT_ID, TOPIC)

### Extracting and loading tweets

In [None]:
# Extraction loop: pages backwards through the search results, storing every
# original (non-retweet) tweet in the bucket and publishing it to Pub/Sub.
# It stops when the search returns no more tweets.

# Seconds to wait after Twitter answers with a rate-limit error
RATE_LIMIT_PAUSE_SECONDS = 600
# Bounds (in seconds) of the random pause between tweets that simulates streaming
MIN_STREAM_DELAY_SECONDS = 1
MAX_STREAM_DELAY_SECONDS = 10

# Paging cursor: inclusive upper bound on the tweet IDs requested from the
# search endpoint. None requests the most recent tweets.
next_max_id = None
while True:
    try:
        # Getting tweets: https://docs.tweepy.org/en/latest/api.html#tweepy.API.search_tweets
        tweets = api.search_tweets(q = 'elecciones colombia', tweet_mode = 'extended', lang = 'es', max_id = next_max_id)
    except tweepy.TooManyRequests:
        print('Too many requests. Sleeping...')
        time.sleep(RATE_LIMIT_PAUSE_SECONDS)
        print('Getting back to action!')
        continue

    # An empty page means the search results are exhausted; without this
    # guard the loop would keep requesting the same empty page forever.
    if not tweets:
        print('No more tweets available. Stopping.')
        break

    # Iterating over a page of tweets
    for tweet in tweets:
        tweet_dict = tweet._json

        # Avoiding RTs: retweets carry a 'retweeted_status' entry; the
        # 'RT @' prefix is kept as a fallback. A bare 'RT' prefix is not
        # used because it also matches original tweets such as 'RTVC ...'.
        is_retweet = 'retweeted_status' in tweet_dict or tweet_dict['full_text'].startswith('RT @')
        if is_retweet:
            continue

        # Storing the tweet on the bucket
        print(write_to_bucket(tweet_dict, tweet_dict['id_str']))

        # Publishing the tweet on the Pub/Sub topic
        print(publish_message(json.dumps(tweet_dict)))

        # Simulating streaming
        time.sleep(randint(MIN_STREAM_DELAY_SECONDS, MAX_STREAM_DELAY_SECONDS))

    # max_id is inclusive, so step just below the oldest tweet of this page;
    # otherwise that tweet would be fetched (and stored/published) again.
    next_max_id = min(tweet.id for tweet in tweets) - 1