# This file streams tweets from Twitter and forwards their media URLs over a TCP socket

### import packages

In [0]:
import socket
import sys
import requests
import json
import time

### replace with your bearer_token below

In [0]:
# Bearer token used to build the "Authorization" header for every Twitter API request.
# Left empty here on purpose: fill in your own token before running the cells below.
bearer_token = ""

## Specify your parameters

In [0]:
# These three values are read as globals by send_tweets_to_spark.
max_iter = 20   # number of completed windows (batch + sleep) after which sending stops
max_per_iter = 128 # number of media urls sent over the socket before pausing
window_size = 8 # length of the pause passed to time.sleep (seconds) once max_per_iter urls were sent

In [0]:
def create_headers(bearer_token):
    '''
        build the request header that authorizes calls with the given bearer_token

        #param:
            bearer_token: (str) get your own at https://developer.twitter.com/en/portal/dashboard

        #return:
            dict with a single "Authorization" entry of the form "Bearer <bearer_token>"
    '''

    auth_value = "Bearer {}".format(bearer_token)
    return {"Authorization": auth_value}
        
def set_rules(headers, bearer_token, rules):
    '''
        register new filtered-stream rules on the account behind the headers

        #param:
            headers: request header for twitter api
            bearer_token: (str) get your own at https://developer.twitter.com/en/portal/dashboard
                          (not used here; the token is already carried by headers)
            rules: (json) with rule value and corresponding tag (optional)

        #return:
            None

        #raise:
            Exception when the api answers with anything other than HTTP 201
    '''

    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"
    response = requests.post(rules_url, headers=headers, json={"add": rules})

    # 201 (created) is the only status treated as success
    if response.status_code == 201:
        print('following rules have been set: ')
        print(json.dumps(response.json()))
        return

    raise Exception(
        "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
    )
    
def get_rules(headers, bearer_token):
    '''
        fetch the filtered-stream rules currently stored for the account behind the headers

        #param:
            headers: request header for twitter api
            bearer_token: (str) get your own at https://developer.twitter.com/en/portal/dashboard
                          (not used here; the token is already carried by headers)

        #return:
            decoded json body of the rules endpoint response

        #raise:
            Exception when the api answers with anything other than HTTP 200
    '''

    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"
    response = requests.get(rules_url, headers=headers)

    status = response.status_code
    if status != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(status, response.text)
        )

    current_rules = response.json()
    print('current stream rules: ')
    print(json.dumps(current_rules))

    return current_rules

def delete_all_rules(headers, bearer_token, rules):
    '''
        remove every stream rule listed in rules from the account behind the headers

        #param:
            headers: request header for twitter api
            bearer_token: (str) get your own at https://developer.twitter.com/en/portal/dashboard
                          (not used here; the token is already carried by headers)
            rules: (json) rules response whose "data" entries each carry an "id"

        #return:
            None

        #raise:
            Exception when the api answers with anything other than HTTP 200
    '''

    # nothing to delete when no rules response is given or it holds no "data"
    has_rules = rules is not None and "data" in rules
    if not has_rules:
        return None

    rule_ids = [entry["id"] for entry in rules["data"]]
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json={"delete": {"ids": rule_ids}},
    )

    status = response.status_code
    if status != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(status, response.text)
        )
    print(json.dumps(response.json()))

def get_stream(headers, bearer_token, expansions, fields):
    '''
        open the twitter filtered stream and hand back the streaming response

        #param:
            headers: request header for twitter api
            bearer_token: (str) get your own at https://developer.twitter.com/en/portal/dashboard
                          (not used here; the token is already carried by headers)
            expansions: (str) query-string fragment appended directly to the stream url
            fields: (str) query-string fragment appended directly after expansions

        #return:
            the requests response opened with stream=True; the caller iterates
            over it (e.g. with iter_lines) and is responsible for closing it

        #raise:
            Exception when the api answers with anything other than HTTP 200
    '''

    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream" + expansions + fields,
        headers=headers,
        stream=True,
    )
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Cannot get stream (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )

    return response

def send_tweets_to_spark(http_resp, tcp_connection):
    '''
        forward the media urls found in a tweet stream over a tcp connection

        Each line of the stream is parsed as json. Lines that are not valid json
        or that carry no ['includes']['media'] entry are skipped. For every media
        item its 'url' is written to the connection as one utf-8 line.

        Sending is throttled through the module-level settings: after
        max_per_iter urls the function sleeps for window_size, and after
        max_iter such windows it returns.

        #param:
            http_resp: streaming response object providing iter_lines()
            tcp_connection: connected socket the urls are written to

        #return:
            None
    '''
    count = 0
    local_iter = 0

    for line in http_resp.iter_lines():
        try:
            full_tweet = json.loads(line)
            media_items = full_tweet['includes']['media']
        except (ValueError, KeyError, TypeError):
            # ValueError: empty or non-json line (json.JSONDecodeError is a subclass);
            # KeyError/TypeError: payload has no includes/media section
            print('not media attached!')
            continue

        for media in media_items:
            try:
                print("Tweet media url: " + media['url'])
                print ("------------------------------------------")
                # sendall keeps writing until the whole line is out; a plain send
                # may transmit only part of the buffer
                tcp_connection.sendall((media['url'] + '\n').encode('utf-8'))
                count += 1
                if count >= max_per_iter:
                    time.sleep(window_size)
                    count = 0
                    local_iter += 1
            except Exception as exc:
                # best effort: report the failing item and keep streaming, but let
                # KeyboardInterrupt / SystemExit stop the loop
                print("Error: %s" % type(exc))

            if local_iter >= max_iter:
                print('reach max iteration bound')
                return

In [0]:
# query-string fragments that get_stream appends to the stream url, in this order
expansions = "?expansions=attachments.media_keys"
media_fields = "&media.fields=duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width"

# alternative rule set: search all tweets with a, e, i or o (suggested not changed)
# search_rules = [{'value': 'a' + ' has:images',
#                 'tag': 'a'},
#                 {'value': 'e' + ' has:images',
#                 'tag': 'e'},
#                 {'value': 'i' + ' has:images',
#                 'tag': 'i'},
#                 {'value': 'o' + ' has:images',
#                 'tag': 'o'}
#                 ]

# active rule set: tweets with images that mention one of the listed animals
search_rules = [
    {
        'value': '(butterfly has:images) OR (cat has:images) OR (dogs has:images)',
        'tag': 'a',
    },
    {
        'value': '(dogs has:images) OR (elephant has:images) OR (hen has:images)',
        'tag': 'e',
    },
    {
        'value': '(horse has:images) OR (monkey has:images) OR (panda has:images)',
        'tag': 'i',
    },
    {
        'value': '(sheep has:images) OR (spider has:images) OR (squirrel has:images)',
        'tag': 'o',
    },
]

headers = create_headers(bearer_token)

# replace whatever rules are stored on the account with search_rules
# NOTE(review): presumably these three lines can be commented out when
# search_rules has not changed since the last run - confirm
rules = get_rules(headers, bearer_token)
delete = delete_all_rules(headers, bearer_token, rules)
set1 = set_rules(headers, bearer_token, search_rules)

In [0]:
# address this script listens on; the consumer of the media urls connects here
TCP_IP = "localhost"
TCP_PORT = 9017

# the with-blocks close the listening socket and the accepted connection even
# when binding, accepting or streaming raises
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind((TCP_IP, TCP_PORT))
    s.listen(1)
    print("Waiting for TCP connection...")
    conn, addr = s.accept()
    with conn:
        print("Connected... Starting getting tweets.")

        resp = get_stream(headers, bearer_token, expansions, media_fields)
        try:
            send_tweets_to_spark(resp, conn)
        finally:
            # release the streaming http connection once sending stops
            resp.close()