# 1 Define Functions to Interact with the Twitter API

In [1]:
import requests 
import json 
import pandas as pd
import urllib
import os
from os import path
from datetime import datetime as dt

# imports the twitter_secrets python file in which we store the twitter API keys
from twitter_secrets import twitter_secrets as ts

# puts the bearer token in the request header
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers
        
# sets the rules on which tweets to retrieve   
def set_rules(headers, delete, bearer_token, rules):
    payload = {"add": rules}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json=payload,
    )
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))

# retrieves the current set of rules from the API  
def get_rules(headers, bearer_token):
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream/rules", headers=headers
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))
    return response.json()

# tells the API to delete our current rule configuration 
def delete_all_rules(headers, bearer_token, rules):
    if rules is None or "data" not in rules:
        return None

    ids = list(map(lambda rule: rule["id"], rules["data"]))
    payload = {"delete": {"ids": ids}}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json=payload
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    print(json.dumps(response.json()))

# starts the stream, iterates through the lines of the response and for each line calls the save_tweets function and the save_media_to_disk function
def get_stream(headers, set, bearer_token, expansions, fields, save_to_disk, save_path):
    data = []
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream" + expansions + fields, headers=headers, stream=True,
    )
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Cannot get stream (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    i = 0
    for response_line in response.iter_lines():
        i += 1
        if i == max_results:
            break
        else:
            json_response = json.loads(response_line)
            #print(json.dumps(json_response, indent=4, sort_keys=True))
            try:
                save_tweets(json_response)
                if save_to_disk == True:
                    save_media_to_disk(json_response, save_path)
            except (json.JSONDecodeError, KeyError) as err:
                # In case the JSON fails to decode, we skip this tweet
                print(f"{i}/{max_results}: ERROR: encountered a problem with a line of data... \n")
                continue

# appends information from tweets to a dataframe           
def save_tweets(tweet):
    #print(json.dumps(tweet, indent=4, sort_keys=True))
    data = tweet['data']
    includes = tweet['includes']
    media = includes['media']
    for line in media:
        tweet_list.append([data['id'], line['url']])  

# iterates through the media attached to a tweet and saves each media file to the specified directory
def save_media_to_disk(tweet, save_path):
    data = tweet['data']
    #print(json.dumps(data, indent=4, sort_keys=True))
    includes = tweet['includes']
    media = includes['media']
    for line in media:
        media_url = line['url']
        media_key = line['media_key']
        pic = urllib.request.urlopen(media_url)
        file_path = save_path + "\\" + media_key + ".jpg"
        
        if not path.isfile(file_path):
            print(file_path)
            try:
                with open(file_path, 'wb') as localFile:
                    localFile.write(pic.read())
                tweet_list.append(media_key, media_url)
            except Exception as e:
                print('exception when saving media url ' + media_url + ' to path: ' + file_path)

# creates a new directory
def createDir(save_path):
    try:
        os.makedirs(save_path)
    except OSError:
        print ("Creation of the directory %s failed" % save_path)
        if path.exists(savepath):
            print("file already exists")
    else:
        print ("Successfully created the directory %s " % save_path)


# 2 Define the Folder Structure for the Images

In [3]:
# save to disk true/false
save_to_disk = True
 
# saves the images to disk in a new folder path that will be created with the createDir function
if save_to_disk == True: 
    # detect the current working directory and print it
    base_path = os.getcwd()
    print ("The current working directory is %s" % base_path)
    img_dir = '\\twitter\\downloaded_media\\'
    # the write path in which the data will be stored. If it does not yet exist, it will be created
    now = dt.now()
    dt_string = now.strftime("%d%m%Y-%H%M%S")# ddmmYY-HMS
    save_path = base_path + img_dir + dt_string
    createDir(save_path)

The current working directory is C:\Users\Flo\relataly-public-python-API-tutorials
Successfully created the directory C:\Users\Flo\relataly-public-python-API-tutorials\twitter\downloaded_media\02052021-164036 


# 3 Subscribe to the Tweet Streaming Service

In [4]:
# the max number of tweets that will be returned
max_results = 10

# you can adjust the rules if needed
search_rules = [
    {"value": "dog has:images", 
     "tag": "dog pictures", 
     "lang": "en"},
]

# these are the fields that will be delivered with the response
media_fields = "&media.fields=duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width"

# we will retrieve the tweet object extended by the media object
expansions = "?expansions=attachments.media_keys"
tweet_list = []

bearer_token = ts.BEARER_TOKEN
headers = create_headers(bearer_token)
rules = get_rules(headers, bearer_token)
delete = delete_all_rules(headers, bearer_token, rules)
set = set_rules(headers, delete, bearer_token, search_rules)
get_stream(headers, set, bearer_token, expansions, media_fields, save_to_disk, save_path)

df = pd.DataFrame (tweet_list, columns = ['tweetid', 'preview_image_url'])
df

{"data": [{"id": "1388865425992294400", "value": "cat has:images -grumpy", "tag": "cat pictures"}, {"id": "1388865425992294401", "value": "dog has:images", "tag": "dog pictures"}], "meta": {"sent": "2021-05-02T14:41:00.279Z"}}
{"meta": {"sent": "2021-05-02T14:41:01.645Z", "summary": {"deleted": 2, "not_deleted": 0}}}
{"data": [{"value": "dog has:images", "tag": "dog pictures", "id": "1388866121256869898"}], "meta": {"sent": "2021-05-02T14:41:02.800Z", "summary": {"created": 1, "not_created": 0, "valid": 1, "invalid": 0}}}
200
1/10: ERROR: encountered a problem with a line of data... 

2/10: ERROR: encountered a problem with a line of data... 

3/10: ERROR: encountered a problem with a line of data... 

C:\Users\Flo\relataly-public-python-API-tutorials\twitter\downloaded_media\02052021-164036\3_1388866102214549508.jpg
exception when saving media url https://pbs.twimg.com/media/E0Y-XIcUYAQyNdY.jpg to path: C:\Users\Flo\relataly-public-python-API-tutorials\twitter\downloaded_media\0205202

Unnamed: 0,tweetid,preview_image_url
0,1388866111941140481,https://pbs.twimg.com/media/E0Y-XIcUYAQyNdY.jpg
1,1388866111941140481,https://pbs.twimg.com/media/E0Y-XIgUcAEbqCH.jpg
2,1388866111941140481,https://pbs.twimg.com/media/E0Y-XIeVkAAUqbv.jpg
3,1388866111941140481,https://pbs.twimg.com/media/E0Y-XIeUUAAlq3V.jpg
4,1388866115401568260,https://pbs.twimg.com/media/E0PkikXXsAMfaXt.jpg
5,1388866125010702338,https://pbs.twimg.com/media/E0Y-YY4X0AY2rjd.jpg
