# 0. Package

In [1]:
import requests 
import json 
import pandas as pd
import urllib
import os
import time
from os import path
import progressbar
from imagededup.methods import PHash

### Use the following command to install the imagededup package if it is not already installed  
`pip install imagededup`  or  `pip install imagededup --user`

## Dataset and Model

Find the corresponding labels and model on the website below:  
https://www.kaggle.com/datasets/piyushkumar18/animal-image-classification-dataset

# 1. Train Set Search Query

In [2]:
def bearer_oauth(r):
    """
        Auth callback for requests: stamp the bearer-token credentials
        onto an outgoing request.

        The token is read from the module-level `bearer_token` variable at
        call time, so it must be defined before the first request is sent.

        #param:
            r: request object exposing a mutable `headers` mapping

        #return:
            the same request object, with both headers set
    """

    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "6998final",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value

    return r

def connect_to_endpoint(url, timeout=30):
    '''
        Send an authenticated GET request to the given url and return the
        decoded JSON body.

        #param:
            url: (str) full query url with user-defined conditions
            timeout: (float or tuple) seconds to wait for the server before
                giving up; without it a stalled connection would block the
                caller indefinitely

        #return:
            json format response with provided fields

        #raise:
            Exception(status_code, text) when the response status is not 200
            requests exceptions (e.g. a timeout) propagate unchanged
    '''

    response = requests.get(url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def save_to_file(tweet, save_path):
    '''
        Save every image attached to a search response into save_path as
        <media_key>.jpg. Files that already exist are not downloaded again.

        #param:
            tweet: (json) search response; images are read from
                   tweet['includes']['media']
            save_path: (str) local path to save img

        #return:
            None
    '''

    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    createDir(save_path)
    try:
        media = tweet['includes']['media']
    except (KeyError, TypeError):
        # response carries no attachments
        return

    for img in media:
        try:
            media_url = img['url']
            file_path = save_path + "/" + img['media_key'] + ".jpg"
        except (KeyError, TypeError):
            # entries without a direct url or media key cannot be saved
            continue

        # Check before downloading so an existing image costs no network call.
        if os.path.exists(file_path):
            continue

        try:
            # Read the full body first so a failed download leaves no empty file.
            with urllib.request.urlopen(media_url) as pic:
                content = pic.read()
            with open(file_path, 'wb') as localFile:
                localFile.write(content)
        except Exception:
            # Best effort: one broken image must not abort the whole batch.
            continue

def createDir(save_path):
    '''
        Make sure the directory save_path exists, creating intermediate
        directories as needed. Any OSError raised while creating it
        (including "already exists") is silently ignored.

        #param:
            save_path: (str) local path to save img

        #return:
            None
    '''

    if os.path.isdir(save_path):
        # nothing to create
        return

    try:
        os.makedirs(save_path)
    except OSError:
        return
        
def bar(tag):
    """
        Build a progress bar whose label mentions the given tag.

        #param:
            tag: (str) display info at the bar

        #return:
            a progressbar.ProgressBar showing label, percentage, bar,
            elapsed time and ETA
    """

    label = 'Loading ' + tag + ' images'
    layout = [
        label,
        ' ', progressbar.Percentage(),
        ' ', progressbar.Bar('#'),
        ' ', progressbar.Timer(),
        ' ( ', progressbar.ETA(), ' ) ',
    ]
    return progressbar.ProgressBar(widgets=layout)

def deduplicate(img_path):
    """
        Remove perceptually duplicated images from a directory.

        Every image is hashed with PHash. For each group of duplicates the
        first image encountered is kept and the others are deleted, together
        with a same-named .png sibling if one exists. Keys ending in .png are
        never chosen as the image to keep.

        #param:
            img_path: (str) local path of the directory holding the images

        #return:
            None. Any error is printed rather than raised, so a failed
            deduplication does not interrupt the download loop.
    """

    try:
        phasher = PHash()
        # generate hash value for all img in current dir
        encodings = phasher.encode_images(image_dir=img_path)

        # map every image name to the list of images considered its duplicates
        duplicates = phasher.find_duplicates(encoding_map=encodings)

        # A set gives O(1) membership checks instead of scanning growing lists.
        to_delete = set()
        for img, img_list in duplicates.items():
            if img.endswith(".png"):
                continue
            if img in to_delete:
                # already scheduled for removal as a duplicate of a kept image
                continue
            to_delete.update(img_list)

        # delete file
        for like in to_delete:
            like_src = os.path.join(img_path, like)
            # splitext handles extensions of any length (.jpg, .jpeg, ...)
            png_src = os.path.splitext(like_src)[0] + ".png"
            for target in (like_src, png_src):
                if os.path.exists(target):
                    os.remove(target)

    except Exception as e:
        print(e)

## Train Set Image Loading

In [8]:
# Twitter API bearer token read by bearer_oauth; replace it with your own before running
bearer_token = ""
# Per-tag image count at which test() deduplicates the folder and may stop early
max_results = 10_000

def test():
    """
        Download images for each configured tag through the Twitter recent
        search endpoint and deduplicate every tag folder.

        For each tag the hashtag query (#tag) is paged through first; when it
        runs out of pages the plain keyword query (tag) is used. Paging stops
        after `iteration` requests, when both queries are exhausted, or when
        the folder still holds at least `max_results` (module-level) images
        after deduplication.

        #param:
            None

        #return:
            None
    """

    # endpoint of twitter recent search
    endpoint = "https://api.twitter.com/2/tweets/search/recent"

    # default settings
    media_fields = '&media.fields=duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width'
    expansion = '&expansions=attachments.media_keys'

    # total number of results to return, should be in [10, 100]
    number = '&max_results=100'
    # local path to save img
    save_path = './dataset'
    # number of iterations per tag, suggested more than 400
    iteration = 500

    # replace with your demand tags, e.g. 'butterfly', 'cats', 'cow', 'dogs',
    # 'elephant', 'hen', 'horse', 'monkey', 'panda', 'sheep', 'spider', 'squirrel'
    tags = ['horse', 'monkey', 'panda']

    for tag in tags:
        # get progressbar
        mybar = bar(tag)
        tag_dir = save_path + '/' + tag
        query = '?query=%23{}%20has%3Aimages'.format(tag)
        used_plain_query = False
        # True while the folder holds files saved since the last deduplication
        needs_dedup = False
        token = ''
        json_response = None

        for t in mybar(range(iteration)):
            if t > 0:
                next_token = json_response.get('meta', {}).get('next_token')
                if next_token:
                    token = '&next_token={}'.format(next_token)
                elif used_plain_query:
                    print('all records loaded for tag {}!'.format(tag))
                    break
                else:
                    # hashtag query exhausted: restart paging with the plain keyword
                    used_plain_query = True
                    query = '?query={}%20has%3Aimages'.format(tag)
                    token = ''

            search_url = endpoint + query + expansion + media_fields + number + token
            json_response = connect_to_endpoint(search_url)
            save_to_file(json_response, tag_dir)
            needs_dedup = True

            # deduplicate when records are more than max_results
            if len(os.listdir(tag_dir)) >= max_results:
                deduplicate(tag_dir)
                needs_dedup = False
                # still enough unique images after deduplication: tag is done
                if len(os.listdir(tag_dir)) >= max_results:
                    break

        # Images saved after the last deduplication must be cleaned as well.
        if needs_dedup:
            deduplicate(tag_dir)

if __name__ == '__main__':
    # Entry point: run the recent-search image download for every tag configured in test()
    test()

Loading horse images 100% |##########| Elapsed Time: 0:23:54 ( Time: 0:23:54 ) 
Loading monkey images 100% |#########| Elapsed Time: 0:08:43 ( Time: 0:08:43 ) 
Loading panda images 100% |##########| Elapsed Time: 0:27:41 ( Time: 0:27:41 ) 


### check img numbers within each tag path

In [9]:
# Print the number of files stored under each tag folder of the dataset.
# NOTE: this rebinds the module-level name `path` (also imported from os above).
path = './dataset'
if os.path.isdir(path):
    for subfile in sorted(os.listdir(path)):
        sub_path = os.path.join(path, subfile)
        # Skip stray files such as .DS_Store; only tag folders are counted.
        if not os.path.isdir(sub_path):
            continue
        print('total img under '+subfile)
        print(len(os.listdir(sub_path)))
else:
    print('dataset directory not found: ' + path)

# pending = ['butterfly','cats','dogs','horse','panda']

total img under horse
5805
total img under monkey
1880
total img under panda
5382


# 2. Streaming Functions

In [None]:
def create_headers(bearer_token):
    '''
        Build the request headers that authorize a call with the given token.

        #param:
            bearer_token: (str) get your own at https://developer.twitter.com/en/portal/dashboard

        #return:
            dict with a single "Authorization" entry of the form "Bearer <token>"
    '''

    return {"Authorization": f"Bearer {bearer_token}"}
        
def set_rules(headers, bearer_token, rules):
    '''
        Register new stream rules and print the server's answer.

        #param:
            headers: request header for twitter api
            bearer_token: (str) not used by this function; credentials come from headers
            rules: (json) with rule value and corresponding tag (optional)

        #return:
            None

        #raise:
            Exception when the response status is not 201
    '''

    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"
    response = requests.post(rules_url, headers=headers, json={"add": rules})

    if response.status_code == 201:
        print('following rules have been set: ')
        print(json.dumps(response.json()))
        return

    raise Exception(
        "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
    )
    
def get_rules(headers, bearer_token):
    '''
        Fetch the stream rules currently registered, print them and return them.

        #param:
            headers: request header for twitter api
            bearer_token: (str) not used by this function; credentials come from headers

        #return:
            json format rules as returned by the rules endpoint

        #raise:
            Exception when the response status is not 200
    '''

    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"
    response = requests.get(rules_url, headers=headers)

    status = response.status_code
    if status != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(status, response.text)
        )

    current_rules = response.json()
    print('current stream rules: ')
    print(json.dumps(current_rules))

    return current_rules

def delete_all_rules(headers, bearer_token, rules):
    '''
        Delete every stream rule listed in rules["data"] and print the
        server's answer. Does nothing when there are no rules to delete.

        #param:
            headers: request header for twitter api
            bearer_token: (str) not used by this function; credentials come from headers
            rules: (json) rules as returned by get_rules, or None

        #return:
            None

        #raise:
            Exception when the response status is not 200
    '''

    has_rules = rules is not None and "data" in rules
    if not has_rules:
        return None

    rule_ids = [rule["id"] for rule in rules["data"]]
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json={"delete": {"ids": rule_ids}},
    )

    if response.status_code == 200:
        print(json.dumps(response.json()))
        return None

    raise Exception(
        "Cannot delete rules (HTTP {}): {}".format(
            response.status_code, response.text
        )
    )

def get_stream(headers, bearer_token, expansions, fields, save_to_disk, save_path, total):
    '''
        Read the filtered stream and process tweets until `total` of them
        have been handled, then close the connection.

        #param:
            headers: request header for twitter api
            bearer_token: (str) not used by this function; credentials come from headers
            expansions: (str) attachment requirement
            fields: (str) media fields requirement
            save_to_disk: (bool) true for save to disk and false for not
            save_path: (str) local path to save img
            total: (int) total number of results to save

        #return:
            None

        #raise:
            Exception when the stream cannot be opened (status is not 200)
    '''

    stream_url = "https://api.twitter.com/2/tweets/search/stream" + expansions + fields

    # The context manager releases the long-lived connection on every exit path.
    with requests.get(stream_url, headers=headers, stream=True) as response:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )

        if total <= 0:
            return

        num = 0
        for response_line in response.iter_lines():
            if not response_line:
                # blank keep-alive line, not a tweet
                continue

            try:
                json_response = json.loads(response_line)
                if save_to_disk:
                    save_media_to_disk_stream(json_response, save_path)
            except (json.JSONDecodeError, KeyError):
                # malformed line or tweet without the expected fields: skip it
                continue

            num += 1
            # Stop right away instead of blocking until one more line arrives.
            if num >= total:
                break
        
def save_media_to_disk_stream(tweet, save_path):
    '''
        Save the image(s) attached to a streamed tweet into
        <save_path>/<tag of the first matching rule>/<media_key>.jpg.
        Files that already exist are not downloaded again.

        #param:
            tweet: (json) metadata of the streaming tweet
            save_path: (str) local path to save img

        #return:
            None

        #raise:
            KeyError when the tweet has no 'includes'/'media' section or no
            tagged matching rule
    '''

    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    media = tweet['includes']['media']

    save_path += '/' + tweet["matching_rules"][0]["tag"]
    createDir(save_path)

    for line in media:
        media_url = line.get('url')
        media_key = line.get('media_key')
        if not media_url or not media_key:
            # entries without a direct url or media key are skipped, so the
            # remaining attachments of the tweet are still saved
            continue

        file_path = save_path + "/" + media_key + ".jpg"
        # Check before downloading so an existing image costs no network call.
        if os.path.exists(file_path):
            continue

        try:
            # Read the full body first so a failed download leaves no empty file.
            with urllib.request.urlopen(media_url) as pic:
                content = pic.read()
            with open(file_path, 'wb') as localFile:
                localFile.write(content)
        except Exception as e:
            # Best effort: a single failed download must not end the stream.
            print(e)
    
def createDir(save_path):
    '''
        Create the directory save_path (with intermediate directories) and
        print a confirmation when it was actually created. Any OSError
        raised while creating it (including "already exists") is silently ignored.

        #param:
            save_path: (str) local path to save img

        #return:
            None
    '''

    created = True
    try:
        os.makedirs(save_path)
    except OSError:
        created = False

    if created:
        print ("Successfully created the directory %s " % save_path)

# Main Call

## 1. Query images with multiple match rules per group; `max_results` is the total across the group's tags

In [None]:
def main():
    '''
        Stream images for several groups of hashtags. For every group the
        account's existing stream rules are replaced by one rule per tag,
        then max_results tweets are read from the stream and their images
        saved under ./dataset/<tag>.

        #param:
            None

        #return:
            None
    '''

    # Never commit a real token: it is read from the TWITTER_BEARER_TOKEN
    # environment variable (or replace the empty fallback with your own).
    bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")

    # total max results to save
    max_results = 1000

    # default settings
    media_fields = "&media.fields=duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width"
    expansions = "?expansions=attachments.media_keys"

    # replace with your own labels with prefix #
    # notice here within a group there should be no more than five tags
    tags = [
        ['#butterfly', '#cat', '#cow', '#dog'],
        ['#horse', '#elephant', '#hen', '#monkey'],
        ['#panda', '#sheep', '#spider', '#squirrel'],
    ]

    # replace with your own dir location to save img
    save_path = './dataset'

    headers = create_headers(bearer_token)
    for group in tags:
        # adjust the rules if needed
        search_rules = [
            {'value': tag + ' has:images', 'tag': tag.replace('#', '')}
            for tag in group
        ]

        # Start from a clean slate so only this group's rules feed the stream.
        rules = get_rules(headers, bearer_token)
        delete_all_rules(headers, bearer_token, rules)
        # set_rules takes (headers, bearer_token, rules); passing a fourth
        # argument raised TypeError.
        set_rules(headers, bearer_token, search_rules)
        get_stream(headers, bearer_token, expansions, media_fields, save_to_disk=True, save_path=save_path, total=max_results)

# df = pd.DataFrame (tweet_list, columns = ['tweet_id', 'preview_image_url', 'tweet_text'])
# df.to_csv('streaming_' + tag + '.csv')
# Run the multi-rule streaming download when this code executes as the main module
if __name__ == '__main__':
    main()

## 2. Query with a single tag at a time; `max_results` is the limit per tag

In [None]:
def main():
    '''
        Stream images one hashtag at a time. For every tag the account's
        existing stream rules are replaced by a single rule for that tag,
        then max_results tweets are read from the stream and their images
        saved under ./dataset/<tag>.

        The token is read from the module-level `bearer_token` variable.

        #param:
            None

        #return:
            None
    '''

    tags = ['#butterfly', '#cat', '#cow', '#dog', '#horse']
    media_fields = "&media.fields=duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width"
    expansions = "?expansions=attachments.media_keys"
    max_results = 1000
    save_path = './dataset'

    headers = create_headers(bearer_token)
    for tag in tags:
        # adjust the rules if needed
        search_rules = [
            {'value': tag + ' has:images',
             'tag': tag.replace('#', '')
            }
        ]

        # Start from a clean slate so only this tag's rule feeds the stream.
        rules = get_rules(headers, bearer_token)
        delete_all_rules(headers, bearer_token, rules)
        # set_rules takes (headers, bearer_token, rules); passing a fourth
        # argument raised TypeError.
        set_rules(headers, bearer_token, search_rules)
        get_stream(headers, bearer_token, expansions, media_fields, save_to_disk=True, save_path=save_path, total=max_results)
    
# Run the single-tag streaming download when this code executes as the main module
if __name__ == '__main__':
    main()