<a href="https://colab.research.google.com/github/gladcolor/tweet_downloading_v2/blob/master/tweepy_v2_Ning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages

The official [tweet search program](https://github.com/twitterdev/search-tweets-python/tree/v2) is quite good, but I have not tested it yet. You can try it.


In [None]:
pip install searchtweets-v2

Collecting searchtweets-v2
  Downloading https://files.pythonhosted.org/packages/d6/10/39bc8e59d1dd000fdf393ceb534eb681eef07acf77f8af9b12389fd5c9a5/searchtweets_v2-1.0.7-py3-none-any.whl
Installing collected packages: searchtweets-v2
Successfully installed searchtweets-v2-1.0.7


# Prerequisite functions

In [None]:
# Colab only: mount Google Drive at /content/drive. The token file and the
# download folder configured further down both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import glob
import logging
import tweepy

In [None]:
def set_logger(log_file_path="debug.log", level="INFO"):
    """Configure the root logger to write to the console and to a log file.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers installed by a previous call are removed and closed
    before new ones are attached, so messages are never emitted twice.

    Args:
        log_file_path: Path of the log file to append to.
        level: Logging level name or number accepted by ``Logger.setLevel``
            (use ``"DEBUG"`` for verbose output).

    Returns:
        logging.Logger: The configured root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(level)

    # Drop only the handlers this function installed earlier; handlers that
    # the hosting environment attached to the root logger are left alone.
    for old_handler in list(logger.handlers):
        if getattr(old_handler, "_installed_by_set_logger", False):
            logger.removeHandler(old_handler)
            old_handler.close()

    stream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_file_path)
    for new_handler in (stream_handler, file_handler):
        new_handler._installed_by_set_logger = True
        logger.addHandler(new_handler)
    return logger
    
  

# Re-running this cell would otherwise keep stacking handlers on the root
# logger, so trim the handlers left over from a previous run first.
try:
    while len(logger.handlers) > 1:
        stale_handler = logger.handlers.pop(0)
        # Close the handler so a leftover log-file handle is released.
        stale_handler.close()
except NameError:
    # First run in a fresh kernel: `logger` has not been created yet.
    pass

logger = set_logger()

def get_api_token(token_path):
    """Read the Twitter API credentials from a ``label: value`` text file.

    Every non-blank line is expected to look like ``Bearer Token: XXXX``;
    only the text after the last ``": "`` is kept.

    Args:
        token_path: Path of the text file holding one credential per line.

    Returns:
        list[str] | None: The credential values in file order, or ``None``
        when the file cannot be read (the error is logged, not raised).
    """
    try:
        with open(token_path, "r") as f:
            logger.debug("token_path: %s" % token_path)
            lines = f.readlines()
        # Log only the line count: the lines themselves are secrets and must
        # not end up in the log file.
        logger.debug("Read %d lines from the token file." % len(lines))

        # strip() rather than [:-1]: the last line often has no trailing
        # newline, and slicing would then cut off the final character of the
        # token. Blank lines are skipped so they cannot shift the positions.
        return [line.split(": ")[-1].strip() for line in lines if line.strip()]

    except Exception as e:
        logger.error("Error: %s" % str(e))
        return None

from tqdm import tqdm
import ast


#-------------------- merge results -------------------#          
def find_place_id(row):
    """Extract the place ID from the ``geo`` cell of one tweet row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``"geo"`` entry.
            The entry is either a dict or the string form of a dict such as
            ``"{'place_id': '004ec16c62325149'}"``.

    Returns:
        str: The place ID, or ``""`` when the cell is empty, is not a dict,
        or carries no usable ``place_id``.

    Raises:
        ValueError, SyntaxError: If a non-empty string cell is not a valid
            Python literal.
    """
    cell_text = row["geo"]
    if isinstance(cell_text, dict):
        place_dict = cell_text
    elif isinstance(cell_text, str) and len(cell_text) > 1:
        place_dict = ast.literal_eval(cell_text)
    else:
        # Empty string, NaN, or any other non-parsable cell.
        return ""

    if isinstance(place_dict, dict):
        place_id = place_dict.get("place_id", "")
        if isinstance(place_id, str) and len(place_id) > 1:
            return place_id
    # Always return a string so the derived column has one consistent type.
    return ""

def clean_tweets(column):
    """Replace newline characters with spaces in tweet text.

    Args:
        column: Either a single string or a pandas Series of values.
            Non-string values inside a Series are left untouched.

    Returns:
        The cleaned string, or a new Series with newlines replaced.
    """
    if isinstance(column, str):
        return column.replace('\n', ' ')
    # regex=True makes Series.replace substitute inside each string; without
    # it only cells that are exactly "\n" would be replaced. The result must
    # be returned: with inplace=True the call would return None.
    return column.replace('\n', ' ', regex=True)


def refine_data(df):
    """Add a ``place_id`` column and strip newlines from the ``text`` column.

    Args:
        df: Tweet table with ``geo`` and ``text`` columns. It is modified in
            place.

    Returns:
        The same DataFrame, for call chaining.
    """
    df['place_id'] = df.apply(find_place_id, axis=1)
    # Clean the text column itself; applying clean_tweets row by row would
    # overwrite the tweet text with whatever the function returns per row.
    df['text'] = clean_tweets(df['text'])

    return df

def find_media_row(row, df_media):
    """Collect the media records referenced by one tweet row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with an
            ``"attachments"`` entry, either a dict or the string form of a
            dict such as ``"{'media_keys': ['3_123']}"``.
        df_media: Media table whose key column is named
            ``media_table_media_key``.

    Returns:
        ``""`` when the tweet has no parsable attachments dict; otherwise a
        list with one JSON string per media key found in ``df_media``
        (possibly empty). Keys missing from ``df_media`` are skipped.

    Raises:
        ValueError, SyntaxError: If a non-empty string cell is not a valid
            Python literal.
    """
    cell_text = row["attachments"]
    if isinstance(cell_text, dict):
        attachments_dict = cell_text
    elif isinstance(cell_text, str) and len(cell_text) > 1:
        attachments_dict = ast.literal_eval(cell_text)
    else:
        return ""

    if not isinstance(attachments_dict, dict):
        return ""

    media_rows = []
    if "media_table_media_key" not in df_media.columns:
        # No media table was available, so nothing can be looked up.
        return media_rows

    for key in attachments_dict.get("media_keys", ""):
        key = str(key)
        if len(key) <= 1:
            continue
        matches = df_media[df_media['media_table_media_key'] == key]
        if matches.empty:
            # The referenced media object is not in this table; taking
            # .iloc[0] here would raise IndexError.
            continue
        media_rows.append(matches.iloc[0].to_json())
    return media_rows

def _read_includes_csv(csv_path, key_column):
    """Load one ``*_includes_*.csv`` table, or an empty one holding only ``key_column`` if the file is absent."""
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    return pd.DataFrame(columns=[key_column])


def _prefix_columns(df, prefix):
    """Return ``df`` with ``prefix`` prepended to every column name."""
    return df.rename(columns={name: prefix + name for name in df.columns})


def _newlines_to_spaces(df, column):
    """Replace newlines with spaces inside ``df[column]`` when that column exists."""
    if column in df.columns:
        df[column] = df[column].replace("\n", " ", regex=True)
    return df


def merge_results(saved_path):
    """Join every downloaded page with its includes tables and save one CSV.

    For each ``*_data.csv`` file in ``saved_path`` the matching
    ``*_includes_places/users/tweets/media.csv`` files are left-joined onto
    the tweets (columns prefixed with ``places_table_``, ``users_table_``,
    ``tweets_table_``), and the referenced media records are stored as a list
    of JSON strings in ``media_table_rows``. Includes files that do not exist
    for a page are treated as empty tables.

    Args:
        saved_path: Folder containing the files written by ``save_search``.

    Returns:
        pandas.DataFrame: All pages combined, de-duplicated on tweet ``id``.
        The same table is written to ``<saved_path>/merged.csv``. An empty
        DataFrame is returned (and nothing is written) when the folder holds
        no ``*_data.csv`` file.
    """
    data_files = sorted(glob.glob(os.path.join(saved_path, "*_data.csv")))
    logger.info("Start to merge %d filles." % len(data_files))
    if not data_files:
        logger.warning("No *_data.csv files found in %s ." % saved_path)
        return pd.DataFrame()

    all_df = []
    # Every file is processed; slicing the list here would silently drop
    # all pages but the first.
    for d in tqdm(data_files):
        df_data = pd.read_csv(d).fillna("")
        # A page in which no tweet has a place or an attachment lacks the
        # column altogether; the row-wise helpers below expect it.
        for required in ("geo", "attachments"):
            if required not in df_data.columns:
                df_data[required] = ""
        df_data = refine_data(df_data)

        df_places = _read_includes_csv(d.replace("data.csv", "includes_places.csv"), "id")
        df_users = _read_includes_csv(d.replace("data.csv", "includes_users.csv"), "id")
        df_tweets = _read_includes_csv(d.replace("data.csv", "includes_tweets.csv"), "id")
        df_media = _read_includes_csv(d.replace("data.csv", "includes_media.csv"), "media_key").fillna("")

        df_users = _newlines_to_spaces(df_users, "description")
        df_tweets = _newlines_to_spaces(df_tweets, "text")
        df_media['media_key'] = df_media['media_key'].astype(str)

        df_places = _prefix_columns(df_places, "places_table_")
        df_users = _prefix_columns(df_users, "users_table_")
        df_tweets = _prefix_columns(df_tweets, "tweets_table_")
        df_media = _prefix_columns(df_media, "media_table_")

        df_merged = pd.merge(df_data, df_places, how='left', left_on="place_id", right_on="places_table_id")
        df_merged = pd.merge(df_merged, df_users, how='left', left_on="author_id", right_on="users_table_id")
        df_merged = pd.merge(df_merged, df_tweets, how='left', left_on="id", right_on="tweets_table_id")
        df_merged["media_table_rows"] = df_merged.apply(find_media_row, args=(df_media,), axis=1)

        df_merged = df_merged.fillna("")
        # Keep the result (replace is not in-place) and use regex=True so
        # newlines inside longer strings are replaced as well.
        df_merged = df_merged.replace("\n", " ", regex=True)
        df_merged = df_merged.drop_duplicates(subset=['id'], keep='last')
        all_df.append(df_merged)

    final_df = pd.concat(all_df, ignore_index=True)
    # Files from overlapping downloads can contain the same tweet twice, and
    # pages with different column sets leave NaN gaps after concatenation.
    final_df = final_df.drop_duplicates(subset=['id'], keep='last')
    final_df = final_df.fillna("").reset_index(drop=True)

    final_file = os.path.join(saved_path, "merged.csv")
    final_df.to_csv(final_file, index=False)
    logger.info("Saved merged tweets in %s ." % final_file)

    return final_df

# Set tokens

Put your Twitter API tokens in a ```tweet_api_keys.txt``` file (for example, in the same directory as this notebook), point ```token_path``` in the cell below at that file, and use the following format:
```
Consumer API Key: XXXX
Consumer API Secret Key: XXXX
Bearer Token: XXXX
Access Token: XXXX
Access Token Secret: XXXX
```


In [None]:
# Location of the credentials file (one "label: value" pair per line).
token_path = r'/content/drive/MyDrive/Research/tweets_downloading/tweet_api_keys.txt'

tokens = get_api_token(token_path)

# get_api_token returns None when the file cannot be read; fail here with a
# clear message instead of a cryptic TypeError/IndexError on the lines below.
if not tokens or len(tokens) < 5:
    raise ValueError("Expected 5 API credentials in %s but could not read them." % token_path)

# Order follows the file layout: consumer key, consumer secret, bearer token,
# access token, access token secret.
consumer_key, consumer_secret, bearer_token, access_token, access_token_secret = tokens[:5]

# Download tweets

The following cell is an example query that downloads tweets posted in Australia containing the keyword "vaccine" from 2021-01-01 to 2021-06-01.

Please set ```query```, ```start_time```, ```end_time```, ```saved_path```, and ```max_results``` (10 - 500).

See these pages for how to build a query:

[Building queries for Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#examples)

[Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all)




In [None]:
import requests
import os
import json
import pandas as pd
import time


# ---- Search configuration -------------------------------------------------
# Example: tweets posted in Australia that mention vaccination, from
# 2021-01-01 to 2021-06-01.
#
# Alternative queries built from a single keyword:
#   keyword = "vaccine"
#   query = f"({keyword}) place_country:AU -is:retweet"
#   query = f"({keyword}) place_country:AU"
#   query = f"({keyword})"
query = "(vaccin OR vaccination OR vaccine OR vaccinate) place_country:AU has:geo"

# Time window, ISO 8601 / UTC.
# since_id = "139819805172285849"  # cannot be used together with start/end_time!
start_time = "2021-01-01T00:00:01Z"
end_time = "2021-06-01T00:00:01Z"

# Number of tweets requested per page.
max_results = 500

# Folder that receives the downloaded CSV files.
# To save next to the notebook instead:
#   saved_path = os.path.join(os.getcwd(), "saved_tweets")
saved_path = "/content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin"

# Full-archive search endpoint; the request code below is adapted from
# https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py
search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields

# https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all


def create_headers(bearer_token):
    """Build the HTTP headers that carry the bearer token.

    Args:
        bearer_token: Token placed after ``Bearer`` in the header value.

    Returns:
        dict: A single-entry dict with the ``Authorization`` header.
    """
    return {"Authorization": f"Bearer {bearer_token}"}


def connect_to_endpoint(url, headers, params, timeout=60):
    """Send one GET request and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        headers: HTTP headers, e.g. the result of ``create_headers``.
        params: Query-string parameters of the request.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the download forever.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments when
            the server answers with anything other than HTTP 200.
    """
    # Use the `url` argument; the module-level search_url must not override
    # what the caller asked for.
    response = requests.request("GET", url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def save_search(json_response, saved_path):
    """Write one page of search results to CSV files.

    The tweets go to ``<oldest_id>_<newest_id>_<count>_data.csv`` and every
    entry of the response's ``includes`` object goes to a sibling
    ``..._includes_<key>.csv`` file.

    Args:
        json_response: Decoded response with a ``meta`` object and, when
            there are results, ``data`` and optionally ``includes``.
        saved_path: Output folder; created (with parents) if missing.

    Returns:
        None. Errors are logged with a traceback and not re-raised.
    """
    try:
        # makedirs also creates missing parent folders, which mkdir cannot do.
        os.makedirs(saved_path, exist_ok=True)

        meta = json_response['meta']
        data = json_response.get('data', [])
        if not data:
            # An empty page has neither 'data' nor oldest/newest ids.
            logger.info("No tweets in this page; nothing saved.")
            return
        # 'includes' is read with .get so that a page without any expanded
        # objects still gets its tweets written.
        includes = json_response.get('includes', {})
        basename = f"{meta['oldest_id']}_{meta['newest_id']}_{meta['result_count']}"

        data_filename = os.path.join(saved_path, basename + "_data.csv")
        pd.DataFrame(data).to_csv(data_filename, index=False)
        result_count = str(meta['result_count'])
        logger.info("Saved %s tweets in: %s" % (result_count, data_filename))

        for key, records in includes.items():
            includes_filename = os.path.join(saved_path, basename + f"_includes_{key}.csv")
            pd.DataFrame(records).to_csv(includes_filename, index=False)
    except Exception as e:
        logger.error(e, exc_info=True)

def execute_download(saved_path=os.getcwd()):
    """Download every page of the configured search and save it to disk.

    Reads the module-level ``query``, ``max_results``, ``start_time``,
    ``end_time`` and ``bearer_token`` settings, requests page after page from
    the full-archive search endpoint and hands each page to ``save_search``.

    Args:
        saved_path: Folder that receives the CSV files.

    Returns:
        None, once the response carries no ``next_token``.

    Raises:
        Exception: When the API answers with a client error (HTTP 4xx other
            than 408 and 429); such a request cannot succeed on retry.
            Other failures are logged and retried after 3 seconds.
    """
    search_url = "https://api.twitter.com/2/tweets/search/all"
    headers = create_headers(bearer_token)
    total = 0
    query_params = {
        'query': query,
        "max_results": str(max_results),
        'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id',
        'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld',
        'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
        "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
        "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics",
        "poll.fields": "duration_minutes,end_datetime,id,options,voting_status",
        "start_time": start_time,
        "end_time": end_time,
        # "since_id": since_id,  # cannot be used together with start/end_time!
    }

    while True:
        try:
            json_response = connect_to_endpoint(search_url, headers, query_params)
            # The page is passed on as-is: an empty page has no 'data' key,
            # and indexing it here would raise and retry the request forever.
            save_search(json_response, saved_path)

            total += int(json_response['meta']['result_count'])
            logger.info("Downloaded %s tweets in total." % total)

            next_token = json_response['meta'].get('next_token', "")
            if next_token == "":
                print("No next page! Exit.")
                return

            query_params.update({"next_token": next_token})
            time.sleep(1)

        except Exception as e:
            logger.error(e, exc_info=True)
            # connect_to_endpoint raises Exception(status_code, text).
            status_code = e.args[0] if e.args else None
            if isinstance(status_code, int) and 400 <= status_code < 500 and status_code not in (408, 429):
                # Bad query, bad credentials, forbidden, ...: retrying the
                # identical request would loop forever.
                raise
            time.sleep(3)
            continue


# Run the download first so the CSV pages exist in saved_path, then join them
# with their includes tables into merged.csv and keep the result in merge_df.
execute_download(saved_path=saved_path)
merge_df = merge_results(saved_path)


Saved 500 tweets in: /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1398485693899280386_1399513890791120896_500_data.csv
Downloaded 500 tweets in total.
Saved 499 tweets in: /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1397782512290570242_1398484039409954818_499_data.csv
Downloaded 999 tweets in total.
Saved 500 tweets in: /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1397142598163177472_1397782288872529925_500_data.csv
Downloaded 1499 tweets in total.
Saved 499 tweets in: /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1395717083858571264_1397141570676158473_499_data.csv
Downloaded 1998 tweets in total.
Saved 499 tweets in: /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1394601870304288772_1395715620814036994_499_data.csv
Downloaded 2497 tweets in total.
Saved 499 tweets in: /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1392316309656854534_139460124359979418

No next page! Exit.
/content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/1399217705626980352_1399515802160680961_499_data.csv


100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
Saved merged tweets in /content/drive/MyDrive/Research/tweets_downloading/tweets_AU_vaccin/merged.csv .


Unnamed: 0,reply_settings,id,created_at,text,conversation_id,context_annotations,source,entities,lang,author_id,referenced_tweets,possibly_sensitive,public_metrics,geo,in_reply_to_user_id,attachments,place_id,places_table_country_code,places_table_geo,places_table_country,places_table_place_type,places_table_id,places_table_full_name,places_table_name,users_table_name,users_table_verified,users_table_created_at,users_table_description,users_table_id,users_table_public_metrics,users_table_url,users_table_location,users_table_username,users_table_protected,users_table_profile_image_url,users_table_entities,users_table_pinned_tweet_id,tweets_table_geo,tweets_table_in_reply_to_user_id,tweets_table_reply_settings,tweets_table_id,tweets_table_created_at,tweets_table_text,tweets_table_conversation_id,tweets_table_context_annotations,tweets_table_source,tweets_table_entities,tweets_table_lang,tweets_table_author_id,tweets_table_referenced_tweets,tweets_table_possibly_sensitive,tweets_table_public_metrics,tweets_table_attachments,media_table_rows
0,everyone,1399515802160680961,2021-05-31T23:59:03.000Z,,1399515802160680961,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter for iPad,"{'mentions': [{'start': 3, 'end': 16, 'usernam...",en,1150553499500814339,"[{'type': 'retweeted', 'id': '1399252743546572...",False,"{'retweet_count': 3, 'reply_count': 0, 'like_c...",,,,,,,,,,,,Elaine McKay,False,2019-07-14T23:51:45.000Z,"Left wing nut job, life member of a union, wor...",1150553499500814339,"{'followers_count': 8729, 'following_count': 6...",,Living on Awabakal land,ElaineM11584892,False,https://pbs.twimg.com/profile_images/138066380...,,,,,,,,,,,,,,,,,,,
1,everyone,1399515521129668608,2021-05-31T23:57:56.000Z,,1399515521129668608,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter for iPad,"{'mentions': [{'start': 3, 'end': 18, 'usernam...",en,260041830,"[{'type': 'retweeted', 'id': '1399511270135455...",False,"{'retweet_count': 4, 'reply_count': 0, 'like_c...",,,,,,,,,,,,Brad Hooper 🔴⚪️💙,False,2011-03-03T02:47:37.000Z,Architect and urban designer from Central Vict...,260041830,"{'followers_count': 2029, 'following_count': 1...",https://t.co/xCKLemqBym,Australia,bradhooperarch,False,https://pbs.twimg.com/profile_images/117462315...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",,,,,,,,,,,,,,,,,,
2,everyone,1399514829417639937,2021-05-31T23:55:11.000Z,,1399514829417639937,,Twitter for iPad,"{'annotations': [{'start': 22, 'end': 29, 'pro...",en,15523288,"[{'type': 'retweeted', 'id': '1399472499880955...",False,"{'retweet_count': 5, 'reply_count': 0, 'like_c...",,,,,,,,,,,,BarbVee,False,2008-07-22T00:30:33.000Z,,15523288,"{'followers_count': 1038, 'following_count': 1...",,,barbvee,False,https://pbs.twimg.com/profile_images/57028873/...,,,,,,,,,,,,,,,,,,,
3,everyone,1399514544435712003,2021-05-31T23:54:03.000Z,,1399514544435712003,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter for iPhone,"{'mentions': [{'start': 3, 'end': 18, 'usernam...",en,1450640060,"[{'type': 'retweeted', 'id': '1399511270135455...",False,"{'retweet_count': 4, 'reply_count': 0, 'like_c...",,,,,,,,,,,,Dr Darren Egberts,False,2013-05-23T05:39:37.000Z,"Darren is Principal of Sacred Heart College, K...",1450640060,"{'followers_count': 130, 'following_count': 38...",,"Kyneton, Victoria",djegberts,False,https://pbs.twimg.com/profile_images/748860631...,,,,,,,,,,,,,,,,,,,
4,everyone,1399513890791120896,2021-05-31T23:51:27.000Z,,1399487042426310656,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter for iPhone,"{'mentions': [{'start': 0, 'end': 13, 'usernam...",en,865490222745702401,"[{'type': 'replied_to', 'id': '139948704242631...",False,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",{'place_id': '004ec16c62325149'},5.49807e+08,,004ec16c62325149,AU,"{'type': 'Feature', 'bbox': [152.668522848, -2...",Australia,city,004ec16c62325149,"Brisbane, Queensland",Brisbane,Robyn Grote,False,2017-05-19T08:52:03.000Z,retired hybrid physio/medical engineer. Polio ...,865490222745702401,"{'followers_count': 503, 'following_count': 14...",,"Yuggerah land, Brisbane",Robyn_Grote,False,https://pbs.twimg.com/profile_images/108221421...,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,everyone,1399219639901573120,2021-05-31T04:22:13.000Z,,1399219639901573120,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter for iPad,"{'annotations': [{'start': 26, 'end': 42, 'pro...",en,1150553499500814339,"[{'type': 'retweeted', 'id': '1399181975571763...",False,"{'retweet_count': 239, 'reply_count': 0, 'like...",,,,,,,,,,,,Elaine McKay,False,2019-07-14T23:51:45.000Z,"Left wing nut job, life member of a union, wor...",1150553499500814339,"{'followers_count': 8729, 'following_count': 6...",,Living on Awabakal land,ElaineM11584892,False,https://pbs.twimg.com/profile_images/138066380...,,,,,,,,,,,,,,,,,,,
498,everyone,1399218577618575374,2021-05-31T04:17:59.000Z,,1399218577618575374,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter for Android,"{'annotations': [{'start': 26, 'end': 42, 'pro...",en,1231854405130932225,"[{'type': 'retweeted', 'id': '1399181975571763...",False,"{'retweet_count': 239, 'reply_count': 0, 'like...",,,,,,,,,,,,💧Ian Williams,False,2020-02-24T08:12:50.000Z,progressive atheist baby boomer🐀 2050 is too l...,1231854405130932225,"{'followers_count': 557, 'following_count': 27...",,,IanWill94901388,False,https://abs.twimg.com/sticky/default_profile_i...,,,,,,,,,,,,,,,,,,,
499,everyone,1399218066945282050,2021-05-31T04:15:58.000Z,,1399218066945282050,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter Web App,"{'annotations': [{'start': 26, 'end': 42, 'pro...",en,1281269582,"[{'type': 'retweeted', 'id': '1399181975571763...",False,"{'retweet_count': 239, 'reply_count': 0, 'like...",,,,,,,,,,,,Dean Boggainho Boggainho,False,2013-03-19T19:10:40.000Z,,1281269582,"{'followers_count': 91, 'following_count': 540...",,,deanbogga,False,https://pbs.twimg.com/profile_images/560655661...,,,,,,,,,,,,,,,,,,,
500,everyone,1399218004366270465,2021-05-31T04:15:43.000Z,,1399218004366270465,"[{'domain': {'id': '123', 'name': 'Ongoing New...",Twitter Web App,"{'annotations': [{'start': 26, 'end': 42, 'pro...",en,67636066,"[{'type': 'retweeted', 'id': '1399181975571763...",False,"{'retweet_count': 239, 'reply_count': 0, 'like...",,,,,,,,,,,,(what)Katy(did),False,2009-08-21T15:36:24.000Z,"Medical doctor, RANZCP trainee, passionate abo...",67636066,"{'followers_count': 496, 'following_count': 95...",,Dja Dja Wurrung Country,mckaty,False,https://pbs.twimg.com/profile_images/139845295...,,1.2999e+18,,,,,,,,,,,,,,,,,
