In [35]:
# ! pip install emoji --upgrade
# ! pip install twarc
# ! pip install twarc-csv

In [4]:
# Load packages

In [1]:
import os
import glob
import logging
from tqdm import tqdm
import ast
import requests
import os
import json
import pandas as pd
import time

import emoji


# Prerequisite functions

In [75]:
def set_logger(log_file_path="debug.log", level="INFO"):
# def set_logger(log_file_path="debug.log", level="DEBUG"):
    logger = logging.getLogger()
    logger.setLevel(level)
    scream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_file_path)
    logger.addHandler(scream_handler)
    logger.addHandler(file_handler)
    return logger
    


try:
    # print(len(logger.handlers))
    while len(logger.handlers) > 1:
        logger.handlers.pop(0)
        # print(len(logger.handlers))
except:
    pass

logger = set_logger()



def is_too_many_requests(json_response, start_time):
    is_too_many = False
    now = time.perf_counter()    
    elapsed_time = now - start_time
    time_window = 15 * 60 # seconds, 15 min
    title = json_response.get('title', "")
    if title =='Too Many Requests':            
        is_to_many = True
        print(f"To many requests, will sleep {int(time_window - elapsed_time)} seconds.")
        time.sleep(time_window)
        
    return is_too_many, elapsed_time


#-------------------- merge results -------------------#          
def find_place_id(row):
    # print(row)
    
    cell_text = row.get("geo", "")
    # print(cell_text)
    if len(cell_text) > 1:
        place_dict = ast.literal_eval(cell_text)
    else:
        return ""
    # print(place_dict)
    if isinstance(place_dict, dict):
        place_id = place_dict.get("place_id", "")
        if len(place_id) > 1:
            return place_id

def clean_tweets(row):
    
    text = row['text'].replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').strip()
    return text

def find_poll_id(row):
    
    text = row['text'].replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').strip()
    return text

def refine_data(df):
    df['place_id'] = df.apply(find_place_id, axis=1)
    df['text'] = df.apply(clean_tweets, axis=1)
    
    return df

def find_media_row(row, df_media):
    cell_text = row["attachments"]
    if len(cell_text) > 1:
        attachments_dict = ast.literal_eval(cell_text)
    else:
        return ""
    
    if isinstance(attachments_dict, dict):
        media_keys = attachments_dict.get("media_keys", "")
        media_rows = []
        # print(df_media)
        # print(attachments_dict)
        for key in media_keys:
            key = str(key)
            if len(key) > 1:
                print(key)
                print(df_media['media_table_media_key'])
                row = df_media[df_media['media_table_media_key']==key].iloc[0]#.to_json(orient='values')[1:-1]
                row = json.dumps(row)
                print(df_media[df_media['media_table_media_key']==key])
                media_rows.append(row)
        # print(media_rows)
        return media_rows
    return ""


def find_poll_row(row, df_poll):
    cell_text = row["attachments"]
    if len(cell_text) > 1:
        attachments_dict = ast.literal_eval(cell_text)
    else:
        return ""
    
    if isinstance(attachments_dict, dict):
        poll_ids = attachments_dict.get("poll_ids", "")
        poll_rows = []
        # print(df_poll)
        # print(attachments_dict)
        for i in poll_ids:
            i = str(i)
            if len(i) > 1:
                # print(i)
                # print(df_poll['polls_table_id'])
                print("df_poll['polls_table_id']", df_poll['polls_table_id'])
                row = df_poll[df_poll['polls_table_id']==i].iloc[0]#.to_json(orient='values')[1:-1]
                print("row in find_poll_row():", row)
                row = json.dumps(row)
#                 print(df_poll[df_poll['polls_table_media_id']==i])
                poll_rows.append(row)
        # print(poll_rows)
        return poll_rows
    return ""

def get_lonlat(row):
    row["lon"] = ""
    row["lat"] = ""
#     print('row[places_table_geo]:', row["places_table_geo"])
    if len(row["places_table_geo"]) > 1:
        geo_dict = ast.literal_eval(row["places_table_geo"])
#         print('geo_dict:', geo_dict)
        bbox = geo_dict.get("bbox", [])
        if len(bbox) == 4:
            row["lon"] = (bbox[0] + bbox[2]) / 2
            row["lat"] = (bbox[1] + bbox[3]) / 2
    return row

def merge_results(saved_path, is_zipped=False):
    if is_zipped:
        suffix = '.csv.gz'
    else:
        suffix = '.csv'
    
    data_files = glob.glob(os.path.join(saved_path, f"*_data{suffix}"))
    logger.info("Start to merge %d filles." % len(data_files))
    all_df = []
    for d in tqdm(data_files[:]):
        try:
            df_data = pd.read_csv(d)
            print(d)
            df_data = df_data.fillna("")
            df_data = refine_data(df_data)

            df_merged = df_data        

            # process places file
            places_csv = d.replace("data.csv", "includes_places.csv")
            if os.path.exists(places_csv):
                df_places = pd.read_csv(places_csv).fillna("")
                new_column_name = {name: "places_table_" + name for name in df_places.columns}
                df_places = df_places.rename(columns=new_column_name)        
                df_merged = pd.merge(df_merged, df_places, how='left', left_on="place_id", right_on="places_table_id")

            # process tweets file
            tweets_csv = d.replace("data.csv", "includes_tweets.csv")
            if os.path.exists(tweets_csv):
                df_tweets = pd.read_csv(tweets_csv).fillna("")
                df_tweets["text"] = df_tweets["text"].str.replace("\n", " ")
                new_column_name = {name: "tweets_table_" + name for name in df_tweets.columns}
                df_tweets = df_tweets.rename(columns=new_column_name)      
                df_merged = pd.merge(df_merged, df_tweets, how='left', left_on="id", right_on="tweets_table_id")

            # process users file
            users_csv = d.replace("data.csv", "includes_users.csv")
            if os.path.exists(tweets_csv):
                df_users = pd.read_csv(users_csv).fillna("")
                df_users["description"] = df_users["description"].str.replace("\n", " ")
                new_column_name = {name: "users_table_" + name for name in df_users.columns}
                df_users = df_users.rename(columns=new_column_name)     
                df_merged = pd.merge(df_merged, df_users, how='left', left_on="author_id", right_on="users_table_id")        

            # process media file
            media_csv = d.replace("data.csv", "includes_media.csv")
            if os.path.exists(media_csv):
                df_media = pd.read_csv(media_csv).fillna("")
                df_media['media_key'] = df_media['media_key'].astype(str)
                new_column_name = {name: "media_table_" + name for name in df_media.columns}
                df_media = df_media.rename(columns=new_column_name)  
                df_merged["media_table_rows"] = df_merged.apply(find_media_row, args=(df_media,), axis=1)

           # process poll file
            poll_csv = d.replace("data.csv", "includes_polls.csv")
            if os.path.exists(poll_csv):
                df_poll = pd.read_csv(poll_csv).fillna("")
                df_poll['poll_ids'] = df_poll['poll_ids'].astype(str)
                new_column_name = {name: "poll_table_" + name for name in df_poll.columns}
                df_poll = df_poll.rename(columns=new_column_name)  
                df_merged["poll_table_rows"] = df_merged.apply(find_poll_row, args=(poll_csv,), axis=1)
                
        except Exception as e:
            print("Error in merge_results for loop: ", e)
            logger.error(e, exc_info=True)
        

        df_merged = df_merged.fillna("")
        df_merged.replace("\n", " ")
        df_merged = df_merged.drop_duplicates(subset=['id'], keep='last')
        # print(len(df_places))
        # return df_merged
        all_df.append(df_merged)

    print("\nGenerating final CSV file, including %d small CSV files." % len(all_df))
    print("\nPlease wait...")

    final_df = pd.concat(all_df).fillna("")
    final_df = final_df.apply(get_lonlat, axis=1).reset_index()
    final_file = os.path.join(saved_path, "merged.csv")
    final_df.to_csv(final_file, index=False)
    logger.info("\nSaved merged tweets in %s ." % final_file)

    return final_df

# Set tokens

Put your Twitter API tokens in the ```tweet_api_keys.txt``` file in the same directory of this notebook in the following format:
```
Consumer API Key: XXXX
Consumer API Secret Key: XXXX
Bearer Token: XXXX
Access Token: XXXX
Access Token Secret: XXXX
```

In [36]:
def get_api_token(token_path):    
    try:
        with open(token_path, "r") as f:
            logger.debug("token_path: %s" % token_path)
            lines = f.readlines()
            logger.debug("lines in the file: %s" % lines)

            lines = [line.split(": ")[-1][:-1] for line in lines]
        return lines

    except Exception as e:
        logger.error("Error: %s" % str(e))


def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint(endpoint , headers, params):
    response = requests.request("GET", endpoint, headers=headers, params=params)
    # print(response.status_code)
#     if response.status_code != 200:
#         raise Exception(response.status_code, response.text)   
    return response.json()

token_path = r'J:\Research\tweet_download\tweet_api_keys.txt'
token_path = r'K:\Research\tweet_downloading\python_code\tweet_api_keys.txt'

tokens = get_api_token(token_path)

consumer_key = tokens[0]
consumer_secret = tokens[1]
bearer_token = tokens[2]
access_token = tokens[3]
access_token_secret = tokens[4]

# Count tweets

See the API document:
https://developer.twitter.com/en/docs/twitter-api/tweets/counts/quick-start/recent-tweet-counts

In [79]:
def get_tweet_count(query, start_time, end_time, granularity='day', next_token=None):
    print("Counting tweets, please wait...")
    
    start_timer = time.perf_counter()  
    
    tweet_count_total = 0
    endpoint = r'https://api.twitter.com/2/tweets/counts/all'
    query_params = {'query': query, \
                    "start_time": start_time, \
                    "end_time": end_time, \
                    "granularity": granularity, \
                    "next_token": next_token, \
                    }
    headers = create_headers(bearer_token)
    
    next_token = 'Start'
    
    page_cnt = 0
    
    while next_token is not None:
        json_response = connect_to_endpoint(endpoint, headers, query_params)
        is_too_many, elapsed_time = is_too_many_requests(json_response, start_timer)
        next_token = json_response['meta'].get('next_token', None)        
        tweet_count = json_response['meta']['total_tweet_count']
        tweet_count_total += tweet_count
        page_cnt += 1
        query_params['next_token'] = next_token
        
        if page_cnt % 20 == 0:
            print(f"    current tweet count: {tweet_count_total}")
        
#         print(f"next_token: {next_token}. total_tweet_count: {tweet_count_total}")
    
    return tweet_count_total#, json_response

query = "telemedicine  OR telehealth  OR telecare "

start_time = "2019-01-01T00:00:00Z"
end_time   = "2019-12-31T23:59:59Z"

tweet_count_total= get_tweet_count(query=query, start_time=start_time, end_time=end_time, granularity='day')
tweet_count_total

Counting tweets, please wait...


310211

# Download tweets

The following cell is a exmaple query to download tweets in Australia with a keyword of "vaccine" since 2021-01-01 to 2021-06-01.

Please set ```query```, ```start_time```, ```end_time```, ```saved_path```, and ```max_results``` (10 - 500).

See these pages to building a query: 

[Building queries for Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#examples)

[Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all)




In [80]:
# a exmaple query to download tweets in Australia with a keyword of "vaccine" since 2020-01-01
# keyword = "vaccine"

# query = f"({keyword}) place_country:AU -is:retweet"
# query = f"({keyword}) place_country:AU"
# query = "(vaccin OR vaccination OR vaccine OR vaccinate) place_country:AU"

query = "telemedicine  OR telehealth  OR telecare"

# query = f"({keyword})"
start_time = "2019-01-01T00:00:00Z"
end_time   = "2019-12-31T23:59:59Z"
max_results = 500   # max_results can be 500 if do not request the field: context_annotations

# since_id = "139819805172285849"  # cannot used with start/end_time!


# borrow from Twitter:
# https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py
search_url = "https://api.twitter.com/2/tweets/search/all"


# saved_path = os.path.join(os.getcwd(), "saved_tweets")
saved_path = r"downloaded_tweets_test"
os.makedirs(saved_path, exist_ok=True)

In [53]:
# def create_headers(bearer_token):
#     headers = {"Authorization": "Bearer {}".format(bearer_token)}
#     return headers


# def connect_to_endpoint(endpoint , headers, params):
#     response = requests.request("GET", endpoint, headers=headers, params=params)
#     # print(response.status_code)
# #     if response.status_code != 200:
# #         raise Exception(response.status_code, response.text)   
#     return response.json()

def save_search(json_response, 
                saved_path,
               is_zipped=False,
               ):
    try:
        if not os.path.exists(saved_path):
            os.mkdir(saved_path)
            
        if is_zipped:
            suffix = '.csv.gz'
        else:
            suffix = '.csv'
            

        meta = json_response['meta']
        data = json_response['data']
        includes = json_response['includes']
        basename = f"{meta['oldest_id']}_{meta['newest_id']}_{meta['result_count']}"

        data_filename = os.path.join(saved_path, basename + f"_data{suffix}")
        df = pd.DataFrame(data)
        for c in df.columns:
            df[c] = df[c].astype(str)
            df[c] = df[c].str.replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').str.strip()
        df.to_csv(data_filename, index=False)
        result_count = meta['result_count']
        result_count = str(result_count)
        logger.info("Saved %s tweets in: %s" % (result_count, data_filename))

        for key in includes.keys():
            includes_filename = os.path.join(saved_path, basename + f"_includes_{key}{suffix}")
            df = pd.DataFrame(includes[key])
            for c in df.columns:
                df[c] = df[c].astype(str)
                df[c] = df[c].str.replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').str.strip()
            df.to_csv(includes_filename, index=False)
            
#         return df
        
    except Exception as e:
        logger.error(e, exc_info=True)

def execute_download(saved_path=os.getcwd(),
                     
                    
                    ):
    
    max_results = 500
    
    chunk_size = 100000 # tweets
    
    has_context_annotations = False
    
    start_timer = time.perf_counter()

    next_token = 'start'
    search_url = "https://api.twitter.com/2/tweets/search/all"
    headers = create_headers(bearer_token)
    total = 0
    query_params = {'query': query, \
                    "max_results": str(max_results), \
                    'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id', \
                    
                     # HAVE context_annotations, max_results can be only 100
#                     'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \
                    
                    # NO context_annotations,  max_results can be 500
                    'tweet.fields': 'attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \

                    'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type', \
                    "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',\
                    "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics", \
                    "poll.fields": "duration_minutes,end_datetime,id,options,voting_status", \
                    "start_time": start_time, \
                    "end_time": end_time, \
                    # "since_id":since_id, \  # cannot used with start/end_time!
                    }
    
    if has_context_annotations:
        query_params['tweet.fields'] = query_params['tweet.fields'] + ",context_annotations"
        if max_results > 100:
            print(f"max_results has set to 100 when requesting context_annotations. ")
            
    
    tweet_count_total = get_tweet_count(query, start_time, end_time, granularity='day', next_token=None)
    print(f"Found {tweet_count_total} tweets for query: {query}. Period: {start_time} - {end_time}")
    
    while next_token != "":
        try:           
            json_response = connect_to_endpoint(search_url, headers, query_params)
            is_too_many, elapsed_time = is_too_many_requests(json_response, start_timer)
#             df = pd.DataFrame(json_response['data'])
            save_search(json_response, saved_path)
            
            total += int(json_response['meta']['result_count'])
            logger.info("Downloaded %s tweets in total." % total)


            next_token = json_response['meta'].get('next_token', "")
            if next_token == "":
                print("No next page! Exit.")
                return

            query_params.update({"next_token": next_token})            
#             time.sleep(1)
        
        except Exception as e:
            logger.error(e, exc_info=True)
            
            print(e)
            
            now = time.perf_counter()
            
            time_window = 15 * 60 # seconds
            
            if 'Too Many Requests' in json_response.text:
                elapsed_time = int(now - start_timer)
                need_to_wait_time = time_window - elapsed_time
                print(f'Too Many Requests, waiting for {need_to_wait_time} seconds.')
                time.sleep(need_to_wait_time)
                
            continue

saved_path = r'K:\Research\tweet_downloading\python_code\downloaded_tweets_test'
execute_download(saved_path=saved_path)
# merge_df = merge_results(saved_path)
# merge_df

Counting tweets, please wait...
Found 441 tweets for query: telemedicine  OR telehealth  OR telecare. Period: 2021-11-29T20:00:01Z - 2021-11-30T00:00:01Z


Saved 436 tweets in: K:\Research\tweet_downloading\python_code\downloaded_tweets_test\1465410276778553347_1465470516379021313_436_data.csv
Downloaded 436 tweets in total.


No next page! Exit.


# Testing merge_df

In [76]:
# execute_download(saved_path=saved_path)
merge_df = merge_results(saved_path)
merge_df

Start to merge 1 filles.
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]Object of type Series is not JSON serializable
Traceback (most recent call last):
  File "C:\Users\N\AppData\Local\Temp/ipykernel_35216/1153263485.py", line 188, in merge_results
    df_merged["media_table_rows"] = df_merged.apply(find_media_row, args=(df_media,), axis=1)
  File "D:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 7768, in apply
    return op.get_result()
  File "D:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 185, in get_result
    return self.apply_standard()
  File "D:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 276, in apply_standard
    results, res_index = self.apply_series_generator()
  File "D:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 290, in apply_series_generator
    results[i] = self.f(v)
  File "D:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 110, in f
    retu

K:\Research\tweet_downloading\python_code\downloaded_tweets_test\1465410276778553347_1465470516379021313_436_data.csv
3_1465467373977223171
0     3_1465467373977223171
1     3_1465467107760652301
2     3_1465465488297844738
3     3_1465464733759275012
4     3_1465464266773966856
              ...          
57    3_1465410510749380612
58    3_1465410510950703109
59    3_1465410292305825797
60    3_1465410276359086095
61    3_1465410274786263045
Name: media_table_media_key, Length: 62, dtype: object
Error in merge_results for loop:  Object of type Series is not JSON serializable

Generating final CSV file, including 1 small CSV files.

Please wait...


PermissionError: [Errno 13] Permission denied: 'K:\\Research\\tweet_downloading\\python_code\\downloaded_tweets_test\\merged.csv'

In [54]:
next_token = 'start'
search_url = "https://api.twitter.com/2/tweets/search/all"
headers = create_headers(bearer_token)
total = 0
query_params = {'query': query, \
                "max_results": str(max_results), \
                'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id', \
                'tweet.fields': 'attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \
                'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type', \
                "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',\
                "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics", \
                "poll.fields": "duration_minutes,end_datetime,id,options,voting_status", \
                "start_time": start_time, \
                "end_time": end_time, \
                # "since_id":since_id, \  # cannot used with start/end_time!
                }


In [55]:
json_response = connect_to_endpoint(search_url, headers, query_params)

In [56]:
#json_response.keys()   # ['data', 'includes', 'meta']
json_response['meta']

{'newest_id': '1465470516379021313',
 'oldest_id': '1465410276778553347',
 'result_count': 435}

In [57]:
json_response['includes'].keys()  # dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

In [55]:
# json_response['includes']['tweets']

# Check merged dataframe

In [2]:
csv_file1 = r'K:\Research\tweet_downloading\telecare_all.csv'
csv_file2 = r'K:\Research\tweet_downloading\telecare_all2.csv'
# csv_file = r'K:\Research\tweet_downloading\python_code\downloaded_tweets_test\merged.csv'
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

# df2

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
import numpy as np

df2['year'] = df2['year'].astype(float, errors='ignore').astype(int, errors='ignore')
df2['month'] = df2['month'].astype(float,errors='ignore').astype(int, errors='ignore')

df2['year'] = pd.to_numeric(df2['year'],errors='coerce').replace(np.nan, 0, regex=True)
df2['month'] = pd.to_numeric(df2['month'],errors='coerce').replace(np.nan, 0, regex=True)

df2 = df2[df2['year'] > 2018]
df2 = df2[df2['year'] < 2022]
df2 = df2[df2['month'] > 0]
df2 = df2[df2['month'] < 13]

df2['year'].unique(), df2['month'].unique(), len(df2)

(array([2019]), array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), 264150)

In [6]:
import numpy as np

df1['year'] = df1['year'].astype(float, errors='ignore').astype(int, errors='ignore')
df1['month'] = df1['month'].astype(float,errors='ignore').astype(int, errors='ignore')

df1['year'] = pd.to_numeric(df1['year'],errors='coerce').replace(np.nan, 0, regex=True)
df1['month'] = pd.to_numeric(df1['month'],errors='coerce').replace(np.nan, 0, regex=True)

df1 = df1[df1['year'] > 2018]
df1 = df1[df1['year'] < 2022]
df1 = df1[df1['month'] > 0]
df1 = df1[df1['month'] < 13]

df1['year'].unique(), df1['month'].unique(), len(df1)

(array([2019., 2020., 2021.]),
 array([11., 12.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
 2312260)

In [8]:
df_all = pd.concat([df1, df2], axis=0)
len(df_all)

2576410

In [11]:
df_all['year'] = df_all['year'].astype(int)
df_all['month'] = df_all['month'].astype(int)

df_all.to_csv(r'K:\Research\tweet_downloading\telecar.csv.gz', index=False)

In [12]:
df_all2 = pd.read_csv(r'K:\Research\tweet_downloading\telecar.csv.gz')

df_all2

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,tweetid,userid,username,postdate,message,geoType,longitude,latitude,place,placeBboxwest,...,year,month,geo,country_code,country,tweet_lang,message_en,message_cn,sentiment,topic
0,1.192478e+18,8.630710e+17,dejaehyun,2019-11-07T16:25:11.000Z,RT @DrKehMDCU43: Telemedicine มันมาเร็วกว่าที่...,,,,,,...,2019,11,,,,th,,,0.0000,
1,1.192478e+18,1.304426e+08,diving_news,2019-11-07T16:25:05.000Z,American Well to buy Aligned Telehealth in beh...,,,,,,...,2019,11,,,,en,,,0.2732,
2,1.192478e+18,3.301612e+09,jing2Inwza,2019-11-07T16:25:01.000Z,RT @DrKehMDCU43: Telemedicine มันมาเร็วกว่าที่...,,,,,,...,2019,11,,,,th,,,0.0000,
3,1.192478e+18,2.376639e+09,ppcallmeh,2019-11-07T16:24:53.000Z,RT @DrKehMDCU43: Telemedicine มันมาเร็วกว่าที่...,,,,,,...,2019,11,,,,th,,,0.0000,
4,1.192478e+18,1.128195e+18,Omar_Ziyadeh,2019-11-07T16:24:53.000Z,RT @AlignedTH: We are excited to announce that...,,,,,,...,2019,11,,,,en,,,0.4003,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576405,1.192471e+18,6.027252e+08,_imlobo,2019-11-07T15:55:28.000Z,RT @DrKehMDCU43: Telemedicine มันมาเร็วกว่าที่...,,,,,,...,2019,11,,,,th,,,0.0000,
2576406,1.192471e+18,1.367045e+09,ning_pb,2019-11-07T15:55:23.000Z,RT @DrKehMDCU43: Telemedicine มันมาเร็วกว่าที่...,,,,,,...,2019,11,,,,th,,,0.0000,
2576407,1.192471e+18,9.588306e+17,DME_Health,2019-11-07T15:55:00.000Z,RT @_timos_: In Alaska; Telehealth is Popular ...,,,,,,...,2019,11,,,,en,,,0.4215,
2576408,1.192471e+18,1.150765e+18,ilyeunsung,2019-11-07T15:54:58.000Z,RT @DrKehMDCU43: Telemedicine มันมาเร็วกว่าที่...,,,,,,...,2019,11,,,,th,,,0.0000,


In [13]:
df_all2['year'].unique(), df_all2['month'].unique(), len(df_all2)

(array([2019, 2020, 2021], dtype=int64),
 array([11, 12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64),
 2576410)

In [15]:
df_all2.groupby(['year', 'month']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,tweetid,userid,username,postdate,message,geoType,longitude,latitude,place,placeBboxwest,...,bboxtype,placeid,geo,country_code,country,tweet_lang,message_en,message_cn,sentiment,topic
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019,1,25114,25114,25114,25114,25114,344,344,344,344,344,...,0,344,344,344,344,25114,0,0,25114,0
2019,2,24386,24386,24386,24386,24386,443,443,443,443,443,...,0,443,443,443,443,24386,0,0,24386,0
2019,3,26457,26457,26457,26457,26457,447,447,447,447,447,...,0,447,447,447,447,26457,0,0,26457,0
2019,4,32316,32316,32316,32316,32316,534,534,534,534,534,...,0,534,534,533,533,32316,0,0,32316,0
2019,5,26848,26848,26848,26848,26848,393,393,393,393,393,...,0,393,393,393,393,26848,0,0,26848,0
2019,6,20657,20657,20657,20657,20657,316,316,316,316,316,...,0,316,316,315,315,20657,0,0,20657,0
2019,7,25276,25276,25276,25276,25276,250,250,250,250,250,...,0,250,250,250,250,25276,0,0,25276,0
2019,8,24964,24964,24964,24964,24964,218,218,218,218,218,...,0,218,218,218,218,24964,0,0,24964,0
2019,9,23903,23903,23903,23903,23903,290,290,290,290,290,...,0,290,290,290,290,23903,0,0,23903,0
2019,10,27928,27928,27928,27928,27928,341,341,341,341,341,...,0,341,341,341,341,27928,0,0,27928,0


In [None]:
s = ' male feminist.
 centre-left liberal.
 NeverBernie
 #StillWithHer'