In [1]:
# importing the requests library
import requests
import json
import os
import time

import pandas as pd
import numpy as np
from dotenv import dotenv_values

In [2]:
# Read the API credentials from the local .env file.
# A missing BEARER_TOKEN entry raises KeyError right here.
config = dotenv_values(".env")
BEARER_TOKEN = config['BEARER_TOKEN']
# path where twitter files will be stored
path = "Data/twitter/"
# Create the output directory up front: the fetch loop opens files inside it
# in write mode, which fails with FileNotFoundError if the directory is missing.
os.makedirs(path, exist_ok=True)

In [3]:
# Endpoint that the fetch loop sends its GET requests to.
url = "https://api.twitter.com/2/tweets/search/all"

# Search expression; further operators (e.g. -from:username) can be appended.
query = "earthquake -minor, -is:reply -is:retweet"

# Time window of the search.
start_time = "2021-10-17T00:00:00.000Z"
end_time = "2021-10-17T23:59:59.000Z"

# Value sent as the 'max_results' parameter (kept as a string).
max_results = "500"

# Extra fields requested for tweets and users.
# geo produces unreliable results. Kept anyway to simulate dirty data.
tweet_fields = "created_at,author_id,geo"
# location is same as geo
user_fields = 'username,location'
expansions = 'author_id'

# Running index used to number the output files of the fetch loop.
file_counter = 0

# Collect the query parameters in a dict that is passed to requests as `params`.
query_params = {
    'query': query,
    'tweet.fields': tweet_fields,
    'user.fields': user_fields,
    'start_time': start_time,
    'end_time': end_time,
    'max_results': max_results,
    'expansions': expansions,
}

In [4]:
# define headers for authorization
headers = {"Authorization": "Bearer " + BEARER_TOKEN}

print("Starting to fetch data")

# Request result pages one after another until the response no longer
# carries a 'next_token'. Every page is dumped to its own numbered file.
while True:
    # get results according to url and query; from the second iteration on,
    # query_params contains the 'next_token' of the previous page
    response = requests.request("GET", url, headers=headers, params=query_params)

    # any non-200 answer aborts the whole run with status code and body
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    # create json out of result
    json_response = response.json()

    # write data into txt
    with open(path + "twitter_file_" + str(file_counter) + ".txt", "w") as outfile:
        outfile.write(json.dumps(json_response, indent=4))
    file_counter += 1

    # check if more data available, if yes continue process
    next_token = json_response.get('meta', {}).get('next_token')
    if next_token is None:
        # Last page reached: reset the state so the cell can be run again.
        # pop() with a default is used instead of `del` because the key does
        # not exist when the very first page was also the last one.
        file_counter = 0
        query_params.pop('next_token', None)
        break

    query_params['next_token'] = next_token
    print("Fetching next few tweets, next_token: ", query_params['next_token'])
    # pause between requests
    time.sleep(4)

print("DONE")

Starting to fetch data
Fetching next few tweets, next_token:  b26v89c19zqg8o3fpds9t9fy2y1081q5ndl16bsdgsjnh


KeyboardInterrupt: 

In [3]:
import glob
import os

# create list of all file names with path
# The directory parts are passed separately so os.path.join picks the right
# separator; a literal "Data\\twitter" only works on Windows.
file_list = glob.glob(os.path.join(os.getcwd(), "Data", "twitter", "*.txt"))

# parse every file as JSON (each one is expected to hold one saved response)
tweets = []
for file in file_list:
    with open(file) as f:
        tweets.append(json.load(f))

# create df out of the data
# One frame per page is collected and concatenated once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
# Pages without 'data' or 'includes'/'users' are skipped instead of raising
# KeyError.
tweet_frames = [pd.json_normalize(page['data']) for page in tweets if page.get('data')]
user_frames = [
    pd.json_normalize(page['includes']['users'])
    for page in tweets
    if page.get('includes', {}).get('users')
]
df_tweets = pd.concat(tweet_frames) if tweet_frames else pd.DataFrame()
df_users = pd.concat(user_frames) if user_frames else pd.DataFrame()

# the user id becomes the join key; afterwards keep one row per tweet id
df_users = df_users.rename(columns={'id': 'author_id'})
df = (
    pd.merge(df_tweets, df_users, on='author_id')
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)



In [29]:
# Show `df` as the cell output (the notebook renders a bare last expression).
df

Unnamed: 0,created_at,text,author_id,id,geo.place_id,geo.coordinates.type,geo.coordinates.coordinates,withheld.copyright,withheld.country_codes_x,username,name,location,withheld.country_codes_y
0,2021-10-17T23:59:54.000Z,"#Earthquake M4.7 CANARY ISLANDS, SPAIN REGION ...",809537352909656064,1449887926985363456,,,,,,EQAlerts,Earthquake Monitor,,
1,2021-10-17T23:58:02.000Z,Moderate magnitude 4.3 #earthquake 24 km north...,809537352909656064,1449887457193963525,,,,,,EQAlerts,Earthquake Monitor,,
2,2021-10-17T23:35:21.000Z,#Earthquake M4.4 Mexico: 82 Km Al Oeste De Chi...,809537352909656064,1449881750532939776,,,,,,EQAlerts,Earthquake Monitor,,
3,2021-10-17T23:12:10.000Z,"#Earthquake M4.0 MINAHASA, SULAWESI, INDONESIA...",809537352909656064,1449875917761884164,,,,,,EQAlerts,Earthquake Monitor,,
4,2021-10-17T23:02:48.000Z,"#Earthquake M3.6 ISLAND OF HAWAII, HAWAII 4min...",809537352909656064,1449873557748596739,,,,,,EQAlerts,Earthquake Monitor,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4953,2021-10-17T00:11:48.000Z,"Earthquake kills 3 in Bali, Indonesia, destroy...",2282930023,1449528535132119043,,,,,,7SealsOfTheEnd,7 Seals Of The End,News & Bible verses USA,
4954,2021-10-17T00:08:42.000Z,4.8 Magnitude Earthquake Strikes Indonesia’s B...,367694191,1449527755733753857,,,,,,ANC_News2,Sanjeev Dev Malik,Gurgaon,
4955,2021-10-17T00:08:00.000Z,"Moderate earthquake rocks Bali, killing at lea...",1313407652202975232,1449527580844077059,,,,,,LindaMi14118735,Linda Miranda,,
4956,2021-10-17T00:05:38.000Z,"A terrible #earthquake hits #Indonesia, causin...",757114028489580544,1449526984086855681,,,,,,News_Disaster1,نُذر العذاب nibiru,منتديات البشرى الإسلاميه,


In [4]:
# create df with usernames and number of postings
account_df = df.username.value_counts()
lst_a = account_df.index.tolist()
lst_v = account_df.tolist()

# Building the frame from a dict guarantees that both columns exist even when
# there are no accounts at all.
df_a = pd.DataFrame({'username': lst_a, 'value': lst_v})

# keep only the accounts with more than MIN_BOT_POSTS postings, sorted by
# their number of postings (ascending); these accounts are excluded below
MIN_BOT_POSTS = 10
df_a = df_a.loc[df_a['value'] > MIN_BOT_POSTS].sort_values(by=['value'])
lst_bots = df_a.username.tolist()

# add -from: operator to each entry in list
append_str = '-from:'
lst_filter = [append_str + sub for sub in lst_bots]

# convert list to a string suitable for the query
filter_names_query = ' '.join(lst_filter)

In [5]:
# Show the space-separated '-from:' exclusion string as the cell output.
filter_names_query

'-from:quakeupdates -from:jojo2727 -from:MonitorSismico -from:MyComicalLife -from:news_sokuho_bot -from:DiariosRobot -from:EN_NERV -from:GDACS -from:earthquake_jp -from:EQAlerts -from:j1_quake -from:iSachinSrivstva -from:VolcanoEWS -from:ChileAlertaApp -from:earthb0t -from:sexy_vegetables -from:zishin3255 -from:everyEarthquake -from:MapQuake -from:swap_bot_bash -from:eq_map -from:eq_map_es -from:eq_map_ww -from:SEISMOinfo -from:VegaBajaWx -from:WatchOurCity -from:Keith_Event -from:SismoDetector -from:cvb_223 -from:ExBulletinUk -from:EMSC -from:StoixeioJewelry -from:megamodo -from:earthquakevt -from:QuakeBotter -from:twtaka_jp -from:EarthquakeTw -from:ENSO1998 -from:eq_map_ww2 -from:eq_map_es2'

In [35]:
# Endpoint that the fetch loop sends its GET requests to.
url = "https://api.twitter.com/2/tweets/search/all"

# Same search expression as before, extended by the '-from:' exclusions
# held in filter_names_query.
query = "earthquake -minor, -is:reply -is:retweet " + filter_names_query

# Time window of the search.
start_time = "2020-12-01T00:00:00.000Z"
end_time = "2020-12-31T23:59:59.000Z"

# Value sent as the 'max_results' parameter (kept as a string).
max_results = "500"

# Extra fields requested for tweets and users.
# geo produces unreliable results. Kept anyway to simulate dirty data.
tweet_fields = "created_at,author_id,geo"
# location is same as geo
user_fields = 'username,location'
expansions = 'author_id'

# Defined here but not part of query_params below.
place_country = 'JP'

# Running index used to number the output files of the fetch loop.
file_counter = 0

# Collect the query parameters in a dict that is passed to requests as `params`.
query_params = {
    'query': query,
    'tweet.fields': tweet_fields,
    'user.fields': user_fields,
    'start_time': start_time,
    'end_time': end_time,
    'max_results': max_results,
    'expansions': expansions,
}

In [36]:
# define headers for authorization
headers = {"Authorization": "Bearer " + BEARER_TOKEN}

print("Starting to fetch data")

# Request result pages one after another until the response no longer
# carries a 'next_token'. Every page is dumped to its own numbered file.
while True:
    # get results according to url and query; from the second iteration on,
    # query_params contains the 'next_token' of the previous page
    response = requests.request("GET", url, headers=headers, params=query_params)

    # any non-200 answer aborts the whole run with status code and body
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    # create json out of result
    json_response = response.json()

    # write data into txt
    with open(path + "twitter_file_" + str(file_counter) + ".txt", "w") as outfile:
        outfile.write(json.dumps(json_response, indent=4))
    file_counter += 1

    # check if more data available, if yes continue process
    next_token = json_response.get('meta', {}).get('next_token')
    if next_token is None:
        # Last page reached: reset the state so the cell can be run again.
        # Without removing the token, a re-run would send the stale
        # 'next_token' of the finished run with its first request.
        file_counter = 0
        query_params.pop('next_token', None)
        break

    query_params['next_token'] = next_token
    print("Fetching next few tweets, next_token: ", query_params['next_token'])
    # pause between requests
    time.sleep(4)

print("DONE")

Starting to fetch data
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdmqzy88mlrbl6lgpsu4iee7p9
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdmq3mi88heg2rnp6c46o5pqbh
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdknrsku7ek14fz611dhkge2yl
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdkngx47trxbyij0i6tm1255kt
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdkn5ym504fb19hgyxidlrn5a5
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdkmkmv4fczjgb2v8rfmcuvmnx
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdkm9mvhqvfzo6h15as1ol26il
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdkm9ibu3o03v2gjts7c7fozul
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdkm9ia4pm8iq3gpkjayh6p5z1
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdij1e5ctja8umofgw95huw5ml
Fetching next few tweets, next_token:  b26v89c19zqg8o3foshtdihu8fn5tv91dvwnmlnthbl31
Fetching next few tweets, next_token:  b26

KeyboardInterrupt: 

## Creating the data frame

In [5]:
# Parse the 'created_at' column with pd.to_datetime and store the result back
# in the same column (the column of `df` is overwritten in place).
df['created_at'] = pd.to_datetime(df['created_at'])

In [6]:
# Show `df` as the cell output (the notebook renders a bare last expression).
df

Unnamed: 0,created_at,id,text,withheld.copyright,withheld.country_codes,withheld.scope
0,2021-02-28 23:38:16+00:00,1366170888387846144,#europe #switzerland #swiss #lugano #travel #f...,,,
1,2021-02-28 23:30:24+00:00,1366168909682991107,@petersankoff @lawandchocolate I have always w...,,,
2,2021-02-28 23:27:40+00:00,1366168219367714818,"@IdeRetz @PoliticsForAlI @MetroUK Yes, it is J...",,,
3,2021-02-28 23:13:59+00:00,1366164776670355456,UK has the Brazilian Covid19 variant in its mi...,,,
4,2021-02-28 23:11:47+00:00,1366164223353643015,RT @Snishaa2: ༺✿☆✿\r💞🅷🅰🅿🅿🆈 🆃🅷🆄🆁🆂🅳🅰🆈💞\r✿\r☆✿༻\n...,,,
...,...,...,...,...,...,...
441,2008-02-10 19:04:47+00:00,696885762,@warzabidul Sounds like an exciting week. I lo...,,,
442,2007-10-02 08:01:33+00:00,306691052,MaltaMedia.com: New travel regulations to Swit...,,,
443,2007-06-27 15:22:29+00:00,123179132,"Booking travel. Denver, New York, Switzerland...",,,
444,2007-04-11 08:38:16+00:00,24449061,Vraiment fâchée que Switzerland Travel Center ...,,,


In [136]:
# Derive calendar year and month from the tweet timestamps; the column is
# parsed once and the result is used for both new columns.
created_ts = pd.to_datetime(df.created_at)
df['year'] = created_ts.dt.year
df['month'] = created_ts.dt.month

# Number of rows per (year, month) pair, as a flat frame with a 'count' column.
data = df.groupby(["year", "month"]).size().reset_index(name='count')

In [139]:
# Write `data` to twitter_data.csv (UTF-8, without the index column).
data.to_csv("twitter_data.csv", encoding="UTF-8", index=False)

In [5]:
# The bearer token is read from the environment so that the secret is not
# stored in the notebook. To set it in your terminal run the following line:
# export BEARER_TOKEN='<your_bearer_token>'
# NOTE(review): a literal token used to be written in this cell; treat it as
# leaked and revoke/regenerate it.
bearer_token = os.environ.get("BEARER_TOKEN")

search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields

query = 'switzerland'
tweet_fields = 'created_at,author_id'
start_time = '2021-10-13T00:00:00.000Z'
end_time = '2021-10-13T23:59:59.000Z'
max_results = '10'


query_params = {'query': query, 'tweet.fields': tweet_fields,
                'start_time': start_time, 'end_time': end_time, 'max_results': max_results}


def bearer_oauth(r):
    """
    Method required by bearer token authentication.

    Sets the Authorization and User-Agent headers on the request object `r`
    and returns the same object. The token is taken from `bearer_token`, or
    from the BEARER_TOKEN environment variable at call time when that is empty.

    Raises:
        RuntimeError: if no token is available.
    """
    token = bearer_token or os.environ.get("BEARER_TOKEN")
    if not token:
        raise RuntimeError(
            "BEARER_TOKEN is not set; export it before running this cell")

    r.headers["Authorization"] = f"Bearer {token}"
    r.headers["User-Agent"] = "v2FullArchiveSearchPython"
    return r


def connect_to_endpoint(url, params):
    """
    Send a GET request to `url` with `params` and return the decoded JSON body.

    The status code is printed. Any status other than 200 raises an Exception
    carrying the status code and the response text.
    """
    # use the url argument; the global search_url was used here before, which
    # silently ignored whatever the caller passed in
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def main():
    """Run the search defined by `query_params` and print the returned tweets."""
    json_response = connect_to_endpoint(search_url, query_params)
    # a response without a 'data' key is printed as an empty list
    print(json.dumps(json_response.get('data', []), indent=4, sort_keys=True))


if __name__ == "__main__":
    main()

200
[
    {
        "author_id": "884330274850471936",
        "created_at": "2021-10-13T23:59:58.000Z",
        "id": "1448438395118800897",
        "text": "RT @overwater001: New COVID cases in the last 24 hours\nUK 42,776\nGermany 5,818\nNetherlands 3,716\nItaly 2,772\nPoland 2,640\nAustria 2,614\nLit\u2026"
    },
    {
        "author_id": "21986927",
        "created_at": "2021-10-13T23:59:50.000Z",
        "id": "1448438359232294918",
        "text": "RT @earthcurated: A summer evening in Lauterbrunnen, Switzerland \ud83c\udde8\ud83c\udded https://t.co/hGLqmGSKQj"
    },
    {
        "author_id": "339879512",
        "created_at": "2021-10-13T23:59:14.000Z",
        "id": "1448438209063661570",
        "text": "@sportsdrenched @DigiEntertain There are levels.\n\nThe company I work for makes Automation machines all over the world.  \n\nIt\u2019s almost cliche, but the basic entry level machines are made in China.\n\nMid level in Italy &amp; Argentina.\n\nTop level sophisticated 