In [1]:
import psycopg2
import psycopg2.extras
import pandas as pd

import requests
import json
import os
import time

import numpy as np
from dotenv import dotenv_values

### Connection to AWS DB

In [2]:
# Load the connection details for the RDS database from the local .env file.
# Plain indexing is used on purpose: a missing setting fails right here with a
# KeyError instead of surfacing later as an obscure connection error.
config = dotenv_values(".env")
HOST_RDS = config['HOST_RDS']
DBNAME_RDS = config['DBNAME_RDS']
USER_RDS = config['USER_RDS']
PASSWORD_RDS = config['PASSWORD_RDS']


# Open the database connection that every later cell relies on.
try:
    conn = psycopg2.connect(host=HOST_RDS, dbname=DBNAME_RDS, user=USER_RDS, password=PASSWORD_RDS)
except psycopg2.Error as e:
    print("Error: Could not make connection to the Postgres database")
    print(e)
    # Re-raise: without a connection `conn` is never defined, and every
    # following cell would only fail with a misleading NameError.
    raise

In [3]:
# Open the cursor through which all later cells issue their SQL.
try:
    cur = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not get cursor to the Database")
    print(e)
    # Re-raise: continuing without `cur` would only postpone the failure to
    # the first cell that tries to execute a statement.
    raise

# Autocommit is essential here: no cell calls conn.commit(), so without it
# none of the statements below would be persisted.
conn.set_session(autocommit=True)

In [7]:
# Start from a clean slate: remove tables left over from a previous run.
# NOTE: this deletes every tweet and user stored so far.
cur.execute("DROP TABLE IF EXISTS tweets;")
cur.execute("DROP TABLE IF EXISTS tweets_user;")

# Recreate both tables so the INSERT statements further down have a target on
# a fresh "Restart & Run All". The column order must match the positional
# VALUES(...) lists used by those inserts.
cur.execute("""CREATE TABLE IF NOT EXISTS tweets (
            text text, 
            author_id bigint,
            id bigint,
            created_at text);""")


cur.execute("""CREATE TABLE IF NOT EXISTS tweets_user (
            id bigint, 
            username text, 
            name text, 
            location text);""")

# Table names kept as variables for use elsewhere in the notebook.
table_name_t = 'tweets'
table_name_u = 'tweets_user'

### Accessing Twitter API

In [5]:
# Directory in which the Twitter files will be stored
path = "Data/twitter/"

# Twitter API bearer token, read from the same .env configuration as the DB settings
BEARER_TOKEN = config['BEARER_TOKEN']

In [6]:
# Accounts to leave out of the search; each one becomes a "-from:<account>"
# term, and the terms are joined with single spaces into one query fragment.
_excluded_accounts = [
    "quakeupdates", "jojo2727", "MonitorSismico", "MyComicalLife",
    "news_sokuho_bot", "DiariosRobot", "EN_NERV", "GDACS",
    "earthquake_jp", "EQAlerts", "j1_quake", "iSachinSrivstva",
    "VolcanoEWS", "ChileAlertaApp", "earthb0t", "sexy_vegetables",
    "zishin3255", "everyEarthquake", "MapQuake", "swap_bot_bash",
    "eq_map", "eq_map_es", "eq_map_ww", "SEISMOinfo",
    "VegaBajaWx", "WatchOurCity", "Keith_Event", "SismoDetector",
    "cvb_223", "ExBulletinUk", "EMSC", "StoixeioJewelry",
    "megamodo", "earthquakevt", "QuakeBotter", "twtaka_jp",
    "EarthquakeTw", "ENSO1998", "eq_map_ww2", "eq_map_es2",
]
filter_names_query = " ".join("-from:" + account for account in _excluded_accounts)

In [54]:
# Endpoint and authorization header for the search request
url = "https://api.twitter.com/2/tweets/search/all"
headers = {"Authorization": "Bearer " + BEARER_TOKEN}

# Search text: the earthquake keywords plus the account exclusions.
# NOTE(review): the comma after "-minor" looks unintended - confirm how the API treats it.
query = "earthquake -minor, -is:reply -is:retweet " + filter_names_query

# Time window, page size and the extra fields to request
start_time = "2021-11-15T00:00:00.000Z"
end_time = "2021-11-15T12:49:59.000Z"
max_results = "500"
tweet_fields = "created_at,author_id"
user_fields = 'username,location'
expansions = 'author_id'

# All request parameters collected in one dict
query_params = {
    'query': query,
    'tweet.fields': tweet_fields,
    'user.fields': user_fields,
    'start_time': start_time,
    'end_time': end_time,
    'max_results': max_results,
    'expansions': expansions,
}

# Accumulators filled by the download loop: tweets and their authors
data_t = []
data_u = []

In [55]:
# Page through the search results until the API stops returning a next_token.
# Tweets are appended to data_t, the expanded author objects to data_u.
# Raises Exception(status_code, text) on any non-200 response.
while True:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, params=query_params, timeout=30)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    # Merge this page into the accumulated results.
    json_response = response.json()
    if 'data' in json_response:
        data_t.extend(json_response['data'])
        # Tolerate a page without an 'includes' section instead of raising KeyError.
        data_u.extend(json_response.get('includes', {}).get('users', []))

    # A next_token in 'meta' means more pages are available.
    next_token = json_response.get('meta', {}).get('next_token')
    if next_token is None:
        # Last page: drop the stale token so re-running the cell starts
        # again from the first page.
        query_params.pop('next_token', None)
        break

    query_params['next_token'] = next_token
    print("Fetching next few tweets, next_token: ", query_params['next_token'])
    # Pause between requests before asking for the next page.
    time.sleep(3)

Fetching next few tweets, next_token:  b26v89c19zqg8o3fpdy5k84dkewj58oiwmlbobujncjgd
Fetching next few tweets, next_token:  b26v89c19zqg8o3fpdy5k836d6s9fhzf1a00n9yii5ke5
Fetching next few tweets, next_token:  b26v89c19zqg8o3fpdy5k60960edlaspc15pz4ypcrfjx
Fetching next few tweets, next_token:  b26v89c19zqg8o3fpdy5k5zcsrjw42l0mlk34rs6cwrjx


In [94]:
# add location to all users, empty string if element does not exist (to insert data into table)
for item in data_u:
    if 'location' in item:
        pass
    else:
        item['location'] = ""
        
# create iterators
iter_tweets = iter(data_t)
iter_users = iter(data_u)

# insert tweets
psycopg2.extras.execute_batch(cur, """INSERT INTO tweets VALUES(
%(text)s,
%(author_id)s,
%(id)s,
%(created_at)s
);""",iter_tweets)

# insert users
psycopg2.extras.execute_batch(cur, """INSERT INTO tweets_user VALUES(
%(id)s,
%(username)s,
%(name)s,
%(location)s
);""",iter_users)


2.4063363075256348


In [9]:
# Insert a single sample row into twitter_eq.
# NOTE(review): twitter_eq is not created anywhere in the visible notebook -
# confirm the table exists before running this cell.
try:
    # cursor.execute takes the query and ONE parameter sequence; the extra
    # empty tuple that used to follow raised a TypeError, which the
    # psycopg2.Error handler below does not catch.
    cur.execute("INSERT INTO twitter_eq (col1, col2, col3) \
                 VALUES (%s, %s, %s)",
                (10, 20, 1970))
except psycopg2.Error as e:
    print("Error: Inserting Rows")
    print (e)

In [196]:
# Print every row of twitter_eq.
try:
    cur.execute("SELECT * FROM twitter_eq;")
except psycopg2.Error as e:
    print("Error: select *")
    print (e)
else:
    # Fetch only when the query succeeded; fetching after a failed execute
    # would raise a second error outside the handler above.
    # Iterating the cursor yields the result rows one at a time.
    for row in cur:
        print(row)


In [43]:
# Create the timeframes table if it is not there yet: an integer id plus the
# start and end of each period, both stored as text.
cur.execute("""CREATE TABLE IF NOT EXISTS timeframes (
            id int,
            startdate text, 
            enddate text);""")

In [42]:
# Half-year time windows from 2019 back to 2010 as (id, startdate, enddate).
# Each id is unique (1-20), and every end stamp uses 23:59:59, the last valid
# second of the day (hour 24 with non-zero minutes is not a valid time).
data = [
    (1, '2019-07-01T00:00:00.000Z', '2019-12-31T23:59:59.000Z'),
    (2, '2019-01-01T00:00:00.000Z', '2019-06-30T23:59:59.000Z'),
    (3, '2018-07-01T00:00:00.000Z', '2018-12-31T23:59:59.000Z'),
    (4, '2018-01-01T00:00:00.000Z', '2018-06-30T23:59:59.000Z'),
    (5, '2017-07-01T00:00:00.000Z', '2017-12-31T23:59:59.000Z'),
    (6, '2017-01-01T00:00:00.000Z', '2017-06-30T23:59:59.000Z'),
    (7, '2016-07-01T00:00:00.000Z', '2016-12-31T23:59:59.000Z'),
    (8, '2016-01-01T00:00:00.000Z', '2016-06-30T23:59:59.000Z'),
    (9, '2015-07-01T00:00:00.000Z', '2015-12-31T23:59:59.000Z'),
    (10, '2015-01-01T00:00:00.000Z', '2015-06-30T23:59:59.000Z'),
    (11, '2014-07-01T00:00:00.000Z', '2014-12-31T23:59:59.000Z'),
    (12, '2014-01-01T00:00:00.000Z', '2014-06-30T23:59:59.000Z'),
    (13, '2013-07-01T00:00:00.000Z', '2013-12-31T23:59:59.000Z'),
    (14, '2013-01-01T00:00:00.000Z', '2013-06-30T23:59:59.000Z'),
    (15, '2012-07-01T00:00:00.000Z', '2012-12-31T23:59:59.000Z'),
    (16, '2012-01-01T00:00:00.000Z', '2012-06-30T23:59:59.000Z'),
    (17, '2011-07-01T00:00:00.000Z', '2011-12-31T23:59:59.000Z'),
    (18, '2011-01-01T00:00:00.000Z', '2011-06-30T23:59:59.000Z'),
    (19, '2010-07-01T00:00:00.000Z', '2010-12-31T23:59:59.000Z'),
    (20, '2010-01-01T00:00:00.000Z', '2010-06-30T23:59:59.000Z'),
]

In [44]:
# Write all timeframe tuples into the timeframes table, one INSERT per tuple.
# A database error is reported on stdout and the notebook carries on.
# NOTE(review): re-running this cell inserts the same rows again.
try:
    cur.executemany(
        """INSERT INTO timeframes (id, startdate, enddate) 
                 VALUES (%s, %s, %s)""",
        data,
    )
except psycopg2.Error as err:
    print("Error: Inserting Rows")
    print(err)

In [11]:
# Release the database resources: close the cursor first, then the connection.
# After this cell, `cur` and `conn` can no longer be used by other cells.
cur.close()
conn.close()