In [1]:
import os
from functools import partial
from datetime import datetime
import re

from pyspark import SparkContext

In [2]:
# Zero-based field position in a split events line -> key used in the record dict.
# Entry order is kept as-is because it determines the key order of the records.
index2events_column = dict([
    (0, "GlobalEventId"),
    (37, "Actor1Geo_CountryCode"),
    (60, "url"),
    (40, "lat"),
    (41, "long"),
    (59, "DateAdded"),
])

# Zero-based field position in a split mentions line -> key used in the record dict.
# Entry order is kept as-is because it determines the key order of the records.
index2mentions_column = dict([
    (0, "GlobalEventId"),
    (3, "MentionType"),
    (5, "MentionIdentifier"),
    (1, "EventTimeDate"),
    (2, "MentionTimeDate"),
])

def convert_to_datetime(datetime_str):
    """Parse a ``%Y%m%d%H%M%S`` timestamp string (e.g. ``"20200326190000"``).

    Returns a naive ``datetime``. ``datetime.strptime`` raises ``ValueError``
    when the string does not match the format.
    """
    timestamp_format = "%Y%m%d%H%M%S"
    parsed = datetime.strptime(datetime_str, timestamp_format)
    return parsed

# Quick sanity check of the parser on a sample timestamp.
print(convert_to_datetime("20200326190000"))

# Column name -> callable applied to that column's raw string value.
# All three timestamp columns share the same parser.
type_converters = dict.fromkeys(
    ("EventTimeDate", "MentionTimeDate", "DateAdded"),
    convert_to_datetime,
)

2020-03-26 19:00:00


In [3]:
# getOrCreate() reuses an already-running SparkContext if there is one, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [4]:
# Root directory of the input data, and the four input directories below it
# (mentions / events, each in an English and a multilingual variant).
basepath = "/home/ubuntu/data/project/gdelt"
(
    mentions_path_english,
    mentions_path_multi,
    events_path_english,
    events_path_multi,
) = [
    os.path.join(basepath, subdir)
    for subdir in (
        "english/mentions/",
        "multilingual/mentions/",
        "english/events/",
        "multilingual/events/",
    )
]



In [5]:
# Print the English mentions directory path as a quick check.
print(mentions_path_english)

/home/ubuntu/data/project/gdelt/english/mentions/


In [6]:
# Read the English and multilingual mentions directories as RDDs of text lines,
# then concatenate them into one RDD.
mentions_english_rdd = sc.textFile(mentions_path_english)
mentions_multi_rdd = sc.textFile(mentions_path_multi)
mentions_rdd = mentions_english_rdd.union(mentions_multi_rdd)

In [7]:
# Show the first 10 raw (still unsplit) mention lines.
for line in mentions_rdd.take(10):
    print(line)

833805545	20190327000000	20200326163000	1	abqjournal.com	https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html	2	-1	100	78	1	50	919	-10.5263157894737		
833907628	20190327090000	20200326163000	1	stuff.co.nz	https://www.stuff.co.nz/national/health/coronavirus/120591047/coronavirus-government-needs-to-get-its-bubble-selfisolated	1	206	-1	324	0	20	3914	-3.53606789250354		
914608602	20200326031500	20200326163000	1	taipeitimes.com	https://www.taipeitimes.com/News/front/archives/2020/03/27/2003733457	1	139	193	170	1	100	3498	-8.91938250428813		
914724943	20200326163000	20200326163000	1	taipeitimes.com	https://www.taipeitimes.com/News/front/archives/2020/03/27/2003733457	1	139	192	182	0	20	3498	-8.91938250428813		
914608603	20200326031500	20200326163000	1	taipeitimes.com	https://www.taipeitimes.com/News/front/archives/2020/03/27/2003733457	1	139	193	183	1	80	3498	-8.91938250428813		
914724944	20200326163000	20200326163000	1	taipeitimes.com	https://

In [8]:
# Split every tab-separated line into a list of string fields.
# NOTE(review): this rebinds mentions_rdd, so running the cell a second time
# would call .split on lists instead of strings.
mentions_rdd = mentions_rdd.map(lambda line: line.split("\t"))

In [9]:
# Inspect the first split record.
print(mentions_rdd.take(1)[0])

['833805545', '20190327000000', '20200326163000', '1', 'abqjournal.com', 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html', '2', '-1', '100', '78', '1', '50', '919', '-10.5263157894737', '', '']


In [10]:
def transform_to_json(record, index2colname):
    """Project a positional record onto named fields.

    ``index2colname`` maps a position in ``record`` to the key under which the
    value at that position is stored in the returned dict. Positions that are
    not listed are dropped. Keys follow the iteration order of
    ``index2colname``. An ``IndexError`` is raised if ``record`` is too short
    for one of the listed positions.
    """
    projected = {}
    for position, field_name in index2colname.items():
        projected[field_name] = record[position]
    return projected

# Turn each split mention line into a dict holding only the columns listed in
# index2mentions_column.
mentions_json_rdd = mentions_rdd.map(partial(transform_to_json,index2colname=index2mentions_column))

In [11]:
# Inspect one projected mention record.
mentions_json_rdd.take(1)

[{'GlobalEventId': '833805545',
  'MentionType': '1',
  'MentionIdentifier': 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html',
  'EventTimeDate': '20190327000000',
  'MentionTimeDate': '20200326163000'}]

In [12]:
def convert_values(record, converters=None):
    """Convert the values of ``record`` in place using per-key converters.

    Parameters
    ----------
    record : dict
        Maps column name to raw value. It is mutated and also returned.
    converters : dict, optional
        Maps column name to a one-argument callable. Defaults to the
        module-level ``type_converters``.

    Returns
    -------
    dict
        The same ``record`` object. Keys that have a converter hold the
        converted value; all other keys are left untouched.

    Any exception raised by a converter propagates to the caller.
    """
    if converters is None:
        converters = type_converters
    for key, value in record.items():
        # Test membership explicitly instead of catching KeyError around the
        # call, so a KeyError raised *inside* a converter is not swallowed.
        if key in converters:
            record[key] = converters[key](value)
    return record

In [13]:
# Drop None records, then parse the timestamp columns.
# `is not None` is the identity test for None (a `!=` comparison goes through __ne__).
mentions_converted_rdd = (
    mentions_json_rdd
    .filter(lambda record: record is not None)
    .map(convert_values)
)

In [14]:
# Inspect one record after type conversion.
mentions_converted_rdd.take(1)

[{'GlobalEventId': '833805545',
  'MentionType': '1',
  'MentionIdentifier': 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html',
  'EventTimeDate': datetime.datetime(2019, 3, 27, 0, 0),
  'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}]

In [15]:
# Keep only mentions whose MentionType is the string "1"
# (presumably web sources, going by the variable name -- confirm against the GDELT codebook).
mentions_web_rdd = mentions_converted_rdd.filter(lambda record: record["MentionType"]=="1")

In [16]:
# Inspect one filtered mention record.
print(mentions_web_rdd.take(1))

[{'GlobalEventId': '833805545', 'MentionType': '1', 'MentionIdentifier': 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html', 'EventTimeDate': datetime.datetime(2019, 3, 27, 0, 0), 'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}]


In [17]:
# Count mentions per event: emit (GlobalEventId, 1) for every mention and sum
# the ones by key.
event_mentions_count_rdd = (mentions_web_rdd
                            .map(lambda record: (record["GlobalEventId"],1))
                            .reduceByKey(lambda x, y: x+y)
                           )

In [18]:
# Inspect ten (GlobalEventId, mention count) pairs.
event_mentions_count_rdd.take(10)

[('834036503', 1),
 ('914724951', 1),
 ('913233243', 7),
 ('913247109', 1),
 ('914722400', 3),
 ('914682360', 46),
 ('914725017', 1),
 ('914715839', 2),
 ('914708327', 12),
 ('914675894', 12)]

In [19]:
# Read the English and multilingual events directories as RDDs of text lines,
# then concatenate them into one RDD.
events_english_rdd = sc.textFile(events_path_english)
events_multi_rdd = sc.textFile(events_path_multi)
events_rdd = events_english_rdd.union(events_multi_rdd)

In [20]:
# Split every tab-separated line into a list of string fields.
# NOTE(review): this rebinds events_rdd, so running the cell a second time
# would call .split on lists instead of strings.
events_rdd = events_rdd.map(lambda line: line.split("\t"))

In [21]:
# Turn each split event line into a dict holding only the columns listed in
# index2events_column.
events_json_rdd = events_rdd.map(partial(transform_to_json,index2colname=index2events_column))

In [22]:
# Inspect one projected event record.
print(events_json_rdd.take(1))

[{'GlobalEventId': '914761007', 'Actor1Geo_CountryCode': 'US', 'url': 'https://www.aboutlawsuits.com/firefighter-foam-kidney-cancer-lawsuit-2-169746/', 'lat': '33.8191', 'long': '-80.9066', 'DateAdded': '20200326200000'}]


In [23]:
# Matches text that contains both "virus" and "corona" (in either order), or
# "covid19" / "covid-19". IGNORECASE applies to all three terms, so mixed-case
# URL paths such as ".../Coronavirus-Update" are matched as well.
# The "^" anchor makes search() evaluate the two lookaheads once at the start
# instead of retrying them at every offset. The inputs are single-line URLs,
# so this does not change which strings match.
corona_regex = re.compile(r"^(?=.*virus)(?=.*corona)|covid-?19", re.IGNORECASE)

In [24]:
# Keep only events whose source URL matches corona_regex.
# `is not None` is the identity test for the "no match" result of search().
events_corona_rdd = events_json_rdd.filter(
    lambda record: corona_regex.search(record["url"]) is not None
)

In [25]:
# Re-key each event as (GlobalEventId, (lat, long)) and drop events where both
# coordinates are empty strings.
# NOTE(review): a record with only one empty coordinate passes this filter.
events_geo_rdd = (events_corona_rdd
                  .map(lambda record: (record["GlobalEventId"],(record["lat"],record["long"])))
                  .filter(lambda tpl: tpl[1]!=('','')))

In [26]:
# Inspect one (GlobalEventId, (lat, long)) pair.
events_geo_rdd.take(1)

[('914761024', ('26.6503', '-80.4498'))]

In [27]:
# Attach the mention count to every geo-located event, keyed by GlobalEventId.
# Being a left outer join, events without an entry in event_mentions_count_rdd
# are kept; per the PySpark leftOuterJoin contract their count is None.
joined_rdd = events_geo_rdd.leftOuterJoin(event_mentions_count_rdd)

In [28]:
# Inspect two joined (GlobalEventId, ((lat, long), count)) records.
joined_rdd.take(2)

[('914761316', (('53.3331', '-6.24889'), 1)),
 ('914761395', (('30.5833', '114.267'), 1))]

In [29]:
import mysql.connector

In [33]:
def create_mysql_cursor(user, password, database):
    """Open a MySQL connection with the given credentials and return a cursor on it.

    Only the cursor is returned; the connection object itself is not handed
    back, so the caller has no direct handle to commit on or close it.
    """
    connect_kwargs = {"user": user, "password": password, "database": database}
    connection = mysql.connector.connect(**connect_kwargs)
    return connection.cursor()

# Pre-bind the database credentials so the factory can be called with no arguments.
# NOTE(review): the credentials are hard-coded here.
mysql_cursor = partial(create_mysql_cursor, user="root", password="testtest",database="Corona")

In [38]:
def store_partition(partition_iterator, connection_factory):
    """Insert one partition of joined event records into the ``GeoEvents`` table.

    Parameters
    ----------
    partition_iterator : iterable
        Items shaped ``(event_id, ((lat, long), count))``. ``lat`` and ``long``
        are converted with ``float``; ``event_id`` and ``count`` are passed to
        the INSERT unchanged.
    connection_factory : callable
        Currently unused; kept so existing callers keep working. The function
        opens its own connection because it needs the connection object (not
        just a cursor) to commit and close.

    Raises
    ------
    ValueError
        If a latitude or longitude cannot be parsed as a float. This is raised
        before any database connection is opened.

    The connection and cursor are always closed, even when the insert or the
    commit fails. Nothing is committed in that case.
    """
    # Materialise and validate the rows first, so a malformed row or an empty
    # partition never opens a database connection.
    # NOTE(review): `count` may be None for events without mentions when the
    # input comes from a left outer join; it is passed through as-is.
    records = [
        (event_id, float(lat), float(long), count)
        for event_id, ((lat, long), count) in partition_iterator
    ]
    if not records:
        return

    # NOTE(review): credentials are hard-coded; consider supplying them via
    # configuration instead.
    user = "root"
    password = "testtest"
    database = "Corona"

    add_event = ("INSERT INTO GeoEvents "
               "(EventId, Latitude, Longitude, MentionCount) "
               "VALUES (%s, %s, %s, %s)")

    cnx = mysql.connector.connect(user=user, password=password, database=database)
    try:
        cursor = cnx.cursor()
        try:
            cursor.executemany(add_event, records)
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()

# Call store_partition once for each partition of joined_rdd, passing
# mysql_cursor as its connection_factory argument.
joined_rdd.foreachPartition(partial(store_partition, connection_factory=mysql_cursor))