In [84]:
import os
from functools import partial
from datetime import datetime
import re

from pyspark import SparkContext

In [38]:
# Field position -> column name for the event fields to keep.
# Not referenced in the cells shown here; presumably consumed the same way as
# index2mentions_column (as the ``index2colname`` argument) -- TODO confirm.
index2events_column = {
    0: "GlobalEventId",
    37: "Actor1Geo_CountryCode",
    60: "url",
    40: "lat",
    41: "long",
    59: "DateAdded",
}

# Field position (0-based index into a split mention record) -> column name.
# Only the positions listed here are kept when a record is projected to a dict.
index2mentions_column = {
    0: "GlobalEventId",
    3: "MentionType",
    5: "MentionIdentifier",
    1: "EventTimeDate",
    2: "MentionTimeDate",
}

def convert_to_datetime(datetime_str):
    """Parse a ``YYYYMMDDHHMMSS`` string into a naive ``datetime``.

    Args:
        datetime_str: 14-digit timestamp such as ``"20200326190000"``.

    Returns:
        The corresponding ``datetime.datetime`` (no timezone attached).

    Raises:
        ValueError: If ``datetime_str`` does not match the expected format.
    """
    timestamp_format = "%Y%m%d%H%M%S"
    parsed = datetime.strptime(datetime_str, timestamp_format)
    return parsed

# Smoke-test the timestamp parser on a sample value.
print(convert_to_datetime("20200326190000"))

# Column name -> callable that converts that column's raw string value.
# Every timestamp column shares the same parser.
type_converters = dict.fromkeys(
    ("EventTimeDate", "MentionTimeDate", "DateAdded"),
    convert_to_datetime,
)

2020-03-26 19:00:00


In [4]:
# getOrCreate() returns the already-running SparkContext when there is one, so
# re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [68]:
# Root directory of the data set; the four input directories below are
# resolved relative to it.
basepath = "/home/ubuntu/data/project/gdelt"
(
    mentions_path_english,
    mentions_path_multi,
    events_path_english,
    events_path_multi,
) = (
    os.path.join(basepath, subdirectory)
    for subdirectory in (
        "english/mentions/",
        "multilingual/mentions/",
        "english/events/",
        "multilingual/events/",
    )
)



In [69]:
# Sanity check: show the resolved directory for the English mentions files.
print(mentions_path_english)

/home/ubuntu/data/project/gdelt/english/mentions/


In [70]:
# Read both mention directories as RDDs of text lines, then concatenate them
# into one RDD covering the English and multilingual feeds.
mentions_english_rdd, mentions_multi_rdd = (
    sc.textFile(directory)
    for directory in (mentions_path_english, mentions_path_multi)
)
mentions_rdd = mentions_english_rdd.union(mentions_multi_rdd)

In [71]:
# Peek at up to 10 elements of mentions_rdd, printing each on its own line.
for line in mentions_rdd.take(10):
    print(line)

833805545	20190327000000	20200326163000	1	abqjournal.com	https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html	2	-1	100	78	1	50	919	-10.5263157894737		
833907628	20190327090000	20200326163000	1	stuff.co.nz	https://www.stuff.co.nz/national/health/coronavirus/120591047/coronavirus-government-needs-to-get-its-bubble-selfisolated	1	206	-1	324	0	20	3914	-3.53606789250354		
914608602	20200326031500	20200326163000	1	taipeitimes.com	https://www.taipeitimes.com/News/front/archives/2020/03/27/2003733457	1	139	193	170	1	100	3498	-8.91938250428813		
914724943	20200326163000	20200326163000	1	taipeitimes.com	https://www.taipeitimes.com/News/front/archives/2020/03/27/2003733457	1	139	192	182	0	20	3498	-8.91938250428813		
914608603	20200326031500	20200326163000	1	taipeitimes.com	https://www.taipeitimes.com/News/front/archives/2020/03/27/2003733457	1	139	193	183	1	80	3498	-8.91938250428813		
914724944	20200326163000	20200326163000	1	taipeitimes.com	https://

In [72]:
# Split every raw line into its tab-separated fields.
# The RDD is rebuilt from the two source RDDs instead of from ``mentions_rdd``
# itself so this cell is idempotent: re-running it never tries to call
# ``.split`` on records that were already turned into lists.
mentions_rdd = (
    mentions_english_rdd
    .union(mentions_multi_rdd)
    .map(lambda line: line.split("\t"))
)

In [73]:
# Show the first element of mentions_rdd (a list of field strings once the
# tab split has been applied).
print(mentions_rdd.take(1)[0])

['833805545', '20190327000000', '20200326163000', '1', 'abqjournal.com', 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html', '2', '-1', '100', '78', '1', '50', '919', '-10.5263157894737', '', '']


In [74]:
def transform_to_json(record, index2colname):
    """Project selected positions of a split record into a dict.

    Args:
        record: Indexable sequence of field values (one split input line).
        index2colname: Mapping of field position -> key to use in the result.

    Returns:
        A dict mapping each column name to ``record[index]``, in the iteration
        order of ``index2colname``. Returns ``None`` when ``record`` does not
        contain every requested position (e.g. a truncated line), so such
        records can be filtered out instead of failing the whole job with an
        ``IndexError``.
    """
    try:
        return {colname: record[index] for index, colname in index2colname.items()}
    except IndexError:
        # Record is too short for at least one requested column.
        return None

# Bind the mentions column mapping once, then project every split record
# into a dict keyed by column name.
project_mention_columns = partial(transform_to_json, index2colname=index2mentions_column)
mentions_json_rdd = mentions_rdd.map(project_mention_columns)

In [75]:
# Inspect one projected record to check the column-name mapping.
mentions_json_rdd.take(1)

[{'GlobalEventId': '833805545',
  'MentionType': '1',
  'MentionIdentifier': 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html',
  'EventTimeDate': '20190327000000',
  'MentionTimeDate': '20200326163000'}]

In [79]:
def convert_values(record):
    """Return a copy of ``record`` with typed values for known columns.

    Each key that has an entry in the module-level ``type_converters`` mapping
    gets its value passed through that converter; all other values are copied
    unchanged.

    The input dict is left untouched (a new dict is returned), and the
    converter lookup is done explicitly so that a ``KeyError`` raised *inside*
    a converter is no longer silently swallowed.

    Args:
        record: Dict of column name -> raw value.

    Returns:
        A new dict with the same keys, in the same order, and converted values.

    Raises:
        Whatever a converter raises for a value it cannot handle (for example
        ``ValueError`` from a malformed timestamp).
    """
    converted = {}
    for key, value in record.items():
        converter = type_converters.get(key)
        converted[key] = value if converter is None else converter(value)
    return converted

In [80]:
# Drop any None placeholders, then apply the per-column type converters.
# ``is not None`` is the identity test recommended for singletons.
mentions_converted_rdd = (
    mentions_json_rdd
    .filter(lambda record: record is not None)
    .map(convert_values)
)

In [81]:
# Inspect one record after type conversion.
mentions_converted_rdd.take(1)

[{'GlobalEventId': '833805545',
  'MentionType': '1',
  'MentionIdentifier': 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html',
  'EventTimeDate': datetime.datetime(2019, 3, 27, 0, 0),
  'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}]

In [82]:
# MentionType has no type converter, so it is still a string here and is
# compared against "1" (presumably the code for web sources, given the
# variable name -- TODO confirm against the data set's codebook).
mentions_web_rdd = mentions_converted_rdd.filter(
    lambda mention: mention["MentionType"] == "1"
)

In [83]:
# Print a one-element sample of the MentionType == "1" records.
print(mentions_web_rdd.take(1))

[{'GlobalEventId': '833805545', 'MentionType': '1', 'MentionIdentifier': 'https://www.abqjournal.com/1436770/3-teenage-boys-arrested-in-death-of-tucson-man-last-october.html', 'EventTimeDate': datetime.datetime(2019, 3, 27, 0, 0), 'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}]


In [86]:
# Matches text that either contains both "virus" and "corona" (in any order,
# via the two lookaheads) or contains "covid19" / "covid-19".
# IGNORECASE applies to the whole pattern, so "Coronavirus" or "CORONAVIRUS"
# in a URL is matched as well; previously only the COVID branch ignored case.
corona_regex = re.compile(r"(?=.*virus)(?=.*corona)|covid-?19", re.IGNORECASE)

In [88]:
# Keep only mentions whose MentionIdentifier matches corona_regex.
# ``search`` returns None when there is no match; test that with ``is not None``.
mentions_corona_rdd = mentions_web_rdd.filter(
    lambda record: corona_regex.search(record["MentionIdentifier"]) is not None
)

In [89]:
# Print up to 3 of the regex-matched mention records, one per line.
for rec in mentions_corona_rdd.take(3):
    print(rec)

{'GlobalEventId': '833907628', 'MentionType': '1', 'MentionIdentifier': 'https://www.stuff.co.nz/national/health/coronavirus/120591047/coronavirus-government-needs-to-get-its-bubble-selfisolated', 'EventTimeDate': datetime.datetime(2019, 3, 27, 9, 0), 'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}
{'GlobalEventId': '914724946', 'MentionType': '1', 'MentionIdentifier': 'https://www.geekwire.com/2020/two-seattle-startups-helping-doctors-uncover-hidden-trends-covid-19-cases/', 'EventTimeDate': datetime.datetime(2020, 3, 26, 16, 30), 'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}
{'GlobalEventId': '834036503', 'MentionType': '1', 'MentionIdentifier': 'https://www.geekwire.com/2020/two-seattle-startups-helping-doctors-uncover-hidden-trends-covid-19-cases/', 'EventTimeDate': datetime.datetime(2019, 3, 27, 18, 30), 'MentionTimeDate': datetime.datetime(2020, 3, 26, 16, 30)}


In [95]:
# Count matched mentions per event: emit (GlobalEventId, 1) for every mention,
# then sum the ones for each key.
event_id_ones_rdd = mentions_corona_rdd.map(
    lambda mention: (mention["GlobalEventId"], 1)
)
event_mentions_count_rdd = event_id_ones_rdd.reduceByKey(
    lambda left, right: left + right
)

In [96]:
# Show up to 10 (GlobalEventId, mention count) pairs.
event_mentions_count_rdd.take(10)

[('834036503', 1),
 ('913233243', 6),
 ('913247109', 1),
 ('914708327', 11),
 ('914725098', 1),
 ('914699972', 168),
 ('914725122', 1),
 ('914582285', 15),
 ('914639646', 2),
 ('914582480', 8)]