# Periodically Collect and Filter GDELT Archives
This notebook is the main code for retrieving and parsing news article archives from the [GDELT](http://data.gdeltproject.org/events/) project, and for filtering those archives by actor type and other GDELT-specific properties relevant to the analysis at hand.

In [None]:
# Root folder for everything this notebook writes. Sub-paths are built by
# plain string concatenation (e.g. output_dir + 'dumps'), so the trailing
# slash is required.
output_dir = './polar/'

In [None]:
from datetime import datetime, date, timedelta

# Collection window; both endpoints are included in the day count.
starting_date = date(2020, 2, 1)
ending_date = date(2020, 8, 1)

In [None]:
# Number of calendar days in the window, counting both endpoints.
days_duration = (ending_date - starting_date).days + 1

In [None]:
# Summarise the configured collection window.
print(f'Starting date: {starting_date}')
print(f'Ending date: {ending_date}')
print()

print(f'Total number of days: {days_duration}')

In [None]:
import wget

# URL template of a daily GDELT event export; the placeholder is filled with
# the day formatted as YYYYMMDD by the download loop.
base = 'http://data.gdeltproject.org/events/{}.export.CSV.zip'

In [None]:
import os

# Create the output tree. The warning for a pre-existing root is kept, but the
# sub-folders are now created in every case (exist_ok=True): previously an
# existing root meant 'dumps', 'html' and 'articles' were never created, and
# the cells that write into them failed.
if os.path.isdir(output_dir): print('Warning: Path \'%s\' already exists.' % output_dir)

for sub_dir in ('dumps', 'html', 'articles'):
    os.makedirs(output_dir + sub_dir, exist_ok=True)

In [None]:
from tqdm import tqdm

from urllib.error import URLError

# Download one daily export per date in [starting_date, ending_date].
# Days that could not be retrieved are collected here for inspection.
failed_days = []

for i in tqdm(range(days_duration), desc='Retrieving GDELT Dumps'):
    d = starting_date + timedelta(days=i)
    d_str = d.strftime('%Y%m%d')

    # Skip archives already on disk so that re-running the cell does not
    # fetch them a second time.
    if os.path.isfile(output_dir + 'dumps/' + d_str + '.export.CSV.zip'): continue

    # A single unavailable day must not abort the remaining downloads.
    # NOTE(review): assumes wget surfaces HTTP/network failures as
    # urllib.error.URLError (HTTPError is a subclass) -- confirm.
    try: wget.download(base.format(d_str), out=output_dir + 'dumps')
    except URLError as ex:
        failed_days.append(d_str)
        print('Warning: could not download archive for %s (%s)' % (d_str, ex))

In [None]:
import os

# Report how many files in the dumps folder look like GDELT archives.
archive_names = [g for g in os.listdir(output_dir + 'dumps') if g.endswith('CSV.zip')]
print('Succesfully collected %d GDELT archives.' % len(archive_names))

In [None]:
# Column names assigned, in order, to the header-less tab-separated GDELT
# event export when it is loaded into a DataFrame.
# Fix: 'actor2geo_adm1code' previously carried a stray ' string' suffix, which
# made it inconsistent with the actor1geo_/actiongeo_ counterparts.
gdelt_fields = [
    'globaleventid', 'day', 'monthyear', 'year', 'fractiondate', 'actor1code', 'actor1name', 'actor1countrycode',
    'actor1knowngroupcode', 'actor1ethniccode', 'actor1religion1code', 'actor1religion2code', 'actor1type1code',
    'actor1type2code', 'actor1type3code', 'actor2code', 'actor2name', 'actor2countrycode', 'actor2knowngroupcode',
    'actor2ethniccode', 'actor2religion1code', 'actor2religion2code', 'actor2type1code', 'actor2type2code', 
    'actor2type3code', 'isrootevent', 'eventcode', 'eventbasecode', 'eventrootcode', 'quadclass', 'goldsteinscale', 
    'nummentions', 'numsources', 'numarticles', 'avgtone', 'actor1geo_type', 'actor1geo_fullname', 
    'actor1geo_countrycode', 'actor1geo_adm1code', 'actor1geo_lat', 'actor1geo_long', 'actor1geo_featureid', 
    'actor2geo_type', 'actor2geo_fullname', 'actor2geo_countrycode', 'actor2geo_adm1code', 'actor2geo_lat', 
    'actor2geo_long', 'actor2geo_featureid', 'actiongeo_type', 'actiongeo_fullname', 'actiongeo_countrycode', 
    'actiongeo_adm1code', 'actiongeo_lat', 'actiongeo_long', 'actiongeo_featureid', 'dateadded', 'sourceurl'
]

In [None]:
import urllib, numpy

def get_source_path(sourceurl): 
    """Return the path component of `sourceurl`, or '' when it is unusable.

    Args:
        sourceurl: A URL string. Any non-string value (NaN, None, ...) is
            treated as "no URL".

    Returns:
        The path part of the URL as parsed by `urllib.parse.urlparse`, or ''
        for non-string input and for strings `urlparse` rejects.
    """
    # Imported here so the function does not rely on `urllib.parse` having
    # been loaded as a side effect of a bare `import urllib`.
    from urllib.parse import urlparse

    # Previously only NaN was tolerated; None made numpy.isnan raise TypeError.
    if not isinstance(sourceurl, str): return ''

    # urlparse raises ValueError on malformed netlocs (e.g. an unclosed '[');
    # one bad row should not abort a whole DataFrame.apply.
    try: return urlparse(sourceurl).path
    except ValueError: return ''

def get_source(sourceurl): 
    """Return the network location (host[:port]) of `sourceurl`, or ''.

    Args:
        sourceurl: A URL string. Any non-string value (NaN, None, ...) is
            treated as "no URL".

    Returns:
        The `netloc` part of the URL as parsed by `urllib.parse.urlparse`, or
        '' for non-string input and for strings `urlparse` rejects.
    """
    # Imported here so the function does not rely on `urllib.parse` having
    # been loaded as a side effect of a bare `import urllib`.
    from urllib.parse import urlparse

    # Previously only NaN was tolerated; None made numpy.isnan raise TypeError.
    if not isinstance(sourceurl, str): return ''

    # urlparse raises ValueError on malformed netlocs (e.g. an unclosed '[');
    # one bad row should not abort a whole DataFrame.apply.
    try: return urlparse(sourceurl).netloc
    except ValueError: return ''

In [None]:
def generate_query_url(url, gd_day, archive_flag = True):
    """Build the URL an article should be fetched from.

    Args:
        url: Original article URL.
        gd_day: Day value; its string form is placed in the archive URL,
            followed by the literal '00000'.
        archive_flag: When true, return a web.archive.org address wrapping
            `url`; otherwise return `url` unchanged.

    Returns:
        The archive-prefixed URL, or `url` itself when `archive_flag` is false.
    """
    if not archive_flag:
        return url

    snapshot_prefix = 'https://web.archive.org/web/' + str(gd_day) + '00000/'
    return snapshot_prefix + url

In [None]:
from newspaper import Article
from newspaper import Config

# Shared newspaper configuration; collect_article passes it to every Article.
config = Config()

# Present the requests as coming from a desktop browser.
config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# NOTE(review): presumably a timeout in seconds -- confirm against the newspaper docs.
config.request_timeout = 3

def collect_article(article_url, parse_flag=True, nlp_flag=False):
    """Download an article and optionally parse / NLP-process it.

    Args:
        article_url: URL handed to `Article`.
        parse_flag: Call `parse()` after the download when true.
        nlp_flag: Call `nlp()` as well; only honoured when `parse_flag` is
            also true.

    Returns:
        The `Article` object, built with the module-level `config`.
    """
    fetched = Article(article_url, config=config)
    fetched.download()

    if not parse_flag:
        return fetched

    fetched.parse()
    if nlp_flag:
        fetched.nlp()

    return fetched

In [None]:
import string, re

def prepare_title(s):
    """Turn `s` into a lower-case, dash-separated slug.

    Every ASCII punctuation character becomes a space, each run of spaces is
    collapsed into a single '-', and the result is lower-cased.
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    spaced = s.translate(punctuation_to_space)

    return re.sub(' +', '-', spaced).lower()

In [None]:
import zipfile, pandas as pd, requests

from threading import Thread
from queue import Queue

In [None]:
import time
from requests.exceptions import ConnectionError, InvalidSchema, MissingSchema, TooManyRedirects, RetryError

def fetch_article_task(q):
    """Worker loop: download the raw HTML of every queued GDELT row.

    Each queue item is an `(idx, hgd)` pair where `hgd` is a dict holding at
    least the 'sourceurl', 'day' and 'source' keys. The HTML is written to
    `<output_dir>html/<d_str>/.<source>.<slug of the URL path>.html`, where
    `d_str` is the module-level dump date set by the loop that starts the
    workers. The worker returns once the queue is empty.

    Fixes over the previous version:
      * a failed fetch no longer falls through to the write step, where
        `article_obj` was either undefined or still the previous article;
      * `q.task_done()` runs for every item taken, so `q.join()` cannot hang
        when a row fails;
      * the queue is drained with `get_nowait()`, because `q.empty()` followed
        by a blocking `q.get()` can block forever when another worker takes
        the last item in between.

    Args:
        q: Queue of `(idx, hgd)` work items shared between workers.
    """
    from queue import Empty

    while True:

        try: idx, hgd = q.get_nowait()
        except Empty: return

        try:
            archive_url = generate_query_url(
                hgd['sourceurl'],
                hgd['day'],
                archive_flag=False
            )

            # The folder is named after the dump being processed (global
            # d_str), not after the event's own 'day' value.
            hgd['config_day'] = d_str

            # Create the folder before fetching so it exists even when every
            # fetch of the day fails.
            output_folder = output_dir + 'html/' + str(hgd['config_day']) + '/'
            os.makedirs(output_folder, exist_ok=True)

            try: article_obj = collect_article(archive_url, parse_flag=False)
            except Exception as ex:
                # Best effort: report the failure and move on to the next row.
                print(idx, archive_url, ex)
                continue

            output_file = output_folder + '.' + hgd['source'] + '.' + prepare_title(get_source_path(archive_url))[:100] + '.html'
            with open(output_file, 'w') as html_file: html_file.write(article_obj.html)

            time.sleep(0.100)

        except Exception as ex:
            # Keep the worker alive: an unexpected error on one row (bad
            # value, unwritable file name, ...) must not stop the others.
            print(idx, ex)

        finally:
            q.task_done()

In [None]:
from tqdm import tqdm
import time, sys

# Fetch the raw HTML of the most-covered in-scope articles, one GDELT dump
# (day) at a time, using a pool of worker threads per day.
for i in range(days_duration):
    d = starting_date + timedelta(days=i)
        
    # NOTE: d_str must remain a module-level name -- fetch_article_task reads
    # it to choose the per-day folder the HTML is written to.
    d_str = d.strftime('%Y%m%d')  
    
    dump_path = '{}{}.export.CSV.zip'.format(output_dir + 'dumps/', d_str)

    # Context managers close the archive and the member stream once the CSV
    # has been read (they were previously left open for every day).
    with zipfile.ZipFile(dump_path) as zf, zf.open('{}.export.CSV'.format(d_str)) as dump_file:
        gd_df = pd.read_csv(dump_file, sep='\t', header=None)
    gd_df.columns = gdelt_fields
        
    gd_df['sourceurl_path'] = gd_df['sourceurl'].apply(get_source_path)   
    gd_df['source'] = gd_df['sourceurl'].apply(get_source)   
      
    #######################################
    # Here add your filters for the GDELT #
    # articles. For example, I want the   # 
    # articles to be related to the US.   #
    #######################################
        
    scope_df = gd_df[(
        (gd_df['actor1countrycode']=='USA') | 
        (gd_df['actor2countrycode']=='USA')
    )]
    
    scope_df = scope_df.sort_values(by = ['numarticles'], ascending=False)
    
    scope_df = scope_df.head(5000)
    
    # One task per distinct URL. Previously the number of distinct URLs was
    # counted but the first `article_n` *rows* were queued, so duplicated URLs
    # were fetched repeatedly while some distinct ones were never fetched.
    # Rows without a URL cannot be fetched and are dropped as well.
    scope_df = scope_df.dropna(subset=['sourceurl']).drop_duplicates(subset=['sourceurl'])

    article_n = len(scope_df)
    scope_df = scope_df.to_dict('records')
    
    q = Queue(maxsize=0)
    threads = min(128, article_n)

    sys.stdout.write('- Fetching {} articles for: {}'.format(article_n, d.strftime('%Y %m %d')))
    sys.stdout.flush()
    
    # Always create the day's folder: the step that lists the HTML files
    # expects one folder per day, even when nothing could be fetched.
    os.makedirs(output_dir + 'html/' + d_str, exist_ok=True)
    
    for j, row in enumerate(scope_df): q.put((j, row))
        
    t0 = time.time()

    for k in range(threads):
        # daemon=True replaces the deprecated Thread.setDaemon(True).
        thread = Thread(target=fetch_article_task, args=[q], daemon=True)
        thread.start()

    # Block until every queued row has been marked done by a worker.
    q.join()

    t1 = time.time()

    sys.stdout.write(' [{}s]'.format(round(t1 - t0, 6)))
    sys.stdout.flush()
    print()

In [None]:
from multiprocessing import Pool, Process, Manager
import json

def parse_html(file_path):
    """Parse one downloaded HTML file and store the result as JSON.

    `file_path` is expected to look like `.../<YYYYMMDD>/<uid>.html`; the
    folder name is used as the fallback publication date and the file name
    (without its extension) as the article uid. The JSON is written to
    `<output_dir>articles/<YYYYMMDD>/<uid>.json`.

    Args:
        file_path: '/'-separated path of the HTML file.

    Returns:
        True when the JSON file was written; None when the HTML file is empty
        or the article could not be parsed.

    Raises:
        ValueError: if the parent folder name is not a YYYYMMDD date.
    """
    with open(file_path, 'r') as f: html_content = f.read()

    # Nothing was downloaded for this article.
    if len(html_content) == 0: return None

    path_parts = file_path.split('/')
    hgd = path_parts[-2]
    file_name = path_parts[-1]

    # Strip only the trailing extension: str.replace also removed '.html'
    # occurring inside the name, which could make two uids collide.
    uid = file_name[:-len('.html')] if file_name.endswith('.html') else file_name

    hgd_dt = datetime.strptime(hgd, '%Y%m%d')

    article = Article('', language='en')
    article.download(input_html=html_content)

    # This function is mapped over thousands of files; one unparsable page
    # should be reported and skipped, not abort the whole run.
    try: article.parse()
    except Exception as ex:
        print(file_path, ex)
        return None

    article_dict = {
        'url': article.url,
        'uid': uid,
        'images': list(article.images),
        # Fall back to the dump date when no publication date was detected.
        'publication-date': article.publish_date.strftime('%Y-%m-%d') if article.publish_date else hgd_dt.strftime('%Y-%m-%d'),
        'text': article.text,
        'title': article.title,
        'top-image': article.top_image
    }

    article_dict_str = json.dumps(article_dict, indent=4)

    output_folder = output_dir + 'articles/' + hgd + '/'
    output_file = output_folder + uid + '.json'
    os.makedirs(output_folder, exist_ok=True)
    with open(output_file, 'w') as json_file: json_file.write(article_dict_str)

    return True

In [None]:
# Collect the path of every downloaded HTML file, day by day.
file_paths = []

for i in tqdm(range(days_duration)):
    
    d = starting_date + timedelta(days=i)
    d_str = d.strftime('%Y%m%d')  
    
    daily_path = output_dir + 'html/' + d_str + '/'
    
    # A day for which nothing was fetched may have no folder; skip it instead
    # of aborting the whole listing with FileNotFoundError.
    if not os.path.isdir(daily_path): continue
    
    # Keep only the .html files: parse_html opens every path as a text file,
    # so stray entries (sub-folders, editor/OS metadata) would make it fail.
    file_paths += [daily_path + p for p in os.listdir(daily_path) if p.endswith('.html')]

In [None]:
import multiprocessing

# Parse all HTML files in parallel worker processes.
# Fix: `pool` was used without ever being created, so this cell raised
# NameError. A default-sized pool (one worker per CPU) is created here.
pool = multiprocessing.Pool()

try:
    # Results arrive in completion order; they are only consumed to drive
    # the progress bar, parse_html writes its own output files.
    for i in tqdm(
        pool.imap_unordered(parse_html, file_paths),
        desc='HTML Parsing',
        total=len(file_paths)
    ): pass
finally:
    # Release the worker processes even when the loop is interrupted.
    pool.close()
    pool.join()

In [None]:
# Gather the path of every file found anywhere below the articles folder.
article_path_list = [
    folder + '/' + file_name
    for folder, _, file_names in os.walk(f'{output_dir}articles')
    for file_name in file_names
]

In [None]:
# Report how many article files were found.
print(f'Articles: {len(article_path_list)}')