In [1]:
import json
import os
import glob

from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally; it is read later
# through stopwords.words('english').  The call reports "already up-to-date"
# when the package is present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/rmh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Lower-case seed terms that headline tokens are fuzzy-matched against.
# Each term is listed exactly once: a repeated entry would make the matcher
# emit the same (headline, term, position) row twice.
aspect_key_words = ['job', 'economy', 'unemployment', 'race', 'election', 'president',
                    'policy', 'immigration', 'sex', 'woman', 'health',
                    'global', 'warming', 'police', 'black', 'rating', 'growth', 'undocumented',
                    'russia', 'allegation', 'infrastructure', 'republican', 'democrat', 'senate', 'house',
                    'donald', 'trump', 'hillary', 'clinton', 'barack', 'obama', 'john', 'mccain',
                    'mitt', 'romney', 'george', 'bush', 'kerry', 'gore']

In [3]:
def get_election_data(start_date, end_date, election_year):
    """Load the items stored for every day from start_date to end_date.

    One JSON file per calendar day is read from
    ``data/guardian/<election_year>/<YYYY-MM-DD>.json`` (relative to the
    current working directory) and the items of each file are concatenated
    in date order.

    Args:
        start_date: First day to load (``datetime.date``), inclusive.
        end_date: Last day to load (``datetime.date``), inclusive.  When it
            is earlier than ``start_date`` no file is read.
        election_year: Name of the per-year sub-directory.  It is passed
            through ``str()`` so an int such as ``2016`` works as well as
            the string ``'2016'``.

    Returns:
        list: Items of all daily files, in chronological order.

    Raises:
        FileNotFoundError: If the file for any day in the range is missing.
    """
    articles_dir = join('data', 'guardian', str(election_year))
    n_days = (end_date - start_date).days + 1

    data = []
    for offset in range(n_days):
        day = start_date + timedelta(days=offset)
        fname = join(articles_dir, day.strftime('%Y-%m-%d') + '.json')
        # Read as UTF-8 explicitly instead of relying on the locale default,
        # so non-ASCII headline text decodes the same on every platform.
        with open(fname, encoding='utf-8') as f:
            data.extend(json.load(f))

    return data

In [4]:
def identify_aspects_with_fuzzy_match(aspects, news_hd, threshold=80):
    """Find headline tokens that fuzzily match any of the aspect keywords.

    Every headline is split on whitespace.  Each token that is not an NLTK
    English stop word is compared with every keyword using
    ``fuzz.partial_ratio``; a score of ``threshold`` or more produces one
    output row.  A token can match several keywords and a headline can
    contain several matching tokens, so a headline may appear in the result
    more than once.

    Args:
        aspects: Iterable of keyword strings to match against.
        news_hd: Iterable of headline strings.
        threshold: Minimum ``partial_ratio`` score (inclusive) for a match.
            Defaults to 80.

    Returns:
        tuple: Four parallel lists ``(hdls, asp_match, lbls, pos)``:
            * ``hdls`` - the headline the match was found in,
            * ``asp_match`` - the keyword that matched,
            * ``lbls`` - always the string ``'positive'``,
            * ``pos`` - ``"start,end"`` character offsets of the matched
              token inside the headline (``end`` is ``start + len(token)``).
    """
    # NOTE(review): the stop-word test and the keywords are compared with the
    # token as written, so capitalised tokens look to be treated differently
    # from lower-case ones - confirm whether that is intended.
    stop_words = set(stopwords.words('english'))

    hdls, asp_match, lbls, pos = [], [], [], []

    for hd in news_hd:
        # End offset of the previous token.  Searching from here (instead of
        # from index 0) yields the offset of *this* occurrence of the token,
        # even when the same text appears earlier in the headline or inside
        # an earlier, longer word.
        cursor = 0
        for w in hd.split():
            s_idx = hd.find(w, cursor)
            e_idx = s_idx + len(w)
            cursor = e_idx

            if w in stop_words:
                continue

            span = str(s_idx) + ',' + str(e_idx)
            for kw in aspects:
                if fuzz.partial_ratio(kw, w) >= threshold:
                    asp_match.append(kw)
                    pos.append(span)
                    lbls.append('positive')
                    hdls.append(hd)

    return hdls, asp_match, lbls, pos

In [5]:
def write_data(year, hds, pos, asp, lbl):
    """Write the four parallel result lists to text files, one item per line.

    Files are written to ``data/processed-data/<year>/`` (relative to the
    current working directory).  The directory is created when it does not
    exist yet; existing files are overwritten.

    Args:
        year: Name of the output sub-directory (converted with ``str()``).
        hds: Rows for ``headlines.txt``.
        pos: Rows for ``position.txt``.
        asp: Rows for ``term.txt``.
        lbl: Rows for ``label.txt``.

    Returns:
        None
    """
    out_dir = join('data', 'processed-data', str(year))
    # Without this the first open() fails on a fresh checkout where the
    # per-year output directory has not been created by hand.
    makedirs(out_dir, exist_ok=True)

    outputs = (
        ('headlines.txt', hds),
        ('position.txt', pos),
        ('term.txt', asp),
        ('label.txt', lbl),
    )
    for fname, rows in outputs:
        # Explicit UTF-8 so non-ASCII headline text is written identically
        # regardless of the platform's default encoding.
        with open(join(out_dir, fname), 'w', encoding='utf-8') as output:
            output.writelines(str(row) + '\n' for row in rows)

In [6]:
def gen_trainable_data(start_date, end_date, year, fz_kw):
    """Run the whole pipeline for one year: load, match, write.

    Prints a banner containing ``year``, loads the headlines between
    ``start_date`` and ``end_date`` via ``get_election_data``, matches them
    against the keywords ``fz_kw`` via ``identify_aspects_with_fuzzy_match``
    and hands the four resulting lists to ``write_data``.

    Args:
        start_date: First day to load, passed to ``get_election_data``.
        end_date: Last day to load, passed to ``get_election_data``.
        year: Year string; used in the banner and passed on as the
            directory name for both input and output.
        fz_kw: Keywords to fuzzy-match the headlines against.
    """
    banner = '----' + year + '----------'
    print(banner)

    raw_headlines = get_election_data(start_date, end_date, year)
    matched = identify_aspects_with_fuzzy_match(fz_kw, raw_headlines)
    matched_hdls, matched_terms, matched_lbls, matched_spans = matched

    write_data(year, matched_hdls, matched_spans, matched_terms, matched_lbls)

In [7]:
# One (first day, last day, year) entry per run, newest first.  The year
# string doubles as the input/output sub-directory name.
# NOTE(review): the end dates look like the US general-election days - confirm.
for window_start, window_end, election_year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
):
    gen_trainable_data(window_start, window_end, election_year, aspect_key_words)

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
