In [1]:
import json
import os
import glob
import re
import pandas as pd
from os import makedirs
from os.path import join, exists
from pathlib import Path
from datetime import datetime, date, timedelta
from pathlib import Path
from pycorenlp import StanfordCoreNLP
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords


# Make sure the NLTK 'stopwords' corpus is available locally; the call
# downloads it into the user's nltk_data directory when it is missing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/rmh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Client for the Stanford CoreNLP server that get_sentiment() sends its
# requests to. A server must already be listening at this URL before the
# cells below are run.
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
def get_sentiment(text):
    """Return the CoreNLP sentiment value of the first sentence in ``text``.

    The text is sent to the module-level ``nlp`` client with the
    ``sentiment`` annotator enabled and JSON output requested.

    Parameters
    ----------
    text : str
        Text to annotate. Only the first sentence found by the server is
        used; any further sentences are ignored.

    Returns
    -------
    object
        The ``sentimentValue`` field of the first sentence, exactly as the
        server reported it (callers in this notebook convert it with
        ``int()``), or ``-1`` when no usable value is available: the
        response is not a JSON object, contains no sentences, or the first
        sentence lacks a ``sentimentValue``.

    Notes
    -----
    Errors raised by ``nlp.annotate`` itself (for example when the server
    cannot be reached) are not caught here and propagate to the caller.
    """
    res = nlp.annotate(text,
                       properties={'annotators': 'sentiment',
                                   'outputFormat': 'json',
                                   'timeout': 1000,
                       })

    # Anything other than a dict cannot be indexed by 'sentences'.
    # NOTE(review): presumably this happens when the client hands back the
    # raw response text (e.g. a timeout message) instead of parsed JSON.
    if not isinstance(res, dict):
        return -1

    sentences = res.get('sentences')
    if not sentences:
        return -1

    try:
        return sentences[0]['sentimentValue']
    except (KeyError, IndexError, TypeError):
        # Malformed sentence entry: report "missing" rather than False,
        # which callers would silently turn into the score 0.
        return -1

In [13]:
def get_and_save_election_data(src_path, target_path, start_date, end_date):
    """Score the headlines of every day in a date range and save two CSVs.

    For each day from ``start_date`` to ``end_date`` (both inclusive) the
    file ``<src_path>/<YYYY-MM-DD>.json`` is read if it exists. It is
    iterated as a sequence of headlines; each non-null headline is
    lower-cased and scored with ``get_sentiment``.

    Two files are written into ``target_path`` (created if necessary):

    * ``headlines1.csv`` with columns ``date, score``: one row per headline,
      plus one placeholder row for every day without an input file.
    * ``headlines.csv`` with columns ``date, text``: the lower-cased text of
      every non-null headline.

    Rows without a usable score (null headline, missing file, or a
    ``get_sentiment`` result of ``-1``/``False``/``None``) are imputed with
    the mean of the valid scores. When there is no valid score at all, the
    ``-1`` sentinel is kept.

    Parameters
    ----------
    src_path : str
        Directory holding the per-day ``YYYY-MM-DD.json`` files.
    target_path : str
        Output directory for the two CSV files.
    start_date, end_date : datetime.date
        First and last day of the range (inclusive).
    """
    missing = -1  # sentinel for "no sentiment score available"
    score_rows = []
    text_rows = []

    for offset in range((end_date - start_date).days + 1):
        datestr = (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
        fname = join(src_path, datestr + '.json')

        if not Path(fname).is_file():
            # Keep a placeholder so every day of the range shows up.
            score_rows.append({'date': datestr, 'score': missing})
            continue

        with open(fname, encoding='utf-8') as f:
            headlines = json.load(f)

        for hd in headlines:
            if hd is None:
                score_rows.append({'date': datestr, 'score': missing})
                continue

            text = hd.lower()
            raw = get_sentiment(text)
            # A bool/None result signals a failed annotation; int(False)
            # would otherwise masquerade as the real score 0.
            if raw is None or isinstance(raw, bool):
                score = missing
            else:
                score = int(raw)

            score_rows.append({'date': datestr, 'score': score})
            text_rows.append({'date': datestr, 'text': text})

    os.makedirs(target_path, exist_ok=True)

    scores = pd.DataFrame(score_rows, columns=['date', 'score'])
    texts = pd.DataFrame(text_rows, columns=['date', 'text'])

    # Impute with the mean of the *valid* scores only. Averaging the column
    # while it still contains the -1 sentinels drags the fill value down.
    is_missing = scores['score'] == missing
    valid_scores = scores.loc[~is_missing, 'score']
    fill_value = valid_scores.mean() if len(valid_scores) else float(missing)
    scores['score'] = scores['score'].mask(is_missing, fill_value)

    scores.to_csv(join(target_path, 'headlines1.csv'), index=False)
    texts.to_csv(join(target_path, 'headlines.csv'), index=False)

In [14]:
def agg_all(source, start_dt=None, end_dt=None):
    """Run ``get_and_save_election_data`` for every topic/candidate pair.

    For each topic and each candidate, headlines are read from
    ``data-set-2/<topic>/<source>/<candidate>`` and the results are written
    to ``data-set-aggregated/<topic>/<source>/<candidate>``. A short label
    is printed before each topic as a progress indicator.

    Parameters
    ----------
    source : str
        Name of the source sub-directory (e.g. ``'cnn'``).
    start_dt : datetime.date, optional
        First day to process (inclusive). Defaults to 2020-05-26.
    end_dt : datetime.date, optional
        Last day to process (inclusive). Defaults to 2020-06-27.
    """
    if start_dt is None:
        start_dt = date(2020, 5, 26)
    if end_dt is None:
        end_dt = date(2020, 6, 27)

    # (progress label printed to stdout, topic directory name)
    topics = (
        ('cand', 'candidates'),
        ('econ', 'economy'),
        ('env', 'environment'),
        ('party', 'party'),
        ('health', 'health'),
        ('imm', 'immigration'),
        ('job', 'job'),
    )
    candidates = ('trump', 'biden')

    for label, topic in topics:
        print(label)
        for candidate in candidates:
            # Every pair reads from 'data-set-2'. The economy/biden call
            # used to point at 'data2', so its input files were never found.
            get_and_save_election_data(
                join('data-set-2', topic, source, candidate),
                join('data-set-aggregated', topic, source, candidate),
                start_dt, end_dt)

In [15]:
# Aggregate every topic/candidate folder of the 'cnn' source; the labels
# printed below are agg_all's per-topic progress markers.
agg_all('cnn')

cand
econ
env
party
health
imm
job


In [16]:
# Same aggregation run for the 'fox-news' source directory.
agg_all('fox-news')

cand
econ
env
party
health
imm
job


In [17]:
# Same aggregation run for the 'guardian' source directory.
agg_all('guardian')

cand
econ
env
party
health
imm
job
