In [1]:
import os
import requests
from datetime import datetime, timedelta
import time
import warnings
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Client for the Stanford CoreNLP server, which must already be running locally.
CORENLP_SERVER_URL = 'http://localhost:9000'
nlp = StanfordCoreNLP(CORENLP_SERVER_URL)

def get_news(ticker, date, api_key=None):
    """Download one day of Finnhub company news for ``ticker``.

    Parameters
    ----------
    ticker : str
        Symbol sent to the ``company-news`` endpoint.
    date : str
        Day to fetch; it is sent as both the ``from`` and the ``to`` bound.
    api_key : str, optional
        Finnhub API token. When omitted, the ``FINNHUB_API_KEY`` environment
        variable is used so the secret does not have to live in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``id``, ``category``, ``date``,
        ``headline``, ``related``, ``source``, ``summary`` and ``url``.
        ``date`` is the UTC calendar day of the article's ``datetime`` epoch
        value. The frame has the same columns and no rows when the response
        is an empty list.

    Raises
    ------
    RuntimeError
        No token was passed and ``FINNHUB_API_KEY`` is not set.
    requests.HTTPError
        The server answered with an HTTP error status.
    ValueError
        The response body is not a list of articles (e.g. a JSON error object).
    """
    token = api_key or os.environ.get('FINNHUB_API_KEY')
    if not token:
        raise RuntimeError(
            'No Finnhub API token: pass api_key= or set the FINNHUB_API_KEY '
            'environment variable.'
        )

    # `params` lets requests URL-encode the values. TLS verification is left at
    # the requests default (enabled) because the token travels in the query.
    response = requests.get(
        'https://finnhub.io/api/v1/company-news',
        params={'symbol': ticker, 'from': date, 'to': date, 'token': token},
        timeout=30,
    )
    response.raise_for_status()

    articles = response.json()
    if not isinstance(articles, list):
        # Iterating a JSON object would walk its keys and fail later with an
        # unhelpful TypeError, so reject it here with the payload in the message.
        raise ValueError(
            f'Unexpected Finnhub response for {ticker} on {date}: {articles!r}'
        )

    columns = ['id', 'category', 'date', 'headline', 'related', 'source', 'summary', 'url']
    rows = [
        [
            article['id'],
            article['category'],
            # Epoch seconds -> UTC timestamp truncated to midnight of that day.
            pd.to_datetime(article['datetime'], unit='s').normalize(),
            article['headline'],
            article['related'],
            article['source'],
            article['summary'],
            article['url'],
        ]
        for article in articles
    ]

    df = pd.DataFrame(rows, columns=columns)
    # Keeps `date` a datetime column even when there are no rows.
    df['date'] = pd.to_datetime(df['date'])
    return df

def increment_one_day(str_date):
    """Return the calendar day after ``str_date``.

    Both the argument and the result are strings in ``%Y-%m-%d`` format.
    """
    day_format = '%Y-%m-%d'
    following_day = datetime.strptime(str_date, day_format) + timedelta(days=1)
    return following_day.strftime(day_format)

def get_sentiment(text):
    """Return the mean per-sentence sentiment value of ``text``.

    The text is annotated through the module-level ``nlp`` client with the
    ``sentiment`` annotator; each sentence's ``sentimentValue`` is converted
    to ``int`` and the values are averaged with ``np.mean``.
    """
    request_properties = {
        'annotators': 'sentiment',
        'outputFormat': 'json',
        'timeout': 5000,
    }
    annotation = nlp.annotate(text, properties=request_properties)

    sentence_scores = []
    for sentence in annotation['sentences']:
        sentence_scores.append(int(sentence['sentimentValue']))
    return np.mean(sentence_scores)


tickers = ['TSLA', 'GE', 'NVDA', 'AMD']

# First day to download (inclusive).
start_date = '2020-07-08'
# The download stops the day before today. Evaluated once so every ticker
# covers the same range even if the run crosses midnight.
end_date = datetime.today().strftime('%Y-%m-%d')

# Per-ticker results are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame every call.
ticker_frames = []

for ticker in tickers:
    _date = start_date
    daily_frames = []

    # One request per day; the 1.1 s pause keeps us under 60 API calls per minute.
    # Zero-padded ISO dates sort chronologically as strings, so `<` also
    # terminates when start_date is today or later (`!=` would loop forever).
    while _date < end_date:
        daily_frames.append(get_news(ticker, _date))
        time.sleep(1.1)
        _date = increment_one_day(_date)

    if not daily_frames:
        # Empty date range: nothing was requested for this ticker.
        continue

    # Days without news are left out of the concat so they cannot influence the
    # column dtypes; one frame is always kept so the columns survive a ticker
    # that has no news at all.
    populated_days = [frame for frame in daily_frames if not frame.empty] or daily_frames[:1]
    df = pd.concat(populated_days).drop_duplicates()

    # There are some repeat headlines on the same day, so get a daily count per
    # headline. Maybe duplicates of the same headline indicate more important news?
    duplicate_headlines = df[['date', 'headline', 'id']]
    dh = (duplicate_headlines.groupby(['date', 'headline'], as_index=False)
          .count()
          .rename(columns={'id': 'headline_count'}))

    # Keep one row per (date, headline) and attach how often it appeared.
    no_dups = df.drop_duplicates(subset=['date', 'headline'])
    no_dups = no_dups.merge(dh, how='left', on=['date', 'headline'])

    # Tag the rows with their ticker as the first column.
    no_dups.insert(0, 'ticker', ticker)

    ticker_frames.append(no_dups)

# Combine all tickers, again skipping empty frames but keeping the columns.
populated_tickers = [frame for frame in ticker_frames if not frame.empty] or ticker_frames[:1]
all_news = pd.concat(populated_tickers) if populated_tickers else pd.DataFrame([])
    
"""
Loop over each headline, get the sentiment, then create sentiment column in dataframe

Stanford NLP Sentiment Scale:
0: Very Negative
1: Negative
2: Neutral
3: Positive
4: Very Positive
"""
# Score every headline. A headline whose lookup fails is recorded as -1 and
# counted in error_count instead of aborting the whole run.
sent = []
error_count = 0
for headline in all_news['headline'].tolist():
    try:
        sent.append(get_sentiment(str(headline)))
    except Exception:
        # `Exception` rather than a bare `except` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop this long loop.
        sent.append(-1)
        error_count += 1
all_news['sentiment'] = sent


In [2]:
# Number of headlines whose sentiment lookup raised and was recorded as -1.
error_count

0