In [None]:
# dependencies
import os
from tqdm import tqdm
import datetime
import dateutil
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import APIs
from Keys.NYTAPI import nyt_api
from Keys.NewsAPI import news_api
from Keys.AlphaAPI import alpha_api

In [None]:
%matplotlib notebook

# Stock Data

In [None]:
# Symbols requested from Alpha Vantage
nas, dow, sap = "NDAQ", "DJI", "SPX"

# One template for the daily time-series endpoint; symbol and API key are filled in per request
_alpha_daily_url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={}&apikey={}"

sap_base_url = _alpha_daily_url.format(sap, alpha_api)
nas_base_url = _alpha_daily_url.format(nas, alpha_api)
dow_base_url = _alpha_daily_url.format(dow, alpha_api)

In [None]:
# Fetch and JSON-decode the three daily series, in the same order as before (S&P, NASDAQ, DOW)
sap_data, nas_data, dow_data = [
    requests.get(series_url).json()
    for series_url in (sap_base_url, nas_base_url, dow_base_url)
]

In [None]:
# first calendar day of the window
start_date = datetime.date(2018,7,1)

# 60 consecutive calendar days starting at start_date, as ISO "YYYY-MM-DD" strings
stock_dates = [
    (start_date + datetime.timedelta(offset)).isoformat()
    for offset in range(60)
]

In [None]:
# Per-index outputs: closing prices, volumes, and the dates that actually had data.
sap_close_prices = []
nas_close_prices = []
dow_close_prices = []
sap_volumes = []
nas_volumes = []
dow_volumes = []
sap_dates_output = []
nas_dates_output = []
dow_dates_output = []

# Pair every payload with its own three output lists. Each index is handled
# independently, so a date missing from one index can no longer leave another
# index's price/volume/date lists with different lengths.
index_series = [
    (sap_data, sap_close_prices, sap_volumes, sap_dates_output),
    (nas_data, nas_close_prices, nas_volumes, nas_dates_output),
    (dow_data, dow_close_prices, dow_volumes, dow_dates_output),
]

for stock_date in tqdm(stock_dates):
    for payload, close_prices, volumes, dates_output in index_series:
        # There are holidays and weekends: those dates simply have no entry.
        # A payload without the series key at all also yields no entry.
        daily_bar = payload.get("Time Series (Daily)", {}).get(stock_date)
        if daily_bar is None:
            continue
        if "4. close" not in daily_bar or "5. volume" not in daily_bar:
            continue

        # Append all three values together so the lists stay aligned.
        close_prices.append(daily_bar["4. close"])
        volumes.append(daily_bar["5. volume"])
        dates_output.append(stock_date)

In [None]:
def _index_frame(label, dates, close_prices, volumes):
    """Build one index's frame: Date, numeric close price, comma-formatted volume."""
    volume_column = label + " Volume"
    frame = pd.DataFrame({
        "Date": dates,
        label + " Close Price": pd.to_numeric(close_prices),
        volume_column: pd.to_numeric(volumes),
    })
    # Volumes are turned into thousands-separated strings
    frame[volume_column] = frame[volume_column].map("{:,}".format)
    return frame


sap_df = _index_frame("S&P", sap_dates_output, sap_close_prices, sap_volumes)
nas_df = _index_frame("NAS", nas_dates_output, nas_close_prices, nas_volumes)
dow_df = _index_frame("DOW", dow_dates_output, dow_close_prices, dow_volumes)

print(sap_df.dtypes)
nas_df


# New York Times

In [None]:
# build a function that converts a date string back to a datetime
def getDataTimeFromISO(iso):
    """Parse a date string into a ``datetime.datetime``.

    Parameters
    ----------
    iso : str
        Date string. This notebook passes both dashed ("YYYY-MM-DD") and
        compact ("YYYYMMDD") forms.

    Returns
    -------
    datetime.datetime
        The value returned by ``dateutil.parser.parse``.
    """
    # A bare `import dateutil` does not load the `parser` submodule on older
    # python-dateutil releases; import it explicitly instead of relying on
    # another package having loaded it first.
    from dateutil import parser as date_parser

    return date_parser.parse(iso)

In [None]:
# Sanity check on the first collected trading date.
# `dates_output` is never defined in this notebook; the per-index lists are.
getDataTimeFromISO(sap_dates_output[0])

In [None]:
# define the day_lag variable. -1 means yesterday's news may determine today's price.
day_lag = -1

# `dates_output` is never defined in this notebook, so take the window from the
# per-index date lists. ISO "YYYY-MM-DD" strings sort chronologically, which
# makes min/max the earliest and latest trading dates across the three lists.
trading_dates = sap_dates_output + nas_dates_output + dow_dates_output
bd = getDataTimeFromISO(min(trading_dates)).date()+datetime.timedelta(day_lag)
ed = getDataTimeFromISO(max(trading_dates)).date()+datetime.timedelta(day_lag)

In [None]:
# Convert the window bounds to ISO strings. The guards keep the cell re-runnable:
# once bd/ed are already strings, calling .isoformat() again would raise.
if not isinstance(bd, str):
    bd = bd.isoformat()
if not isinstance(ed, str):
    ed = ed.isoformat()

In [None]:
# Compact YYYYMMDD form of the ISO bounds (dashes dropped), used for the NYT query below
begin_date, end_date = (iso_day.replace("-", "") for iso_day in (bd, ed))
print(begin_date, end_date)

In [None]:
# prepare variables
# upper bound on the number of result pages requested
n = 200
pages = range(n)
snippet = []
pub_date = []

# urls
nyt_base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# get data from urls
for page in tqdm(pages):
    params = {
        "api-key": nyt_api,
        "q": "Stock",
        "begin_date": begin_date,
        "end_date": end_date,
        "sort": "newest",
        # the Article Search API documents `fl` as a comma-delimited string
        "fl": "snippet,pub_date",
        "page": page
    }

    try:
        # timeout so a stalled connection cannot hang the notebook
        nyt_data = requests.get(nyt_base_url, params=params, timeout=30).json()
    except (requests.RequestException, ValueError) as err:
        # network failure or non-JSON body: report it and move on to the next page
        tqdm.write("NYT page {} skipped: {}".format(page, err))
        continue

    response = nyt_data.get("response") if isinstance(nyt_data, dict) else None
    if not isinstance(response, dict):
        # payload without a "response" object (an error reply): skip this page only
        tqdm.write("NYT page {} returned no response object".format(page))
        continue

    docs = response.get("docs") or []
    if not docs:
        # a valid but empty page means the results are exhausted
        break

    # iterate over whatever the page returned instead of assuming exactly 10 docs
    for doc in docs:
        text = doc.get("snippet")
        published = doc.get("pub_date")
        if text is None or not published:
            # skip incomplete records so snippet and pub_date stay aligned
            continue
        snippet.append(text)
        # keep the date part only, in compact YYYYMMDD form
        pub_date.append(published[:4]+published[5:7]+published[8:10])

# WSJ

In [None]:
# url
news_base_url = "https://newsapi.org/v2/everything"
wsj_source = "the-wall-street-journal"
wsj_topic = "stock"
wsj_description = []
wsj_pub_date = []

# News API free version limits the requests to 1,000 per day
# News API pages are 1-indexed, so request pages 1..20
wsj_pages = range(1, 21)

for page in tqdm(wsj_pages):
    wsj_params = {
        "q": wsj_topic,
        "sources": wsj_source,
        "apiKey": news_api,
        "from": bd,
        "to": ed,
        "pageSize": 100,
        "page": page
    }

    try:
        # timeout so a stalled connection cannot hang the notebook
        wsj_data = requests.get(news_base_url, params=wsj_params, timeout=30).json()
    except (requests.RequestException, ValueError) as err:
        # network failure or non-JSON body: report it and try the next page
        tqdm.write("WSJ page {} skipped: {}".format(page, err))
        continue

    if not isinstance(wsj_data, dict) or "articles" not in wsj_data:
        # no article list in the reply (an error payload): stop paging, keep what we have
        message = wsj_data.get("message", wsj_data) if isinstance(wsj_data, dict) else wsj_data
        tqdm.write("WSJ paging stopped at page {}: {}".format(page, message))
        break

    wsj_articles = wsj_data["articles"] or []
    if not wsj_articles:
        # empty page: results exhausted
        break

    # read every article on the page (pageSize is 100, not 20)
    for article in wsj_articles:
        description = article.get("description")
        published = article.get("publishedAt")
        if description is None or not published:
            # skip incomplete records so both lists stay aligned and
            # no None reaches the sentiment scorer
            continue
        wsj_description.append(description)
        wsj_pub_date.append(published[:10])

# Fox

In [None]:
# url
news_base_url = "https://newsapi.org/v2/everything"
fox_source = "fox-news"
fox_topic = "stock"
fox_description = []
fox_pub_date = []

# News API free version limits the requests to 1,000 per day
# News API pages are 1-indexed, so request pages 1..20
fox_pages = range(1, 21)

for page in tqdm(fox_pages):
    fox_params = {
        "q": fox_topic,
        "sources": fox_source,
        "apiKey": news_api,
        "from": bd,
        "to": ed,
        "pageSize": 100,
        "page": page
    }

    try:
        # timeout so a stalled connection cannot hang the notebook
        fox_data = requests.get(news_base_url, params=fox_params, timeout=30).json()
    except (requests.RequestException, ValueError) as err:
        # network failure or non-JSON body: report it and try the next page
        tqdm.write("Fox page {} skipped: {}".format(page, err))
        continue

    if not isinstance(fox_data, dict) or "articles" not in fox_data:
        # no article list in the reply (an error payload): stop paging, keep what we have
        message = fox_data.get("message", fox_data) if isinstance(fox_data, dict) else fox_data
        tqdm.write("Fox paging stopped at page {}: {}".format(page, message))
        break

    fox_articles = fox_data["articles"] or []
    if not fox_articles:
        # empty page: results exhausted
        break

    # read every article on the page (pageSize is 100, not 20)
    for article in fox_articles:
        description = article.get("description")
        published = article.get("publishedAt")
        if description is None or not published:
            # skip incomplete records so both lists stay aligned and
            # no None reaches the sentiment scorer
            continue
        fox_description.append(description)
        fox_pub_date.append(published[:10])

# Vader Sentiment Score

In [None]:
# reusable scorer shared by all news sources
def vaderSentimentScoreCalculator(sentences):
    """Return the VADER 'compound' score of each sentence, in input order.

    A single SentimentIntensityAnalyzer is created per call and applied to
    every element of ``sentences``; the result is a list of the same length.
    """
    scorer = SentimentIntensityAnalyzer()
    return [scorer.polarity_scores(text)['compound'] for text in sentences]

In [None]:
# One list of compound scores per source, aligned with that source's text list
nyt_vaderscores = vaderSentimentScoreCalculator(snippet)
wsj_vaderscores = vaderSentimentScoreCalculator(wsj_description)
fox_vaderscores = vaderSentimentScoreCalculator(fox_description)

# Backward-compatible alias for the original misspelled name
wsj_vadersocres = wsj_vaderscores

# Establish DataFrame - This section needs rework

In [None]:
# add the day_lag back in order to merge:
# subtracting day_lag from each publication date undoes the shift applied to the
# query window, so news dates line up with the stock dates they are compared to
news_dates = [
    (getDataTimeFromISO(published).date() - datetime.timedelta(day_lag)).isoformat()
    for published in pub_date
]

# pub_date is filled by the NYT cell, so the matching scores are nyt_vaderscores
# (the previously referenced `vader_scores` exists only inside the scoring function)
news_df = pd.DataFrame({"Date": news_dates, "VaderScore (Day_Lag="+str(day_lag)+")": nyt_vaderscores})

In [None]:
# Display the per-article frame (one row per article: shifted date and VADER score)
news_df

In [None]:
# Average the scores of all articles that map to the same date
score_column = "VaderScore (Day_Lag="+str(day_lag)+")"
grouped_news_df = news_df.groupby("Date")

# reset_index() turns the group key into an ordinary "Date" column on a plain
# RangeIndex. Building the frame from the grouped Series directly left "Date"
# as both the index name and a column, which makes merge(on="Date") ambiguous.
adjusted_news_df = grouped_news_df[score_column].mean().reset_index()

In [None]:
# Inspect column dtypes of the per-date news frame before merging
adjusted_news_df.dtypes

In [None]:
# `stock_df` is not created by any earlier cell shown in this notebook.
# NOTE(review): combining the three per-index frames on their shared dates is an
# assumption about the intended content of stock_df — confirm.
stock_df = sap_df.merge(nas_df, on="Date").merge(dow_df, on="Date")
stock_df.dtypes

In [None]:
# join stock data with the per-date news scores on the shared "Date" column
merged_df = pd.merge(stock_df, adjusted_news_df, on="Date")

In [None]:
# Display the merged stock + news-sentiment frame
merged_df