In [8]:
# dependencies
import os
from tqdm import tqdm
import datetime
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import APIs

from NYTAPI import nyt_api
from newsapi import news_api
from AlphaAPI import alpha_api

In [9]:
# Select matplotlib's "notebook" backend for figures drawn in this notebook.
# NOTE(review): this backend depends on the front end in use — confirm it is
# still available in the environment this notebook is run in.
%matplotlib notebook

# Stock Data

In [10]:
# Ticker symbols for the three indices queried in this notebook.
nas, dow, sap = "NDAQ", "DJI", "SPX"

# Every request hits the same TIME_SERIES_DAILY endpoint; only the symbol in
# the middle of the query string differs between the three URLs.
daily_query_prefix = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="
api_key_suffix = "&apikey=" + alpha_api

sap_base_url = daily_query_prefix + sap + api_key_suffix
nas_base_url = daily_query_prefix + nas + api_key_suffix
dow_base_url = daily_query_prefix + dow + api_key_suffix


In [11]:
def _fetch_daily_series(url):
    """Download one daily-series payload and return the decoded JSON dict.

    Parameters
    ----------
    url : str
        Fully built query URL (symbol and API key already included).

    Returns
    -------
    dict
        The decoded JSON body, guaranteed to contain "Time Series (Daily)".

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    RuntimeError
        When the body decodes but has no "Time Series (Daily)" entry. Later
        cells index that key directly, so failing here gives a clear error
        instead of silently empty price lists.
    """
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    if "Time Series (Daily)" not in payload:
        # Report only the returned keys: the URL itself carries the API key.
        raise RuntimeError(
            "Response has no 'Time Series (Daily)' entry; keys returned: {}".format(list(payload))
        )
    return payload


sap_data = _fetch_daily_series(sap_base_url)
nas_data = _fetch_daily_series(nas_base_url)
dow_data = _fetch_daily_series(dow_base_url)

In [12]:
# Calendar window to look up: 60 consecutive days starting on 2018-07-01,
# stored as ISO "YYYY-MM-DD" strings.
start_date = datetime.date(2018, 7, 1)

stock_dates = [
    (start_date + datetime.timedelta(days=offset)).isoformat()
    for offset in range(60)
]

In [16]:
# One (dates, closes, volumes) triple per index. The three lists of a triple
# always grow together, so they stay the same length.
sap_dates_output, sap_close_prices, sap_volumes = [], [], []
nas_dates_output, nas_close_prices, nas_volumes = [], [], []
dow_dates_output, dow_close_prices, dow_volumes = [], [], []


def _record_bar(series, day, dates_out, closes_out, volumes_out):
    """Append one day's close and volume for a single index, if present.

    Parameters
    ----------
    series : dict
        Mapping of "YYYY-MM-DD" -> daily bar dict for one index.
    day : str
        ISO date to look up.
    dates_out, closes_out, volumes_out : list
        Output lists, extended in place by exactly one item each when the
        day exists in `series`, and left untouched otherwise.

    Raises
    ------
    KeyError
        If a bar exists but lacks "4. close" or "5. volume".
    """
    bar = series.get(day)
    if bar is None:
        # There are holidays and weekends: no bar for this date.
        return
    # Read both fields before appending so a malformed bar cannot leave the
    # three output lists with different lengths.
    close, volume = bar["4. close"], bar["5. volume"]
    dates_out.append(day)
    closes_out.append(close)
    volumes_out.append(volume)


# Indexing here (outside any try) surfaces a payload that carries no daily
# series as a KeyError instead of producing silently empty lists.
sap_series = sap_data["Time Series (Daily)"]
nas_series = nas_data["Time Series (Daily)"]
dow_series = dow_data["Time Series (Daily)"]

for stock_date in tqdm(stock_dates):
    # Each index is handled independently: a date missing from one index no
    # longer interrupts collection for the other two.
    _record_bar(sap_series, stock_date, sap_dates_output, sap_close_prices, sap_volumes)
    _record_bar(nas_series, stock_date, nas_dates_output, nas_close_prices, nas_volumes)
    _record_bar(dow_series, stock_date, dow_dates_output, dow_close_prices, dow_volumes)

100%|██████████| 60/60 [00:00<00:00, 103819.41it/s]


In [25]:
def _index_frame(label, dates, closes, volumes):
    """Assemble the frame for one index.

    Columns are "Date", "<label> Close Price" (numeric) and "<label> Volume".
    The volume column is converted to numbers and then formatted with
    thousands separators, so it ends up holding strings.
    """
    close_col = label + " Close Price"
    volume_col = label + " Volume"
    frame = pd.DataFrame({
        "Date": dates,
        close_col: pd.to_numeric(closes),
        volume_col: pd.to_numeric(volumes),
    })
    frame[volume_col] = frame[volume_col].map("{:,}".format)
    return frame


sap_df = _index_frame("S&P", sap_dates_output, sap_close_prices, sap_volumes)
nas_df = _index_frame("NAS", nas_dates_output, nas_close_prices, nas_volumes)
dow_df = _index_frame("DOW", dow_dates_output, dow_close_prices, dow_volumes)

# Show the S&P column dtypes, then display the DOW frame as the cell output.
print(sap_df.dtypes)
dow_df

Date                object
S&P Close Price    float64
S&P Volume          object
dtype: object


Unnamed: 0,Date,DOW Close Price,DOW Volume
0,2018-07-02,24307.1797,244110000
1,2018-07-03,24174.8203,160960000
2,2018-07-05,24356.7402,237670000
3,2018-07-06,24456.4805,219450000
4,2018-07-09,24776.5898,240580000
5,2018-07-10,24919.6602,235040000
6,2018-07-11,24700.4492,237370000
7,2018-07-12,24924.8906,233150000
8,2018-07-13,25019.4102,255520000
9,2018-07-16,25064.3594,231270000


# New York Times

In [None]:
# First and last dates collected for the S&P series.
# The S&P list is used because no plain `dates_output` is defined by the
# cells of this notebook, so that name fails on a fresh kernel.
print(sap_dates_output[0])
print(sap_dates_output[-1])

In [None]:
# Compact the first and last S&P dates from ISO "YYYY-MM-DD" to "YYYYMMDD".
# The S&P list is used because no plain `dates_output` is defined by the
# cells of this notebook, so that name fails on a fresh kernel.
begin_date = sap_dates_output[0].replace("-", "")
end_date = sap_dates_output[-1].replace("-", "")
print(begin_date, end_date)

In [None]:
# Paging setup: request up to `n` result pages and collect, per article, its
# snippet and its publication date compacted to "YYYYMMDD". `snippet` and
# `pub_date` are parallel lists and are always appended to together.
n = 200
pages = list(range(n))
snippet = []
pub_date = []
failed_pages = []  # page numbers whose request or payload could not be used

# urls
nyt_base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# get data from urls
for page in tqdm(pages):
    params = {
        "api-key": nyt_api,
        "q": "Stock",
        "begin_date": begin_date,
        "end_date": end_date,
        "sort": "newest",
        "fl": ["snippet","pub_date"],
        "page": page
    }

    try:
        # A timeout keeps one stalled request from hanging the whole loop.
        response = requests.get(nyt_base_url, params=params, timeout=30)
        response.raise_for_status()
        docs = response.json()["response"]["docs"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: skip this page but remember it. The exception text is
        # deliberately not stored, because it can include the request URL and
        # therefore the API key.
        failed_pages.append(page)
        continue

    if not docs:
        # An empty page means the results are exhausted; later pages would be
        # empty as well.
        break

    # Iterate over the articles actually returned rather than assuming ten.
    for doc in docs:
        text = doc.get("snippet")
        stamp = doc.get("pub_date")
        if text is None or stamp is None:
            # Skip incomplete records so both lists keep the same length.
            continue
        snippet.append(text)
        pub_date.append(stamp[:4] + stamp[5:7] + stamp[8:10])

if failed_pages:
    print("Skipped {} of {} pages (request failed or response malformed)".format(len(failed_pages), n))

# Vader Sentiment Score

In [None]:
# Start with an empty score list; the texts to score are the collected snippets.
vader_scores = list()
sentences = snippet

In [None]:
# One analyzer instance, created once and reused for every sentence scored.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score every sentence and keep only the "compound" value.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell cannot accumulate extra scores and always yields one per sentence.
vader_scores = [
    analyzer.polarity_scores(sentence)["compound"]
    for sentence in sentences
]

# Establish DataFrame

In [None]:
# Re-insert dashes so each compact "YYYYMMDD" publication date becomes an ISO
# "YYYY-MM-DD" string (text, not a numeric or datetime value).
news_dates = [
    "-".join((stamp[:4], stamp[4:6], stamp[6:8]))
    for stamp in map(str, pub_date)
]

# One row per article: its date and its compound sentiment score.
news_df = pd.DataFrame({"Date": news_dates, "VaderScore": vader_scores})

In [None]:
# Display the per-article frame ("Date" and "VaderScore" columns).
news_df

In [None]:
# Average the compound score per calendar date.
grouped_news_df = news_df.groupby("Date")

# reset_index() turns the group key into an ordinary "Date" column and leaves
# a plain integer index. Keeping "Date" as both the index name and a column
# makes a later merge on "Date" ambiguous.
adjusted_news_df = grouped_news_df["VaderScore"].mean().reset_index()

In [None]:
# Inspect the column dtypes of the per-date sentiment frame.
adjusted_news_df.dtypes

In [None]:
# Inspect the column dtypes of the S&P frame.
sap_df.dtypes

In [None]:
# Inner-join the S&P frame with the per-date sentiment frame on "Date";
# only dates present in both frames are kept.
merged_df = pd.merge(sap_df, adjusted_news_df, how="inner", on="Date")

In [None]:
# Display the merged S&P / sentiment frame.
merged_df