In [1]:
# dependencies
import os
from tqdm import tqdm
import datetime
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import APIs

from NYTAPI import nyt_api
from newsapi import news_api
from AlphaAPI import alpha_api

In [2]:
%matplotlib notebook

# Stock Data

In [3]:
nas = "NDAQ"
dow = "DJI"
sap = "SPX"

sap_base_url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="+sap+"&apikey="+alpha_api
nas_base_url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="+nas+"&apikey="+alpha_api
dow_base_url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="+dow+"&apikey="+alpha_api


In [4]:
sap_data = requests.get(sap_base_url).json()
nas_data = requests.get(nas_base_url).json()
dow_data = requests.get(dow_base_url).json()

In [5]:
# set dates
stock_dates = []

start_date = datetime.date(2018,7,1)

for i in range(60):
    stock_date = start_date + datetime.timedelta(i)
    stock_dates.append(stock_date.isoformat())

In [6]:
sap_close_prices = []
nas_close_prices = []
dow_close_prices = []
sap_volumes = []
nas_volumes = []
dow_volumes = []
sap_dates_output = []
nas_dates_output = []
dow_dates_output = []

for stock_date in tqdm(stock_dates):
    # There are holidays and weekends
    try:
        sap_close_prices.append(sap_data["Time Series (Daily)"][stock_date]["4. close"])
        nas_close_prices.append(nas_data["Time Series (Daily)"][stock_date]["4. close"])
        dow_close_prices.append(dow_data["Time Series (Daily)"][stock_date]["4. close"])
        
        sap_volumes.append(sap_data["Time Series (Daily)"][stock_date]["5. volume"])
        nas_volumes.append(nas_data["Time Series (Daily)"][stock_date]["5. volume"])
        dow_volumes.append(dow_data["Time Series (Daily)"][stock_date]["5. volume"])
        
        if stock_date in sap_data["Time Series (Daily)"].keys():
            sap_dates_output.append(stock_date)
        if stock_date in nas_data["Time Series (Daily)"].keys():
            nas_dates_output.append(stock_date)
        if stock_date in dow_data["Time Series (Daily)"].keys():
            dow_dates_output.append(stock_date)
    except:
        pass

100%|██████████| 60/60 [00:00<00:00, 90916.99it/s]


In [7]:
sap_df = pd.DataFrame({"Date": sap_dates_output, "S&P Close Price": pd.to_numeric(sap_close_prices), "S&P Volume": pd.to_numeric(sap_volumes)})
nas_df = pd.DataFrame({"Date": nas_dates_output, "NAS Close Price": pd.to_numeric(nas_close_prices), "NAS Volume": pd.to_numeric(nas_volumes)})
dow_df = pd.DataFrame({"Date": dow_dates_output, "DOW Close Price": pd.to_numeric(dow_close_prices), "DOW Volume": pd.to_numeric(dow_volumes)})
sap_df["S&P Volume"] = sap_df["S&P Volume"].map("{:,}".format)
nas_df["NAS Volume"] = nas_df["NAS Volume"].map("{:,}".format)
dow_df["DOW Volume"] = dow_df["DOW Volume"].map("{:,}".format)

print(sap_df.dtypes)
nas_df

Date                object
S&P Close Price    float64
S&P Volume          object
dtype: object


Unnamed: 0,Date,NAS Close Price,NAS Volume
0,2018-07-02,91.67,573510
1,2018-07-03,90.14,505302
2,2018-07-05,90.33,616651
3,2018-07-06,90.11,791671
4,2018-07-09,91.18,733839
5,2018-07-10,90.76,821187
6,2018-07-11,91.38,731418
7,2018-07-12,93.37,1184372
8,2018-07-13,93.11,608740
9,2018-07-16,93.89,493055


# New York Times

In [8]:
print(sap_dates_output[0])
print(sap_dates_output[-1])

2018-07-02
2018-08-29


In [9]:
begin_date = sap_dates_output[0][:4] + sap_dates_output[0][5:7] + sap_dates_output[0][8:10]
end_date = sap_dates_output[-1][:4] + sap_dates_output[-1][5:7] + sap_dates_output[-1][8:10]
print(begin_date, end_date)

20180702 20180829


In [10]:
# prepare variables
# make sure we can loop through all the articles we get
n = 200
pages = list(range(n))
snippet = []
pub_date = []

# urls
nyt_base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# get data from urls
for page in tqdm(pages):
    params = {
        "api-key": nyt_api,
        "q": "Stock",
        "begin_date": begin_date,
        "end_date": end_date,
        "sort": "newest",
        "fl": ["snippet","pub_date"],
        "page": page
    }

    try: 
        nyt_data = requests.get(nyt_base_url, params=params).json()
        # loop through 10 articles on each page
        for i in range(10):
            snippet.append(nyt_data["response"]["docs"][i]["snippet"])
            interm_date = nyt_data["response"]["docs"][i]["pub_date"]
            pub_date.append(interm_date[:4]+interm_date[5:7]+interm_date[8:10])
        
    except:
        pass

100%|██████████| 200/200 [00:52<00:00,  3.83it/s]


# Vader Sentiment Score

In [11]:
sentences = snippet
vader_scores = []

In [12]:
analyzer = SentimentIntensityAnalyzer()

In [13]:
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    vader_scores.append(vs['compound'])

# Establish DataFrame

In [14]:
# convert pub_date to numeric for later data processing
news_dates = []

for d in pub_date:
    d = str(d)
    news_dates.append(d[:4] + "-" + d[4:6] + "-" + d[6:8])

news_df = pd.DataFrame({"Date": news_dates, "VaderScore": vader_scores})

In [15]:
news_df

Unnamed: 0,Date,VaderScore
0,2018-08-28,0.4215
1,2018-08-28,-0.3612
2,2018-08-28,0.0000
3,2018-08-28,0.0000
4,2018-08-28,0.0000
5,2018-08-28,0.0000
6,2018-08-28,0.0000
7,2018-08-28,0.8750
8,2018-08-28,0.7096
9,2018-08-28,-0.5106


In [16]:
grouped_news_df = news_df.groupby("Date")
adjusted_news_df = pd.DataFrame({
    "Date": grouped_news_df.count().index,
    "VaderScore": grouped_news_df["VaderScore"].mean()
})

In [17]:
adjusted_news_df.dtypes

Date           object
VaderScore    float64
dtype: object

In [18]:
sap_df.dtypes

Date                object
S&P Close Price    float64
S&P Volume          object
dtype: object

In [19]:
# merge stock_df and adjusted_news_df
merged_df = sap_df.merge(adjusted_news_df, on="Date")

Defaulting to column, but this will raise an ambiguity error in a future version
  


In [20]:
merged_df

Unnamed: 0,Date,S&P Close Price,S&P Volume,VaderScore
0,2018-07-02,2726.71,3073650000,0.144712
1,2018-07-06,2759.8201,2554780000,0.2884
2,2018-07-10,2793.8401,3063850000,0.0
3,2018-07-11,2774.02,2964740000,0.017514
4,2018-07-16,2798.4299,2812230000,-0.136763
5,2018-07-17,2809.55,3050730000,-0.2274
6,2018-07-19,2804.49,3266700000,0.311522
7,2018-07-20,2801.8301,3230210000,-0.128
8,2018-07-23,2806.98,2907430000,-0.014444
9,2018-07-25,2846.0701,3553010000,0.230371
