In [1]:
# dependencies
import os
from tqdm import tqdm
import datetime
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import APIs
from Keys.NYTAPI import nyt_api
from Keys.NewsAPI import news_api
from Keys.AlphaAPI import alpha_api

In [2]:
%matplotlib notebook

# Stock Data

In [3]:
stock_base_url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=SPX&apikey="+alpha_api

In [4]:
stock_data = requests.get(stock_base_url).json()

In [5]:
# set dates
stock_dates = []

start_date = datetime.date(2018,7,1)

for i in range(60):
    stock_date = start_date + datetime.timedelta(i)
    stock_dates.append(stock_date.isoformat())

In [6]:
close_prices = []
volumes = []
dates_output = []

for stock_date in tqdm(stock_dates):
    # There are holidays and weekends
    try:
        close_prices.append(stock_data["Time Series (Daily)"][stock_date]["4. close"])
        volumes.append(stock_data["Time Series (Daily)"][stock_date]["5. volume"])
        if stock_date in stock_data["Time Series (Daily)"].keys():
            dates_output.append(stock_date)
    except:
        pass

100%|██████████| 60/60 [00:00<00:00, 94289.34it/s]


In [7]:
stock_df = pd.DataFrame({"Date": dates_output, "S&P Close Price": pd.to_numeric(close_prices), "S&P Volume": pd.to_numeric(volumes)})
stock_df["S&P Volume"] = stock_df["S&P Volume"].map("{:,}".format)

print(stock_df.dtypes)
stock_df

Date                object
S&P Close Price    float64
S&P Volume          object
dtype: object


Unnamed: 0,Date,S&P Close Price,S&P Volume
0,2018-07-02,2726.71,3073650000
1,2018-07-03,2713.22,1911470000
2,2018-07-05,2736.6101,2953420000
3,2018-07-06,2759.8201,2554780000
4,2018-07-09,2784.1699,3050040000
5,2018-07-10,2793.8401,3063850000
6,2018-07-11,2774.02,2964740000
7,2018-07-12,2798.29,2821690000
8,2018-07-13,2801.3101,2614000000
9,2018-07-16,2798.4299,2812230000


# New York Times

In [8]:
print(dates_output[0])
print(dates_output[-1])

2018-07-02
2018-08-24


In [9]:
begin_date = dates_output[0][:4] + dates_output[0][5:7] + dates_output[0][8:10]
end_date = dates_output[-1][:4] + dates_output[-1][5:7] + dates_output[-1][8:10]
print(begin_date, end_date)

20180702 20180824


In [10]:
# prepare variables
# make sure we can loop through all the articles we get
n = 200
pages = list(range(n))
snippet = []
pub_date = []

# urls
nyt_base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# get data from urls
for page in tqdm(pages):
    params = {
        "api-key": nyt_api,
        "q": "Stock",
        "begin_date": begin_date,
        "end_date": end_date,
        "sort": "newest",
        "fl": ["snippet","pub_date"],
        "page": page
    }

    try: 
        nyt_data = requests.get(nyt_base_url, params=params).json()
        # loop through 10 articles on each page
        for i in range(10):
            snippet.append(nyt_data["response"]["docs"][i]["snippet"])
            interm_date = nyt_data["response"]["docs"][i]["pub_date"]
            pub_date.append(interm_date[:4]+interm_date[5:7]+interm_date[8:10])
        
    except:
        pass

100%|██████████| 200/200 [00:48<00:00,  4.09it/s]


# Vader Sentiment Score

In [11]:
sentences = snippet
vader_scores = []

In [12]:
analyzer = SentimentIntensityAnalyzer()

In [13]:
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    vader_scores.append(vs['compound'])

# Establish DataFrame

In [14]:
# convert pub_date to numeric for later data processing
news_dates = []

for d in pub_date:
    d = str(d)
    news_dates.append(d[:4] + "-" + d[4:6] + "-" + d[6:8])

news_df = pd.DataFrame({"Date": news_dates, "VaderScore": vader_scores})

In [15]:
news_df

Unnamed: 0,Date,VaderScore
0,2018-08-23,0.3612
1,2018-08-23,-0.2263
2,2018-08-23,0.3612
3,2018-08-23,0.8591
4,2018-08-23,-0.6908
5,2018-08-23,0.0000
6,2018-08-23,-0.7351
7,2018-08-23,0.2716
8,2018-08-23,0.0000
9,2018-08-23,0.0000


In [16]:
grouped_news_df = news_df.groupby("Date")
adjusted_news_df = pd.DataFrame({
    "Date": grouped_news_df.count().index,
    "VaderScore": grouped_news_df["VaderScore"].mean()
})

In [17]:
adjusted_news_df.dtypes

Date           object
VaderScore    float64
dtype: object

In [18]:
stock_df.dtypes

Date                object
S&P Close Price    float64
S&P Volume          object
dtype: object

In [19]:
# merge stock_df and adjusted_news_df
merged_df = stock_df.merge(adjusted_news_df, on="Date")

Defaulting to column, but this will raise an ambiguity error in a future version
  from ipykernel import kernelapp as app


In [20]:
merged_df

Unnamed: 0,Date,S&P Close Price,S&P Volume,VaderScore
0,2018-07-03,2713.22,1911470000,0.07224
1,2018-07-06,2759.8201,2554780000,0.1037
2,2018-07-12,2798.29,2821690000,-0.1905
3,2018-07-13,2801.3101,2614000000,-0.036783
4,2018-07-16,2798.4299,2812230000,-0.02305
5,2018-07-19,2804.49,3266700000,0.20694
6,2018-07-24,2820.3999,3417530000,0.118333
7,2018-07-25,2846.0701,3553010000,-0.097425
8,2018-07-27,2818.8201,3415710000,0.29068
9,2018-07-31,2816.29,3892100000,0.16037
