In [1]:
import pandas as pd
import numpy as np
import requests
import yfinance as yf
from datetime import date
from datetime import datetime


In [2]:
from sklearn.decomposition import PCA

In [3]:
# Symbols to download; the 'SPY' series is later used to build the
# 'S&P 500 Daily Movement' label.
tickers = [
    'XTN',
    'HST',
    'SPY',
]

In [4]:
# Fetch price history for every symbol in `tickers` over the study window.
asset_data = yf.download(
    tickers,
    start='2020-03-01',
    end='2021-03-01',
)

[*********************100%***********************]  3 of 3 completed


In [5]:
# Keep only the closing prices from the downloaded data.
closes_df = asset_data.loc[:, 'Close']

In [6]:
# Day-over-day percentage change of the closing prices.
# The first row has no previous close to compare against, so it is NaN.
daily_returns = closes_df.pct_change()

In [7]:
# Keyword-interest scores, indexed by the parsed 'Week' column.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (strict format inference is the default there) and before that
# it was only a parsing-speed hint.
covid_interest = pd.read_csv(
    'Resources/covidKeywordInterest.csv',
    index_col='Week',
    parse_dates=True,
)

In [8]:
# Twitter sentiment scores, indexed by the parsed 'date' column.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (strict format inference is the default there) and before that
# it was only a parsing-speed hint.
twitter_sentiment = pd.read_csv(
    'Resources/sentiment.csv',
    index_col='date',
    parse_dates=True,
)

In [9]:
# World COVID-19 dataset, indexed by the parsed 'date' column; rows are later
# filtered on the 'location' column.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (strict format inference is the default there) and before that
# it was only a parsing-speed hint.
covid_data = pd.read_csv(
    'Resources/Covid-19-World-Dataset.csv',
    index_col='date',
    parse_dates=True,
)

In [10]:
# Inclusive bounds of the daily calendar built for the interest scores.
start, end = '2020-03-01', '2021-03-01'

In [11]:
# One row per calendar day from `start` to `end` (both inclusive), held in a
# 'date' column so further daily columns can be attached next to it.
df = pd.DataFrame(
    data={'date': pd.date_range(start=start, end=end, freq='D')},
)

In [12]:
# Take rows 8..61 (by position) of the 'Interest Score' column as a plain list.
# NOTE(review): presumably these 54 rows are the weeks covering the study
# window — confirm against the CSV, the positions are hard-coded.
weekly_scores = covid_interest['Interest Score'].iloc[8:62].tolist()

In [13]:
# Stretch the weekly series to daily resolution: every weekly score is
# repeated seven times, in order.
expanded_scores = [
    weekly_value
    for weekly_value in weekly_scores
    for _ in range(7)
]

In [14]:
# Drop the last 12 daily values. With 54 weekly scores this leaves
# 54 * 7 - 12 = 366 entries, the number of days in the 2020-03-01..2021-03-01
# calendar the scores are attached to.
ex_scs = expanded_scores[:-12]

In [15]:
# Attach the daily interest scores to the calendar frame; the list length must
# match the number of rows in `df`.
df = df.assign(**{'Interest Score': ex_scs})

In [16]:
# Promote the 'date' column to the index so the frame can be aligned with the
# other date-indexed tables.
df = df.set_index('date')

In [17]:
# Put the interest scores and the daily returns side by side; the inner join
# keeps only the dates present in both frames.
updated_df = pd.concat(
    (df, daily_returns),
    axis='columns',
    join='inner',
)

In [18]:
# Pull out the 'compound' sentiment score column.
compound = twitter_sentiment.loc[:, 'compound']

In [19]:
# Plain Python list of the compound scores, in row order.
c_list = compound.tolist()

In [20]:
# Bucket every compound score into a text label:
#   above  0.05 -> 'positive'
#   below -0.05 -> 'negative'
#   anything else (including exactly +/-0.05) -> 'neutral'
# NOTE(review): both comparisons are strict; confirm that the boundary values
# are meant to be 'neutral'.
sentiments = [
    'positive' if value > .05
    else 'negative' if value < -.05
    else 'neutral'
    for value in c_list
]

In [21]:
# Store the text labels next to the scores they were derived from.
twitter_sentiment = twitter_sentiment.assign(sentiment=sentiments)

In [22]:
# Add the Twitter sentiment columns; the inner join keeps only the dates
# present in both frames.
master_df = pd.concat(
    (updated_df, twitter_sentiment),
    axis='columns',
    join='inner',
)

In [23]:
# Keep only the rows whose 'location' is the United States.
covid_df = covid_data[covid_data['location'].eq("United States")]

In [24]:
# Keep the four columns at positions 3, 4, 5 and 6.
# NOTE(review): columns are picked by position, so this depends on the CSV's
# column order — confirm which fields these are.
covid_df = covid_df.iloc[:, list(range(3, 7))]

In [25]:
# Discard every row that has a missing value in any of the kept columns.
covid_df = covid_df.dropna()

In [26]:
# Restrict the rows to the 2020-03-02 .. 2021-02-26 window (label-based slice
# on the date index, both ends included).
covid_df = covid_df.loc[slice('2020-03-02', '2021-02-26'), :]

In [27]:
# Collapse the selected COVID columns into a single principal component.
# NOTE(review): the columns are fed to PCA without any scaling step, so the
# component is presumably dominated by whichever column has the largest
# variance — confirm this is intended.
pca = PCA(n_components=1)
covid_pca = pca.fit_transform(covid_df)

In [28]:
# Wrap the principal-component scores in a DataFrame that reuses covid_df's
# own index, so every score stays attached to the date of the row it was
# computed from. Rebuilding the index from a fixed daily date range would
# raise a length-mismatch error (or mislabel rows) whenever covid_df lacks a
# row for some calendar day in the window, e.g. after dropna().
covid_pca_df = pd.DataFrame(
    data=covid_pca,
    columns=['Principal Component 1'],
    index=covid_df.index,
)

In [29]:
# Build two variants of the master table: one with the raw COVID columns and
# one with their single principal component. Inner joins keep only the dates
# present on both sides.
master_df_no_pca = pd.concat(
    (master_df, covid_df),
    axis='columns',
    join='inner',
)
master_df_pca = pd.concat(
    (master_df, covid_pca_df),
    axis='columns',
    join='inner',
)

In [30]:
# Remove every row that still contains a missing value in either variant.
master_df_no_pca = master_df_no_pca.dropna()
master_df_pca = master_df_pca.dropna()

In [31]:
# Label each day by the sign of that day's SPY return: 'pos' for a strictly
# positive return, 'neg' otherwise (a zero return counts as 'neg').
# Each frame is labelled from its OWN 'SPY' column, so the labels cannot be
# misaligned (or turn into NaN through index alignment) if the two frames
# ever end up with different sets of dates.
master_df_no_pca['S&P 500 Daily Movement'] = np.where(
    master_df_no_pca['SPY'] > 0, 'pos', 'neg'
)
master_df_pca['S&P 500 Daily Movement'] = np.where(
    master_df_pca['SPY'] > 0, 'pos', 'neg'
)

In [32]:
# Drop the numeric sentiment score columns from both variants; the text
# 'sentiment' label is kept.
master_df_no_pca = master_df_no_pca.drop(columns=['neg', 'neu', 'pos', 'compound'])
master_df_pca = master_df_pca.drop(columns=['neg', 'neu', 'pos', 'compound'])

In [33]:
# Supervised sets: keep the 'S&P 500 Daily Movement' label but drop the raw
# SPY return it was derived from.
supervised_pca = master_df_pca.drop(columns='SPY')
supervised_no_pca = master_df_no_pca.drop(columns='SPY')

# Unsupervised sets: the same frames with the label removed as well.
unsupervised_pca = supervised_pca.drop(columns='S&P 500 Daily Movement')
unsupervised_no_pca = supervised_no_pca.drop(columns='S&P 500 Daily Movement')

In [34]:
# Write each prepared frame to its own CSV file under Model-Data/.
for frame, destination in (
    (supervised_pca, 'Model-Data/supervised_pca.csv'),
    (supervised_no_pca, 'Model-Data/supervised_no_pca.csv'),
    (unsupervised_pca, 'Model-Data/unsupervised_pca.csv'),
    (unsupervised_no_pca, 'Model-Data/unsupervised_no_pca.csv'),
):
    frame.to_csv(destination)