In [9]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
tickers = ['AMZN', 'TSLA', 'GOOG']
for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}) 
    response = urlopen(req)    
    html = BeautifulSoup(response)
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

In [19]:
# Read one single day of headlines for ‘AMZN’ 
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into ‘amzn_tr’
amzn_tr = amzn.findAll('tr')
for i, table_row in enumerate(amzn_tr):
 # Read the text of the element ‘a’ into ‘link_text’
 a_text = table_row.a.text
 # Read the text of the element ‘td’ into ‘data_text’
 td_text = table_row.td.text
 # Print the contents of ‘link_text’ and ‘data_text’ 
 print(a_text)
 print(td_text)
 # Exit after printing 4 rows of data
 if i == 3:
  break

20 Most Profitable Products to Sell Online in 2023
May-14-23 11:11PM
Seas Path to Profit Paved With Layoffs, Single-Ply Toilet Paper
07:30PM
Billionaire Ken Fishers Top 10 Stock Picks
07:15PM
4 Surprising Things About Costco's Kirkland Products
09:30AM


In [29]:
news_table.findAll('tr')[1].a.get_text()

'Seas Path to Profit Paved With Layoffs, Single-Ply Toilet Paper'

In [49]:
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        if type(x.a) != type(None):
          text = x.a.get_text() 
          date_scrape = x.td.text.split()
          if len(date_scrape) == 1:
              time = date_scrape[0] 
          else:
              date = date_scrape[0]
              time = date_scrape[1] 
          ticker = file_name.split('_')[0]
          
          parsed_news.append([ticker, date, time, text])
          
parsed_news[:5]

[['AMZN',
  'May-14-23',
  '11:11PM',
  '20 Most Profitable Products to Sell Online in 2023'],
 ['AMZN',
  'May-14-23',
  '07:30PM',
  'Seas Path to Profit Paved With Layoffs, Single-Ply Toilet Paper'],
 ['AMZN',
  'May-14-23',
  '07:15PM',
  'Billionaire Ken Fishers Top 10 Stock Picks'],
 ['AMZN',
  'May-14-23',
  '09:30AM',
  "4 Surprising Things About Costco's Kirkland Products"],
 ['AMZN',
  'May-14-23',
  '08:00AM',
  'Whole Foods vs. Trader Joes  Which Store Is Better for Your Money?']]

In [50]:
vader = SentimentIntensityAnalyzer()
columns = ['ticker', 'date', 'time', 'headline']
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
scores_df = pd.DataFrame(scores)
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
parsed_and_scored_news.head()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AMZN,2023-05-14,11:11PM,20 Most Profitable Products to Sell Online in ...,0.0,0.715,0.285,0.4927
1,AMZN,2023-05-14,07:30PM,"Seas Path to Profit Paved With Layoffs, Single...",0.0,0.756,0.244,0.4404
2,AMZN,2023-05-14,07:15PM,Billionaire Ken Fishers Top 10 Stock Picks,0.0,0.769,0.231,0.2023
3,AMZN,2023-05-14,09:30AM,4 Surprising Things About Costco's Kirkland Pr...,0.0,0.704,0.296,0.2732
4,AMZN,2023-05-14,08:00AM,Whole Foods vs. Trader Joes Which Store Is Be...,0.0,0.791,0.209,0.4404


In [51]:
amzn_scored_news = parsed_and_scored_news[parsed_and_scored_news['ticker'] == 'AMZN']
tsla_scored_news = parsed_and_scored_news[parsed_and_scored_news['ticker'] == 'TSLA']
goog_scored_news = parsed_and_scored_news[parsed_and_scored_news['ticker'] == 'GOOG']

In [52]:
import yfinance as yf

In [62]:
amzn = yf.Ticker('AMZN')
amzn_price = amzn.history(interval='1m', start=amzn_scored_news['date'].iloc[-1], end=amzn_scored_news['date'].iloc[0])['Close']
amzn_price.index = amzn_price.index.tz_convert(None)

In [63]:
from datetime import datetime
datetime_objs = []
for index, row in amzn_scored_news.iterrows():
    datetime_str = datetime.strftime(row['date'], '%Y-%m-%d') + ' ' + row['time']
    datetime_obj = datetime.strptime(datetime_str, '%Y-%m-%d %I:%M%p')
    datetime_objs.append(datetime_obj)

In [68]:
amzn_signal = amzn_scored_news[['neg', 'pos']]
amzn_signal.index = datetime_objs
amzn_signal = amzn_signal[str(amzn_price.index[0]):str(amzn_price.index[-1])].sort_index()
amzn_signal

  amzn_signal = amzn_signal[str(amzn_price.index[0]):str(amzn_price.index[-1])].sort_index()


Unnamed: 0,neg,pos
2023-05-08 13:31:00,0.0,0.322
2023-05-08 13:35:00,0.0,0.145
2023-05-08 14:05:00,0.0,0.752
2023-05-08 14:40:00,0.0,0.175
2023-05-08 15:56:00,0.0,0.000
...,...,...
2023-05-12 15:53:00,0.0,0.000
2023-05-12 16:00:00,0.0,0.438
2023-05-12 16:08:00,0.0,0.000
2023-05-12 16:18:00,0.0,0.000


In [76]:
amzn_trade = pd.merge(amzn_price, amzn_signal, left_index=True, right_index=True)
amzn_trade['position'] = amzn_trade['pos'] - amzn_trade['neg']
amzn_trade['return'] = amzn_trade['Close'].pct_change().shift(-1)
ret = amzn_trade['position']*amzn_trade['return']
(1+ret).cumprod()

2023-05-08 13:31:00    1.000583
2023-05-08 13:35:00    1.000831
2023-05-08 14:05:00    1.000068
2023-05-08 14:40:00    1.000162
2023-05-08 15:56:00    1.000162
2023-05-08 16:42:00    1.000959
2023-05-08 17:23:00    1.001505
2023-05-08 18:59:00    1.002613
2023-05-09 15:01:00    1.003024
2023-05-09 15:36:00    1.002439
2023-05-09 16:00:00    1.004360
2023-05-09 17:25:00    1.004360
2023-05-10 14:47:00    1.003922
2023-05-10 14:56:00    1.001609
2023-05-10 15:39:00    1.001609
2023-05-10 15:39:00    1.000877
2023-05-10 17:15:00    1.000580
2023-05-10 18:00:00    1.001683
2023-05-10 19:00:00    1.002762
2023-05-10 19:54:00    1.003841
2023-05-11 13:36:00    1.004921
2023-05-11 14:06:00    1.004180
2023-05-11 16:35:00    1.004607
2023-05-11 16:43:00    1.004593
2023-05-11 19:05:00    1.005282
2023-05-11 19:42:00    1.007920
2023-05-12 15:06:00    1.005985
2023-05-12 15:37:00    1.006017
2023-05-12 15:53:00    1.006017
2023-05-12 16:00:00    1.005101
2023-05-12 16:08:00    1.005101
2023-05-

In [None]:
|