In [135]:
# libraries
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import pycurl as pycurl
from io import BytesIO
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import matplotlib.pyplot as plt

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/cksidharthan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Create new folder
try:  
    os.mkdir('./html_files')  
except OSError as error:  
    print(error)  

[Errno 17] File exists: './html_files'


In [3]:
# Delete all the files in the html_files folder
file_path = './html_files/'
file_list = os.listdir('./html_files')
for file in file_list:
    os.remove(file_path + file)

In [4]:
# Get values from Users
company_list = []
url_list = []
finance_url = 'https://finviz.com/quote.ashx?t='
num_companies = int(input('Enter the Number of companies to analyse: '))
for company in range(num_companies):
    company_stock_name = input('Enter the Stock name of the company: ')
    company_list.append(company_stock_name.upper())
    url_list.append(finance_url + company_stock_name)

Enter the Number of companies to analyse: 2
Enter the Stock name of the company: AAPL
Enter the Stock name of the company: AMZN


In [5]:
# Website Downloader
def url_downloader(url):
    byte_obj = BytesIO() 
    curl = pycurl.Curl() 
    curl.setopt(curl.URL, url)
    curl.setopt(curl.WRITEDATA, byte_obj)
    curl.perform() 
    curl.close()
    # Get the content stored in the BytesIO object (in byte characters) 
    get_body = byte_obj.getvalue()
    # Decode the bytes stored in get_body to HTML and print the result 
    return get_body.decode('utf8')

In [6]:
# Write website html to file
def write_to_file(file_name, file_data):
    f = open( file_path + file_name + '.html', 'w')
    f.write(file_data)
    f.close()

In [7]:
# Iterate over company list and download the website html
for company_index in range(num_companies):
    try:
        website_cache = url_downloader(url_list[company_index])
        write_to_file(company_list[company_index], website_cache)
    except:
        raise Exception('Website not found')

In [109]:
# read files from html_files folder
# data dictionary
html_pages = {}

for page_name in os.listdir('./html_files'):
    page_path = f'./html_files/{page_name}'
    page_file = open(page_path, 'r')
    html_data = BeautifulSoup(page_file)
    html_page = html_data.find(id = 'news-table')
    html_pages[page_name] = html_page

In [127]:
# Get Hyperlinks and headlines 
def get_hyperlinks_headlines(company_name, company_data):
    data_list = []
    tr_data = company_data.findAll('tr')
    for index, tr in enumerate(tr_data):
        data = []
        headlines = tr.a.get_text().strip()
        date_td = tr.td.get_text().strip()
        if (len(date_td.split(' ')) > 1):
            date = date_td.split(' ')[0]
            time = date_td.split(' ')[1]
        else:
            time = date_td
        data.append(company_name.split('.')[0])
        data.append(headlines)
        data.append(date)
        data.append(time)
        data_list.append(data)
    return data_list

In [128]:
total_company_data = []
for page_name, news_table in html_pages.items():
    total_company_data += get_hyperlinks_headlines(page_name, html_pages[page_name])
print(total_company_data)

[['AMZN', 'Amazon to Open New Warehouse in New York, Hire 1,000 Workers', 'May-18-20', '05:36PM'], ['AMZN', 'Mario Gabelli Adds to Amazon Holding in 1st Quarter', 'May-18-20', '05:27PM'], ['AMZN', 'FedEx Pops On Alliance with Microsoft, In Challenge To Amazon', 'May-18-20', '04:10PM'], ['AMZN', 'Warren Buffett, Top Funds Dumped These Stocks But Key Buys Stand Out', 'May-18-20', '04:03PM'], ['AMZN', 'The best bet might be to stay on the sidelines, according to this chart of winners and losers in this pandemic', 'May-18-20', '03:57PM'], ['AMZN', 'Dow Jones News: Microsoft Partners With FedEx; Apple Stock Rises as Stores Slowly Reopen', 'May-18-20', '02:02PM'], ['AMZN', 'Microsoft Bets on Startups to Boost Growth', 'May-18-20', '01:52PM'], ['AMZN', 'CEO of North Face and Vans owner says he is hunting for deals', 'May-18-20', '01:21PM'], ['AMZN', 'Best Buy Jumps Almost 10% on Analyst Upgrade', 'May-18-20', '01:09PM'], ['AMZN', 'J.C. Penney bankruptcy aftermath: 700 very large stores may co

In [137]:
columns = ['stock_name', 'headlines', 'date', 'time']
total_company_data_frame = pd.DataFrame(total_company_data, columns = columns)

In [149]:
total_company_data_frame

Unnamed: 0,stock_name,headlines,date,time
0,AMZN,"Amazon to Open New Warehouse in New York, Hire...",May-18-20,05:36PM
1,AMZN,Mario Gabelli Adds to Amazon Holding in 1st Qu...,May-18-20,05:27PM
2,AMZN,"FedEx Pops On Alliance with Microsoft, In Chal...",May-18-20,04:10PM
3,AMZN,"Warren Buffett, Top Funds Dumped These Stocks ...",May-18-20,04:03PM
4,AMZN,The best bet might be to stay on the sidelines...,May-18-20,03:57PM
...,...,...,...,...
195,AAPL,Trump is right that rich guys sometimes bet ag...,May-14-20,04:07PM
196,AAPL,The Top 10 Technology Companies,May-14-20,04:02PM
197,AAPL,"Apple Acquires NextVR, Setting Stage for Bigge...",May-14-20,03:03PM
198,AAPL,Market is 'going to be lower for longer': Stra...,May-14-20,02:57PM
