In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import finnhub

In [2]:
# Set up the Finnhub API client.
# The API key is imported from a local `config` module instead of being written inline in this notebook.
from config import api_finn
finnhub_client = finnhub.Client(api_key=api_finn)

In [3]:
# Pull in Gamestop daily ('D') candles from finnhub.
# The two integers are Unix-epoch second bounds for the request window:
# 1599886800 = 2020-09-12 05:00 UTC, 1612159200 = 2021-02-01 06:00 UTC.
data_GME = finnhub_client.stock_candles('GME', 'D', 1599886800, 1612159200)

# Finnhub reports a status field "s" ("ok" / "no_data"). Check it up front so an
# empty response fails with a readable message instead of the opaque
# "If using all scalar values, you must pass an index" error from pd.DataFrame.
if data_GME.get("s") != "ok":
    raise ValueError(
        f"Finnhub returned no usable candle data for 'GME' (status={data_GME.get('s')!r})"
    )

# Convert to Pandas Dataframe
GME_new = pd.DataFrame(data_GME)

# Convert the epoch-second timestamps to datetimes truncated to the calendar day,
# so the Date column has the same shape as in the VTSAX / DOGE cells.
GME_series = pd.to_datetime(GME_new["t"], unit="s").dt.normalize()

# Merge the data frame with the cleaned series on the row index.
# Both sides are named "t", so pandas suffixes them: t_x (raw epoch) and t_y (datetime).
GME_df = pd.merge(GME_new, GME_series, left_index=True, right_index=True)

# Keep only the price/volume columns plus the converted date (drops "s" and the raw "t_x")
GME_update = GME_df[["c", "h", "l", "o", "v", "t_y"]]

# Give the column headings proper titles
GME_final_df = GME_update.rename(
    columns={
        "c": "Closing_price",
        "h": "High_price",
        "l": "Low_price",
        "o": "Opening_price",
        "v": "Market_volume",
        "t_y": "Date",
    }
)
GME_final_df

Unnamed: 0,Closing_price,High_price,Low_price,Opening_price,Market_volume,Date
0,6.91,7.00,6.430,6.80,10118972,2020-09-14
1,7.09,7.26,6.685,6.86,5743489,2020-09-15
2,8.68,9.04,7.030,7.03,19346508,2020-09-16
3,9.20,9.77,8.410,8.57,17026720,2020-09-17
4,9.47,9.77,8.910,9.20,17407513,2020-09-18
...,...,...,...,...,...,...
92,147.98,150.00,80.200,88.56,178587974,2021-01-26
93,347.51,380.00,249.000,354.83,93396666,2021-01-27
94,193.60,483.00,112.250,265.00,58815805,2021-01-28
95,325.00,413.98,250.000,379.71,50566055,2021-01-29


In [4]:
# Pull in daily ('D') candles for VTSAX (Vanguard Total Stock Market Index Fund) from finnhub.
# The two integers are Unix-epoch second bounds for the request window:
# 1599886800 = 2020-09-12 05:00 UTC, 1612159200 = 2021-02-01 06:00 UTC.
data_VTSAX = finnhub_client.stock_candles('VTSAX', 'D', 1599886800, 1612159200)

# Finnhub reports a status field "s" ("ok" / "no_data"). Check it up front so an
# empty response fails with a readable message instead of the opaque
# "If using all scalar values, you must pass an index" error from pd.DataFrame.
if data_VTSAX.get("s") != "ok":
    raise ValueError(
        f"Finnhub returned no usable candle data for 'VTSAX' (status={data_VTSAX.get('s')!r})"
    )

# Convert to Pandas Dataframe
VTSAX_new = pd.DataFrame(data_VTSAX)

# Convert the epoch-second timestamps to datetimes and truncate them to the calendar day.
# .dt.normalize() yields the same midnight datetimes as the former
# .dt.date -> pd.to_datetime round-trip, without the intermediate object column.
VTSAX_series = pd.to_datetime(VTSAX_new["t"], unit="s").dt.normalize()

# Merge the data frame with the cleaned series on the row index.
# Both sides are named "t", so pandas suffixes them: t_x (raw epoch) and t_y (datetime).
VTSAX_df = pd.merge(VTSAX_new, VTSAX_series, left_index=True, right_index=True)

# Keep only the price/volume columns plus the converted date (drops "s" and the raw "t_x")
VTSAX_update = VTSAX_df[["c", "h", "l", "o", "v", "t_y"]]

# Give the column headings proper titles
VTSAX_final_df = VTSAX_update.rename(
    columns={
        "c": "Closing_price",
        "h": "High_price",
        "l": "Low_price",
        "o": "Opening_price",
        "v": "Market_volume",
        "t_y": "Date",
    }
)
VTSAX_final_df

Unnamed: 0,Closing_price,High_price,Low_price,Opening_price,Market_volume,Date
0,83.60,83.60,83.60,83.60,0,2020-09-14
1,84.08,84.08,84.08,84.08,0,2020-09-15
2,83.80,83.80,83.80,83.80,0,2020-09-16
3,83.09,83.09,83.09,83.09,0,2020-09-17
4,82.31,82.31,82.31,82.31,0,2020-09-18
...,...,...,...,...,...,...
91,98.04,98.04,98.04,98.04,0,2021-01-25
92,97.74,97.74,97.74,97.74,0,2021-01-26
93,95.21,95.21,95.21,95.21,0,2021-01-27
94,96.18,96.18,96.18,96.18,0,2021-01-28


In [5]:
# Pull in Amazon daily ('D') candles from finnhub.
# The two integers are Unix-epoch second bounds for the request window:
# 1599886800 = 2020-09-12 05:00 UTC, 1612159200 = 2021-02-01 06:00 UTC.
data_AMZN = finnhub_client.stock_candles('AMZN', 'D', 1599886800, 1612159200)

# Finnhub reports a status field "s" ("ok" / "no_data"). Check it up front so an
# empty response fails with a readable message instead of the opaque
# "If using all scalar values, you must pass an index" error from pd.DataFrame.
if data_AMZN.get("s") != "ok":
    raise ValueError(
        f"Finnhub returned no usable candle data for 'AMZN' (status={data_AMZN.get('s')!r})"
    )

# Convert to Pandas Dataframe
AMZN_new = pd.DataFrame(data_AMZN)

# Convert the epoch-second timestamps to datetimes truncated to the calendar day,
# so the Date column has the same shape as in the VTSAX / DOGE cells.
AMZN_series = pd.to_datetime(AMZN_new["t"], unit="s").dt.normalize()

# Merge the data frame with the cleaned series on the row index.
# Both sides are named "t", so pandas suffixes them: t_x (raw epoch) and t_y (datetime).
AMZN_df = pd.merge(AMZN_new, AMZN_series, left_index=True, right_index=True)

# Keep only the price/volume columns plus the converted date (drops "s" and the raw "t_x")
AMZN_update = AMZN_df[["c", "h", "l", "o", "v", "t_y"]]

# Give the column headings proper titles
AMZN_final_df = AMZN_update.rename(
    columns={
        "c": "Closing_price",
        "h": "High_price",
        "l": "Low_price",
        "o": "Opening_price",
        "v": "Market_volume",
        "t_y": "Date",
    }
)
AMZN_final_df

Unnamed: 0,Closing_price,High_price,Low_price,Opening_price,Market_volume,Date
0,3102.97,3187.390,3096.000,3172.9372,4529596,2020-09-14
1,3156.13,3175.020,3108.920,3136.1600,4021535,2020-09-15
2,3078.10,3187.238,3074.150,3179.9900,4531189,2020-09-16
3,3008.73,3029.432,2972.550,3009.2500,6449050,2020-09-17
4,2954.91,3037.800,2905.540,3031.7400,8892580,2020-09-18
...,...,...,...,...,...,...
92,3326.13,3338.000,3282.870,3296.3559,2955235,2021-01-26
93,3232.58,3346.520,3207.080,3341.4900,4660158,2021-01-27
94,3237.62,3301.680,3228.690,3235.0400,3149228,2021-01-28
95,3206.20,3236.990,3184.550,3230.0000,4293556,2021-01-29


In [6]:
# Pull in Dogecoin daily ('D') candles from finnhub.
# The two integers are Unix-epoch second bounds for the request window:
# 1599886800 = 2020-09-12 05:00 UTC, 1612159200 = 2021-02-01 06:00 UTC.
# NOTE(review): this goes through the stock-candle call with symbol 'DOGE-USD';
# confirm that is the intended source for a crypto asset.
data_DOGE = finnhub_client.stock_candles('DOGE-USD', 'D', 1599886800, 1612159200)

# Finnhub reports a status field "s" ("ok" / "no_data"). Check it up front so an
# empty response fails with a readable message instead of the opaque
# "If using all scalar values, you must pass an index" error from pd.DataFrame.
if data_DOGE.get("s") != "ok":
    raise ValueError(
        f"Finnhub returned no usable candle data for 'DOGE-USD' (status={data_DOGE.get('s')!r})"
    )

# Convert to Pandas Dataframe
DOGE_new = pd.DataFrame(data_DOGE)

# Convert the epoch-second timestamps to datetimes and truncate them to the calendar day.
# .dt.normalize() yields the same midnight datetimes as the former
# .dt.date -> pd.to_datetime round-trip, without the intermediate object column.
DOGE_series = pd.to_datetime(DOGE_new["t"], unit="s").dt.normalize()

# Merge the data frame with the cleaned series on the row index.
# Both sides are named "t", so pandas suffixes them: t_x (raw epoch) and t_y (datetime).
DOGE_df = pd.merge(DOGE_new, DOGE_series, left_index=True, right_index=True)

# Keep only the price/volume columns plus the converted date (drops "s" and the raw "t_x")
DOGE_update = DOGE_df[["c", "h", "l", "o", "v", "t_y"]]

# Give the column headings proper titles
DOGE_final_df = DOGE_update.rename(
    columns={
        "c": "Closing_price",
        "h": "High_price",
        "l": "Low_price",
        "o": "Opening_price",
        "v": "Market_volume",
        "t_y": "Date",
    }
)
DOGE_final_df

Unnamed: 0,Closing_price,High_price,Low_price,Opening_price,Market_volume,Date
0,0.0028,0.0028,0.0028,0.0028,117125272,2020-09-11
1,0.0028,0.0028,0.0027,0.0028,124253231,2020-09-12
2,0.0028,0.0028,0.0027,0.0028,128520945,2020-09-13
3,0.0028,0.0029,0.0028,0.0028,100194173,2020-09-14
4,0.0028,0.0029,0.0028,0.0028,148555504,2020-09-15
...,...,...,...,...,...,...
125,0.0094,0.0098,0.0085,0.0094,398965924,2021-01-15
126,0.0093,0.0095,0.0091,0.0094,258630479,2021-01-16
127,0.0091,0.0094,0.0088,0.0093,254487989,2021-01-17
128,0.0092,0.0093,0.0090,0.0091,205025648,2021-01-18


In [7]:
# Pull in AMC Entertainment daily ('D') candles from finnhub.
# The two integers are Unix-epoch second bounds for the request window:
# 1599886800 = 2020-09-12 05:00 UTC, 1612159200 = 2021-02-01 06:00 UTC.
data_AMC = finnhub_client.stock_candles('AMC', 'D', 1599886800, 1612159200)

# Finnhub reports a status field "s" ("ok" / "no_data"). Check it up front so an
# empty response fails with a readable message instead of the opaque
# "If using all scalar values, you must pass an index" error from pd.DataFrame.
if data_AMC.get("s") != "ok":
    raise ValueError(
        f"Finnhub returned no usable candle data for 'AMC' (status={data_AMC.get('s')!r})"
    )

# Convert to Pandas Dataframe
AMC_new = pd.DataFrame(data_AMC)

# Convert the epoch-second timestamps to datetimes truncated to the calendar day,
# so the Date column has the same shape as in the VTSAX / DOGE cells.
AMC_series = pd.to_datetime(AMC_new["t"], unit="s").dt.normalize()

# Merge the data frame with the cleaned series on the row index.
# Both sides are named "t", so pandas suffixes them: t_x (raw epoch) and t_y (datetime).
AMC_df = pd.merge(AMC_new, AMC_series, left_index=True, right_index=True)

# Keep only the price/volume columns plus the converted date (drops "s" and the raw "t_x")
AMC_update = AMC_df[["c", "h", "l", "o", "v", "t_y"]]

# Give the column headings proper titles
AMC_final_df = AMC_update.rename(
    columns={
        "c": "Closing_price",
        "h": "High_price",
        "l": "Low_price",
        "o": "Opening_price",
        "v": "Market_volume",
        "t_y": "Date",
    }
)
AMC_final_df

Unnamed: 0,Closing_price,High_price,Low_price,Opening_price,Market_volume,Date
0,5.54,5.8703,5.51,5.84,4638082,2020-09-14
1,5.52,5.8700,5.52,5.60,5099315,2020-09-15
2,5.76,5.8750,5.42,5.52,4390816,2020-09-16
3,5.72,5.7900,5.58,5.67,2619665,2020-09-17
4,5.67,5.7400,5.57,5.71,2428464,2020-09-18
...,...,...,...,...,...,...
92,4.96,5.1900,4.37,5.09,456850232,2021-01-26
93,19.90,20.3600,11.01,20.34,1253253552,2021-01-27
94,8.63,16.5000,6.51,11.98,591223928,2021-01-28
95,13.26,16.0000,11.60,14.31,602193320,2021-01-29


In [8]:
# Pull in Disney daily ('D') candles from finnhub.
# The two integers are Unix-epoch second bounds for the request window:
# 1599886800 = 2020-09-12 05:00 UTC, 1612159200 = 2021-02-01 06:00 UTC.
data_DIS = finnhub_client.stock_candles('DIS', 'D', 1599886800, 1612159200)

# Finnhub reports a status field "s" ("ok" / "no_data"). Check it up front so an
# empty response fails with a readable message instead of the opaque
# "If using all scalar values, you must pass an index" error from pd.DataFrame.
if data_DIS.get("s") != "ok":
    raise ValueError(
        f"Finnhub returned no usable candle data for 'DIS' (status={data_DIS.get('s')!r})"
    )

# Convert to Pandas Dataframe
DIS_new = pd.DataFrame(data_DIS)

# Convert the epoch-second timestamps to datetimes truncated to the calendar day,
# so the Date column has the same shape as in the VTSAX / DOGE cells.
DIS_series = pd.to_datetime(DIS_new["t"], unit="s").dt.normalize()

# Merge the data frame with the cleaned series on the row index.
# Both sides are named "t", so pandas suffixes them: t_x (raw epoch) and t_y (datetime).
DIS_df = pd.merge(DIS_new, DIS_series, left_index=True, right_index=True)

# Keep only the price/volume columns plus the converted date (drops "s" and the raw "t_x")
DIS_update = DIS_df[["c", "h", "l", "o", "v", "t_y"]]

# Give the column headings proper titles
DIS_final_df = DIS_update.rename(
    columns={
        "c": "Closing_price",
        "h": "High_price",
        "l": "Low_price",
        "o": "Opening_price",
        "v": "Market_volume",
        "t_y": "Date",
    }
)
DIS_final_df

Unnamed: 0,Closing_price,High_price,Low_price,Opening_price,Market_volume,Date
0,131.25,132.9300,130.9040,132.530,8391896,2020-09-14
1,131.24,132.5400,131.0050,131.460,7289104,2020-09-15
2,132.09,133.6200,131.4000,132.170,6605949,2020-09-16
3,130.22,131.7700,128.6000,130.550,6723544,2020-09-17
4,128.63,130.7900,128.1201,129.500,9916267,2020-09-18
...,...,...,...,...,...,...
92,169.56,173.3450,169.0500,173.000,7523199,2021-01-26
93,163.03,166.3400,160.5200,165.010,18239248,2021-01-27
94,171.88,172.8800,165.9500,166.170,14449099,2021-01-28
95,168.17,169.8100,165.7901,168.800,12848825,2021-01-29


In [9]:
# Start a visible (headless=False) Chrome session through splinter.
# The value returned by ChromeDriverManager().install() is handed to splinter
# as its `executable_path` keyword.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [/Users/krishna/.wdm/drivers/chromedriver/mac64/88.0.4324.96/chromedriver] found in cache


 


In [10]:
# Website URL to be scraped: the WSJ news-archive landing page (the `/years` index),
# which is the starting page for the navigation cells below
url = 'https://www.wsj.com/news/archive/years'

In [11]:
# Point the splinter-controlled browser at the archive landing page
browser.visit(url)

In [12]:
# Parse the HTML of the page currently loaded in the browser with BeautifulSoup.
# NOTE(review): `soup` is not referenced by any of the cells visible below — confirm it is still needed.
soup = bs(browser.html, 'html.parser')

In [13]:
# Create an empty list to hold news headlines.
# Each navigation cell below appends one headline string, so re-running this cell resets the collection.
Headline_List = []

In [14]:
# First headline: WSJ archive page for 2020-09-11.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2020/september")
Headline_0 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2020/09/11")
Headline_0_11 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx11 = browser.links.find_by_partial_text("Merck Covid-19 Vaccine")
headline_text = WSJEx11.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [15]:
# Second headline: WSJ archive page for 2020-09-22.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2020/september")
Headline_1 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2020/09/22")
Headline_1_9_22 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx9_22 = browser.links.find_by_partial_text("CDC Advisory Panel")
headline_text = WSJEx9_22.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [16]:
# Third headline: WSJ archive page for 2020-10-08.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2020/october")
Headline_2 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2020/10/08")
Headline_2_10_8 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx10_8 = browser.links.find_by_partial_text("Chip Company AMD")
headline_text = WSJEx10_8.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [17]:
# Fourth headline: WSJ archive page for 2020-11-27.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2020/november")
Headline_3 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2020/11/27")
Headline_3_11_27 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx11_27 = browser.links.find_by_partial_text("GM Plans")
headline_text = WSJEx11_27.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [18]:
# Fifth headline: WSJ archive page for 2020-12-09.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2020/december")
Headline_4 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2020/12/09")
Headline_4_12_9 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx12_9 = browser.links.find_by_partial_text("U.S. Supplies")
headline_text = WSJEx12_9.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [19]:
# Sixth headline: WSJ archive page for 2020-12-23.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2020/december")
Headline_5 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2020/12/23")
Headline_5_12_23 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx12_23 = browser.links.find_by_partial_text("NBA Postpones")
headline_text = WSJEx12_23.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [20]:
# Seventh headline: WSJ archive page for 2021-01-04.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/january")
Headline_6 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/01/04")
Headline_6_1_4 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx1_4 = browser.links.find_by_partial_text("Sarah Jessica Parker")
headline_text = WSJEx1_4.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [21]:
# Eighth headline: WSJ archive page for 2021-01-14.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/january")
Headline_7 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/01/14")
Headline_7_1_14 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx1_14 = browser.links.find_by_partial_text("Trump Orders Military")
headline_text = WSJEx1_14.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [22]:
# Ninth headline: WSJ archive page for 2021-01-25.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/january")
Headline_8 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/01/25")
Headline_8_1_25 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx1_25 = browser.links.find_by_partial_text("Citadel")
headline_text = WSJEx1_25.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [23]:
# Tenth headline: WSJ archive page for 2021-01-27.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/january")
Headline_9 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/01/27")
Headline_9_1_27 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx1_27 = browser.links.find_by_partial_text("Biden Re-Examining")
headline_text = WSJEx1_27.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [24]:
# Eleventh headline: WSJ archive page for 2021-01-28.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/january")
Headline_10 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/01/28")
Headline_10_1_28 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx1_28 = browser.links.find_by_partial_text("The Reddit")
headline_text = WSJEx1_28.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [25]:
# Twelfth headline: WSJ archive page for 2021-01-29.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/january")
Headline_11 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/01/29")
Headline_11_1_29 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx1_29 = browser.links.find_by_partial_text("Robinhood, Facing Ire")
headline_text = WSJEx1_29.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [26]:
# Thirteenth headline: WSJ archive page for 2021-02-02.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/february")
Headline_12 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/02/02")
Headline_12_2_2 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx2_2 = browser.links.find_by_partial_text("Hackers Lurked")
headline_text = WSJEx2_2.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [27]:
# Fourteenth headline: WSJ archive page for 2021-02-04.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/february")
Headline_13 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/02/04")
Headline_13_2_4 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
WSJEx2_4 = browser.links.find_by_partial_text("Rolling Stone")
headline_text = WSJEx2_4.text
Headline_List.append(headline_text)

# Step back twice in the browser history, undoing the two clicks above.
browser.back()
browser.back()

In [28]:
# Last headline: WSJ archive page for 2021-02-05.
# Step into the month index, then into that day's archive page.
month_link = browser.links.find_by_href("/news/archive/2021/february")
Headline_14 = month_link.click()
day_link = browser.links.find_by_href("/news/archive/2021/02/05")
Headline_14_2_5 = day_link.click()

# Locate the article link by a fragment of its title and store the link text.
# No history rewind here: this is the final navigation cell.
WSJEx2_5 = browser.links.find_by_partial_text("The Long and Short")
headline_text = WSJEx2_5.text
Headline_List.append(headline_text)

In [29]:
# Close the browser session when done with web scraping.
# After this call the navigation cells above cannot be re-run without re-creating `browser`.
browser.quit()

In [31]:
# View the list of headlines collected by the navigation cells (one entry per cell run)
Headline_List

['Merck Covid-19 Vaccine Begins Human Testing',
 'CDC Advisory Panel to Delay Vote on Initial Vaccine Rollout',
 'Chip Company AMD Pursues Xilinx for $30 Billion Tie-Up',
 'GM Plans to Seek Banking Charter',
 'U.S. Supplies of Covid-19 PPE Fall Short of Targets',
 'NBA Postpones First Game on Second Day of New Season',
 'Sarah Jessica Parker and Matthew Broderick Fetch $15 Million for New York Townhouse',
 'Trump Orders Military Shift to Spur Israeli-Arab Cooperation Against Iran',
 'Citadel, Point72 to Invest $2.75 Billion Into Melvin Capital Management',
 'Biden Re-Examining U.S. Arms Sales to Saudi Arabia, U.A.E.',
 'The Reddit Wolves of Wall Street',
 'Robinhood, Facing Ire on Many Fronts, Defends Its App to Regulators',
 'Hackers Lurked in SolarWinds Email System for at Least 9 Months',
 'Rolling Stone Launches Video Channel on Amazon’s Twitch',
 'The Long and Short of the GameStop Bubble']