# Initial WSJ Data Exploration

It seems the WSJ data only goes back to Feb 5, 1971 (and is missing some dates). We'll want to find some other data that covers from 1918 - 1971 if possible or at least 1945 - 1971 (post-WWII).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
set_style("whitegrid")

In [2]:
price2 = pd.read_csv('data/WSJ_HistoricalPrices(Jan 1, 1981 - Oct 10, 2024).csv')
price1 = pd.read_csv('data/WSJ_HistoricalPrices(Feb 5, 1971 - Dec 31, 1980).csv')
dates = pd.read_csv('data/DaylightSavingsTimeChangeDates_1971-2024.csv')

In [3]:
price = pd.concat([price2, price1], axis=0)
price

Unnamed: 0,Date,Open,High,Low,Close
0,10/10/24,18200.62,18333.39,18154.18,18282.05
1,10/09/24,18179.22,18302.05,18133.02,18291.62
2,10/08/24,18017.93,18203.04,17989.70,18182.92
3,10/07/24,18080.12,18096.33,17900.03,17923.90
4,10/04/24,18130.42,18145.28,17952.30,18137.85
...,...,...,...,...,...
2495,02/11/71,101.45,101.45,101.45,101.45
2496,02/10/71,100.69,100.69,100.69,100.69
2497,02/09/71,100.76,100.76,100.76,100.76
2498,02/08/71,100.84,100.84,100.84,100.84


In [4]:
# dataframe in chronological order
price_chrono = price.iloc[::-1].reset_index(drop=True)
price_chrono

Unnamed: 0,Date,Open,High,Low,Close
0,02/05/71,100.00,100.00,100.00,100.00
1,02/08/71,100.84,100.84,100.84,100.84
2,02/09/71,100.76,100.76,100.76,100.76
3,02/10/71,100.69,100.69,100.69,100.69
4,02/11/71,101.45,101.45,101.45,101.45
...,...,...,...,...,...
13531,10/04/24,18130.42,18145.28,17952.30,18137.85
13532,10/07/24,18080.12,18096.33,17900.03,17923.90
13533,10/08/24,18017.93,18203.04,17989.70,18182.92
13534,10/09/24,18179.22,18302.05,18133.02,18291.62


In [86]:
dates

Unnamed: 0,Year,Day of Week,Date,Time before change,Time after change,Day of Week.1,Date.1,Time before change.1,Time after change.1
0,1971,Sunday,25 April,2:00,3:00,Sunday,31 October,2:00,1:00
1,1972,Sunday,30 April,2:00,3:00,Sunday,29 October,2:00,1:00
2,1973,Sunday,29 April,2:00,3:00,Sunday,28 October,2:00,1:00
3,1974,Sunday,6 January,2:00,3:00,Sunday,27 October,2:00,1:00
4,1975,Sunday,23 February,2:00,3:00,Sunday,26 October,2:00,1:00
5,1976,Sunday,25 April,2:00,3:00,Sunday,31 October,2:00,1:00
6,1977,Sunday,24 April,2:00,3:00,Sunday,30 October,2:00,1:00
7,1978,Sunday,30 April,2:00,3:00,Sunday,29 October,2:00,1:00
8,1979,Sunday,29 April,2:00,3:00,Sunday,28 October,2:00,1:00
9,1980,Sunday,27 April,2:00,3:00,Sunday,26 October,2:00,1:00


In [87]:
dst_dates = pd.to_datetime(dates.Date+" "+dates.Year.astype(str))

In [88]:
price['Date'] = pd.to_datetime(price.Date, format="%m/%d/%y")

In [89]:
dst_mon_spring = dst_dates+pd.DateOffset(1)
dst_fri_spring = dst_dates+pd.DateOffset(-2)

In [94]:
print(price[price.Date.isin(dst_mon_spring)])

            Date      Open      High       Low     Close
148   2024-03-11  16052.63  16085.94  15978.04  16019.27
398   2023-03-13  11041.46  11326.73  10982.80  11188.84
648   2022-03-14  12795.12  12918.01  12555.35  12581.22
900   2021-03-15  13323.47  13460.35  13272.50  13459.71
1156  2020-03-09   7957.93   8243.31   7943.16   7950.68
1407  2019-03-11   7442.56   7558.23   7442.40   7558.06
1657  2018-03-12   7581.04   7609.10   7563.44   7588.32
1908  2017-03-13   5863.48   5877.43   5860.84   5875.78
2159  2016-03-14   4733.39   4762.27   4731.50   4750.28
2415  2015-03-09   4936.08   4950.47   4920.82   4942.44
2666  2014-03-10   4332.62   4339.93   4307.84   4334.45
2917  2013-03-11   3237.74   3252.87   3233.67   3252.87
3166  2012-03-12   2989.05   2994.10   2973.65   2983.66
3417  2011-03-14   2695.66   2715.22   2682.09   2700.97
3669  2010-03-15   2361.92   2367.40   2345.99   2362.21
3925  2009-03-09   1281.98   1315.58   1265.62   1268.64
4176  2008-03-10   2211.95   22