In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_dow_jones_historical_data():
    """Scrape the ^DJI historical price table from Yahoo Finance.

    Downloads the history page, locates the ``<table>`` whose ``data-test``
    attribute is ``historical-prices`` and converts its body rows into a
    DataFrame.

    Returns:
        pandas.DataFrame: columns ``Date`` (parsed with ``pd.to_datetime``)
        and ``Open``, ``High``, ``Low``, ``Close`` (parsed with
        ``pd.to_numeric`` after stripping thousands separators), one row per
        usable table row, in page order. Empty (but with all five columns)
        when the table has no usable rows.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the expected table is missing from the page, or a
            price cell cannot be parsed as a number.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    url = "https://finance.yahoo.com/quote/%5EDJI/history?p=%5EDJI"
    response = requests.get(url, headers=headers, timeout=100)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'data-test': 'historical-prices'})
    if table is None or table.tbody is None:
        raise ValueError(
            "historical-prices table not found in the Yahoo Finance response"
        )

    # Output column -> position of its <td> within a table row.
    # NOTE(review): 'Close' is taken from cell 5 while cell 4 is skipped;
    # confirm against the page's column layout that this is the intended one.
    cell_index = {'Date': 0, 'Open': 1, 'High': 2, 'Low': 3, 'Close': 5}
    columns = list(cell_index)
    min_cells = max(cell_index.values()) + 1

    data = []
    for row in table.tbody.find_all('tr'):
        cells = row.find_all('td')
        # Rows with too few cells (e.g. annotation rows spanning several
        # columns) carry no prices; skip them rather than raise IndexError.
        if len(cells) < min_cells:
            continue
        data.append({name: cells[idx].text for name, idx in cell_index.items()})

    # Passing the column names keeps the frame well-formed when no rows matched.
    df = pd.DataFrame(data, columns=columns)
    df['Date'] = pd.to_datetime(df['Date'])
    for name in columns[1:]:
        # Literal (non-regex) removal of the thousands separator.
        df[name] = pd.to_numeric(df[name].str.replace(',', '', regex=False))

    return df

# Fetch the table once, echo it for inspection, then persist it to disk
# without the positional index column.
dow_jones_data = scrape_dow_jones_historical_data()
print(dow_jones_data)
dow_jones_data.to_csv(path_or_buf="sample_data.csv", index=False)

         Date      Open      High       Low     Close
0  2023-06-22  33900.47  34003.56  33835.39  33885.82
1  2023-06-21  33990.56  34097.93  33876.17  33951.52
2  2023-06-20  34206.66  34206.66  33915.93  34053.87
3  2023-06-16  34464.02  34588.68  34285.69  34299.12
4  2023-06-15  33945.98  34488.98  33945.98  34408.06
..        ...       ...       ...       ...       ...
95 2023-02-03  33926.30  34179.58  33813.86  33926.01
96 2023-02-02  34129.30  34145.14  33814.78  34053.94
97 2023-02-01  34039.60  34334.70  33581.42  34092.96
98 2023-01-31  33803.56  34095.23  33664.91  34086.04
99 2023-01-30  33909.21  34055.29  33695.18  33717.09

[100 rows x 5 columns]
