In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import polars as pl
import time
import pandas as pd
import numpy as np

In [25]:
# Load the table of meeting links; the CSV's '0' and '1' columns are given
# descriptive names in the same expression so re-running the cell is idempotent.
df = pd.read_csv(
    '../Data_Raw/meeting_info/cfa_meetings/all_minute_links.csv'
).rename(columns={'0':'meeting_link', '1':'meeting_minutes_link'})
df.head()

Unnamed: 0,meeting_link,meeting_minutes_link
0,/records-research/record-cfa-actions/2024/10/c...,/records-research/record-cfa-actions/2024/10/c...
1,/records-research/record-cfa-actions/2024/09/c...,/records-research/record-cfa-actions/2024/09/c...
2,/records-research/record-cfa-actions/2024/07/c...,/records-research/record-cfa-actions/2024/07/c...
3,/records-research/record-cfa-actions/2024/06/c...,/records-research/record-cfa-actions/2024/06/c...
4,/records-research/record-cfa-actions/2024/05/c...,/records-research/record-cfa-actions/2024/05/c...


In [26]:
def start_driver(url,add_cfa=False,options=None,current_driver=None):
    """
    Navigate a Chrome WebDriver to ``url``, creating the driver if needed.

    Args:
        url (str): Address to load. Treated as a site-relative path when
            ``add_cfa`` is True.
        add_cfa (bool, optional): When True, "https://www.cfa.gov" is
            prepended to ``url`` before navigating. Defaults to False.
        options (webdriver.ChromeOptions, optional): Options passed to
            ``webdriver.Chrome`` when a new driver is created; unused when
            ``current_driver`` is supplied. Defaults to None.
        current_driver (WebDriver, optional): Existing driver to reuse
            instead of launching a new browser. Defaults to None.

    Returns:
        WebDriver: ``current_driver`` if one was given, otherwise the newly
        created Chrome driver; in both cases already pointed at the URL.
    """
    target_url = f"https://www.cfa.gov{url}" if add_cfa else url

    # Only launch a new browser when the caller did not hand one in.
    if current_driver is None:
        browser = webdriver.Chrome(options=options)
    else:
        browser = current_driver

    browser.get(target_url)
    return browser
    

def get_soup(driver, parser='html.parser'):
    """
    Parse the page currently loaded in ``driver`` into a BeautifulSoup object.

    Args:
        driver: Object exposing the current page's HTML as ``page_source``.
        parser (str, optional): Parser name handed to BeautifulSoup.
            Defaults to 'html.parser'.
    Returns:
        BeautifulSoup: Parsed representation of ``driver.page_source``.
    """
    page_html = driver.page_source
    return BeautifulSoup(page_html, parser)

In [36]:
driver = start_driver('https://www.cfa.gov')

# Attributes identifying the <div> whose text is stored as the meeting minutes.
content_div_attrs = {'class':'l-page__content l-page__content--middle'}

all_minutes = {}        # meeting_link -> text of the minutes page
missing_minutes = []    # meeting links whose minutes page had no content <div>
try:
    # Iterate the two columns together rather than indexing by row label,
    # so the loop does not depend on df having a default 0..n-1 index.
    for meeting_link, minutes_link in zip(df['meeting_link'], df['meeting_minutes_link']):
        # Reuse the single browser session instead of launching one per page.
        start_driver(minutes_link, add_cfa=True, current_driver=driver)
        content = get_soup(driver).find('div', content_div_attrs)

        # find() returns None when nothing matches; record the page and move
        # on instead of letting one bad page abort the whole scrape.
        if content is None:
            missing_minutes.append(meeting_link)
            continue

        all_minutes[meeting_link] = content.get_text()
finally:
    # Always shut the browser down, even if a page load raises.
    driver.quit()

if missing_minutes:
    print(f"No minutes content found for {len(missing_minutes)} page(s): {missing_minutes}")


In [37]:
import json

# Destination for the scraped minutes (meeting link -> page text).
minutes_json_path = "../Data_Raw/meeting_info/cfa_meetings/minutes.json"

# Serialise before opening the file so that a serialisation error cannot
# leave a truncated file behind.
json_data = json.dumps(all_minutes)

with open(minutes_json_path, "w") as out_file:
    out_file.write(json_data)