In [1]:
import os
import time

import numpy as np
import pandas as pd
import polars as pl
from bs4 import BeautifulSoup
from selenium import webdriver

### Functions

In [None]:
def start_driver(url,add_cfa=False,options=None,current_driver=None):
    """
    Point a Chrome WebDriver at a page, reusing an existing driver when one is supplied.

    Args:
        url (str): Address to load. Used as a site-relative path when ``add_cfa`` is True.
        add_cfa (bool, optional): When True, "https://www.cfa.gov" is prefixed to ``url``. Defaults to False.
        options (webdriver.ChromeOptions, optional): Options applied only when a new driver is launched. Defaults to None.
        current_driver (WebDriver, optional): Driver to reuse; when None a new Chrome instance is started. Defaults to None.
    Returns:
        WebDriver: The reused or newly created driver, after it has loaded the page.
    """
    target = f"https://www.cfa.gov{url}" if add_cfa else url

    # Only launch a browser when the caller did not hand one in.
    browser = webdriver.Chrome(options=options) if current_driver is None else current_driver
    browser.get(target)
    return browser
    

def get_soup(driver, parser='html.parser'):
    """
    Parse the page currently loaded in ``driver`` into a BeautifulSoup tree.

    Args:
        driver (WebDriver): Driver whose ``page_source`` is parsed.
        parser (str, optional): Parser name handed to BeautifulSoup. Defaults to 'html.parser'.
    Returns:
        BeautifulSoup: Parsed document.
    """
    page_html = driver.page_source
    return BeautifulSoup(page_html, parser)

## Go to Individual Meeting Pages

In [2]:
# df.to_csv('meeting_info/meeting_links.csv', index=False)
# Read the saved meeting-link table back in and preview its first rows.
meeting_links_csv = '../Data_Raw/meeting_info/meeting_links.csv'
df = pd.read_csv(meeting_links_csv)
df.head()

Unnamed: 0,meeting_name_date,meeting_link,meeting_type,date
0,CFA Meeting — 21 November 2024,/records-research/record-cfa-actions/2024/11/c...,CFA Meeting,2024-11-21
1,OGB Meeting — 7 November 2024,/records-research/record-cfa-actions/2024/11/o...,OGB Meeting,2024-11-07
2,CFA Meeting — 17 October 2024,/records-research/record-cfa-actions/2024/10/c...,CFA Meeting,2024-10-17
3,OGB Meeting — 2 October 2024,/records-research/record-cfa-actions/2024/10/o...,OGB Meeting,2024-10-02
4,CFA Meeting — 19 September 2024,/records-research/record-cfa-actions/2024/09/c...,CFA Meeting,2024-09-19


### CFA MEETINGS

In [349]:
# Use the first row of the table as the worked example for a CFA meeting page.
cfa_meeting = df.loc[0, 'meeting_type'] == 'CFA Meeting'
url = df.loc[0, 'meeting_link']
print(f"CFA: {cfa_meeting}.\nLink: {url}")

CFA: True.
Link: /records-research/record-cfa-actions/2024/11/cfa-meeting


In [403]:
# Load the example meeting page. No current_driver is passed, so start_driver
# launches a new Chrome instance; the loaded HTML is then parsed into `soup`.
driver = start_driver(url, add_cfa=True)
soup = get_soup(driver)

#### Get Old Georgetown Appendix

In [404]:
# Walk down the page one container at a time until the Old Georgetown appendix link is reached.
_review_content = soup.find('div', {'class': 'field__items'}) # All Content After Submission and Reviews
_appendix_view = _review_content.find('div', {'class': 'view-content views__content'}) # Appendix
_og_appendix_field = _appendix_view.find(
    'div',
    {'class': 'views-field views-field-views-conditional-field-2'} # Old Georgetown Appendix
)
og_app_link = _og_appendix_field.find('a')['href']

og_app_link

'/records-research/record-cfa-actions/appendices/georgetown/45533'

In [405]:
# Reuse the already-open browser to load the Old Georgetown appendix page,
# then re-parse so `soup` reflects the appendix rather than the meeting page.
start_driver(og_app_link, add_cfa=True, current_driver=driver)
soup = get_soup(driver)

In [406]:
# Get all the projects in the Georgetown appendix
appendix = []
for project_row in soup.find_all('div',{'class': 'rule-line rule-line--above u-leader-padding views-row views__row'}): # Iterate through each project
    project_header = project_row.find('header',{'class':'node__header node__header--teaser-view'}) # Project Number and Link
    project_fields = project_row.find_all('div',{'class':'field__item'}) # Get information for project part (eg HPA Number, Location, etc.)
    # Header first, then every field item, in page order.
    appendix.append([project_header, *project_fields])

In [None]:
#Separate the links and text for some of the information
fixed_vals = []
for project_fields in appendix: # Iterate through each project
    record = []
    for field in project_fields:
        anchor = field.find('a')
        if not anchor:
            # Plain field: keep its text as a single value.
            record.append(field.text)
            continue
        # Linked field: store the link text and its target as two values.
        record.extend([anchor.text.strip(), anchor['href']])
    fixed_vals.append(record)

In [None]:
# Put all project information into a dataframe
# Column names are assigned by position (0-9) in the order the values were collected.
appendix_columns = [
    'project_number',
    'project_link',
    'hpa_number',
    'location',
    'property',
    'description',
    'review_type',
    'previous_review_number',
    'previous_review_link',
    'recommendation',
]
appendix_projects = pd.DataFrame(fixed_vals).rename(columns=dict(enumerate(appendix_columns)))

appendix_projects.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,OG 24-240,/records-research/project-search/og-24-240-1,HPA 24-305,"2728 O Street, NW\nUnited States",Residence,Rear addition,Concept,OG 24-240,/records-research/project-search/og-24-240-0,No objection to concept design for a rear two-...
1,OG 24-255,/records-research/project-search/og-24-255-2,HPA 24-320,"3306 O Street, NW\nUnited States",Residence,New accessory structure with living space and ...,Concept,OG 24-255,/records-research/project-search/og-24-255-1,No objection to concept design for a new acces...
2,OG 24-288,/records-research/project-search/og-24-288-3,HPA 24-365,"3700 O Street, NW\nUnited States",Georgetown University,Campus building signage and wayfinding,Concept,OG 24-288,/records-research/project-search/og-24-288-2,No objection to concept design for building si...
3,OG 24-293,/records-research/project-search/og-24-293-3,HPA 24-370,"3246 S Street, NW\nUnited States",Residence,"Third story addition, renovation, landscape al...",Concept,OG 24-293,/records-research/project-search/og-24-293-2,No objection to concept design for a third sto...
4,OG 24-303,/records-research/project-search/og-24-303-0,HPA 24-391,"1403 Wisconsin Avenue, NW\nUnited States",Commercial,Signage - CVS,Permit,OG 24-303,/records-research/project-search/og-24-303,Returned without action. Permit application fo...


#### Another Example CFA Meeting (Get Letters on Main Page)

On the page `'/records-research/record-cfa-actions/2017/09/cfa-meeting'`, the OGB projects are listed with their letters directly on the meeting page, alongside the CFA projects. They are not in the appendix.


In [210]:
# Draw one meeting link at random; no random_state is given, so the pick differs between runs.
df['meeting_link'].sample(n=1).iloc[0]

'/records-research/record-cfa-actions/2017/09/cfa-meeting'

In [None]:
# Load the September 2017 CFA meeting page in the existing browser and re-parse it into `soup`.
start_driver('/records-research/record-cfa-actions/2017/09/cfa-meeting', add_cfa=True, current_driver=driver)
soup = get_soup(driver)

In [413]:
#Get relevant information for the items on the letters
def get_letter_info(soup):
    """
    Collect the Old Georgetown ("OG") projects that are listed with letters on a meeting page.

    A row is kept when its bold project-number field starts with "OG". Rows without a
    project-number field are skipped instead of raising ``AttributeError``, and a missing
    owner heading, description paragraph or link target is recorded as ``None``.

    Args:
        soup (BeautifulSoup): Parsed meeting page.
    Returns:
        pd.DataFrame: One row per OG project with positional columns
            0 = owner heading text, 1 = project number / HPA text, 2 = description text,
            3 = list of ``[text, href]`` pairs for every anchor in the row.
            The frame is empty when no OG project is found.
    """
    row_class = 'u-trailer rule-line rule-line--below rule-line--heavy views-row views__row'
    number_class = 'field-content u-half-trailer u-text-bold'

    project_info = []
    for project in soup.find_all('div', {'class': row_class}):
        number_div = project.find('div', {'class': number_class})
        # Rows without a project number cannot be classified, so they are skipped.
        if number_div is None or not number_div.get_text(strip=True).startswith("OG"):
            continue

        heading = project.find('h3') # Project Customer
        paragraph = project.find('p') # Description of project
        links = [[anchor.text, anchor.get('href')] for anchor in project.find_all('a')]

        project_info.append([
            heading.text if heading is not None else None,
            number_div.text, # project number and HPA
            paragraph.text if paragraph is not None else None,
            links,
        ])
    return pd.DataFrame(project_info)

In [None]:
# Extract the OG letter rows from the page currently held in `soup` and preview them.
letters = get_letter_info(soup)
letters.head()

#### Find Sidebar to get Meeting Minutes

In [None]:
sidebar = soup.find('aside', {'class': 'l-page__sidebar l-page__sidebar--second'})
# Look inside every direct child of the sidebar for a div whose class mentions "related-minutes";
# the hit from the first child supplies the minutes link.
_minutes_hits = [
    child.find('div', class_=lambda css_class: "related-minutes" in css_class)
    for child in sidebar
]
meeting_minutes_link = _minutes_hits[0].find('a')['href']

#### **Functions**

In [3]:
def start_driver(url,add_cfa=False,options=None,current_driver=None):
    """
    Load ``url`` in a Chrome WebDriver and hand the driver back.

    Note: this re-defines the ``start_driver`` declared near the top of the notebook with the
    same signature and behaviour.

    Args:
        url (str): Address to open. Treated as a path on cfa.gov when ``add_cfa`` is True.
        add_cfa (bool, optional): Prefix ``url`` with "https://www.cfa.gov". Defaults to False.
        options (webdriver.ChromeOptions, optional): Options for a newly launched driver; unused when a driver is reused. Defaults to None.
        current_driver (WebDriver, optional): Existing driver to navigate instead of launching a new one. Defaults to None.
    Returns:
        WebDriver: The driver that loaded the page.
    """
    address = f"https://www.cfa.gov{url}" if add_cfa else url

    if current_driver is None:
        # Nothing to reuse: start a fresh Chrome session.
        current_driver = webdriver.Chrome(options=options)

    current_driver.get(address)
    return current_driver
    

def get_soup(driver, parser='html.parser'):
    """
    Build a BeautifulSoup tree from the HTML the driver currently has loaded.

    Note: this re-defines the ``get_soup`` declared near the top of the notebook.

    Args:
        driver (WebDriver): Driver whose ``page_source`` is parsed.
        parser (str, optional): BeautifulSoup parser name. Defaults to 'html.parser'.
    Returns:
        BeautifulSoup: Parsed page.
    """
    current_html = driver.page_source
    return BeautifulSoup(current_html, parser)

In [4]:
def _pair_labels_with_items(fields):
    """Map each field label to the field item that follows it, keyed by the normalised label text."""
    paired = {}
    pending_label = None
    for field in fields:
        classes = field.get('class') or []
        if 'field__label' in classes:
            pending_label = field.text.strip().replace(' ','_').lower()
        elif pending_label is not None:
            link = field.find('a')
            # Linked values keep both the target and the text; plain values keep the text only.
            paired[pending_label] = (link.get('href'), field.text) if link is not None else field.text
            pending_label = None
        # An item with no preceding label is ignored so it cannot shift the later label/value pairs.
    return paired


def get_appendix(url,add_cfa=True,current_driver=None):
    """
    Scrape the project teasers listed on an appendix or calendar page into a DataFrame.

    Labels and values are paired by their CSS class (``field__label`` followed by
    ``field__item``) rather than by position, so an unlabeled item does not corrupt the
    fields that come after it.

    Args:
        url (str): The page to load and extract projects from.
        add_cfa (bool, optional): If True, prepends "https://www.cfa.gov" to the URL. Defaults to True.
        current_driver (WebDriver, optional): Existing driver passed on to ``start_driver``. Defaults to None.
    Returns:
        pd.DataFrame: One row per project teaser with these columns:
            - project_number (str): Stripped text of the teaser header (only set when a header exists). [e.g.: "OG 21-001"]
            - project_link (str or None): ``href`` of the header link, or None when the header has no link.
            - one column per field label in the teaser, named by lower-casing the label and
              replacing spaces with underscores. The value is the field text, or a
              ``(href, text)`` tuple when the field contains a link.
            - meeting_link (str): The ``url`` argument exactly as it was passed in.
    """
    driver = start_driver(url, add_cfa=add_cfa, current_driver=current_driver)
    soup = get_soup(driver)

    projects = []
    # Find all project nodes
    for node in soup.select('div.rule-line.rule-line--above.u-leader-padding.views-row.views__row'):
        project = {}
        header = node.find('header', class_='node__header node__header--teaser-view')
        if header is not None:
            header_link = header.find('a')
            project['project_number'] = header.text.strip()
            project['project_link'] = header_link['href'] if header_link is not None else None

        project_information = node.find_all('div',{'class':['field__label','field__item']})
        project.update(_pair_labels_with_items(project_information))
        project['meeting_link'] = url

        projects.append(project)

    return pd.DataFrame(projects)

def get_meeting_minute_link(soup,url):
    """
    Extracts the link to the meeting minutes from the second sidebar of a meeting page.

    The whole sidebar is searched for the first ``div`` whose class mentions
    "related-minutes". Missing pieces (sidebar, block, link or ``href``) are handled
    explicitly instead of through a catch-all ``except``, so interrupts and unrelated
    errors are no longer swallowed.

    Args:
        soup (BeautifulSoup): A BeautifulSoup object containing the HTML content to parse.
        url (str): The URL of the page; stored next to the minutes link in the result.
    Returns:
        pd.DataFrame or None: A one-row DataFrame with positional columns 0 = ``url`` and
        1 = meeting minutes link, or None (after printing a message) when no link is found.
    """
    sidebar = soup.find('aside', {'class': 'l-page__sidebar l-page__sidebar--second'})

    minutes_block = None
    if sidebar is not None:
        # BeautifulSoup passes None to the filter for tags that have no class attribute.
        minutes_block = sidebar.find(
            'div',
            class_=lambda css_class: css_class is not None and "related-minutes" in css_class
        )

    anchor = minutes_block.find('a') if minutes_block is not None else None
    meeting_minutes_link = anchor.get('href') if anchor is not None else None

    if meeting_minutes_link is None:
        print("No meeting minutes found.")
        return None
    return pd.DataFrame([url,meeting_minutes_link]).T

#Get relevant information for the items on the letters
def get_og_letters(soup, url=None):
    """
    Extracts information about Georgetown projects that start with "OG" from the provided BeautifulSoup object.

    Rows without a project-number field are skipped rather than aborting the whole page,
    and only the number of matches is printed (not their HTML).

    Args:
        soup (BeautifulSoup): A BeautifulSoup object containing the HTML content to be parsed.
        url (str, optional): URL of the page being parsed; stored in the first column of every row.
            When omitted, a module-level variable named ``url`` is used if one exists
            (this mirrors the earlier behaviour of reading that global), otherwise None.
    Returns:
        pandas.DataFrame or None: None (after printing a message) when no OG project is found.
        Otherwise one row per project with positional columns:
            - 0: The page URL (see ``url``).
            - 1: The customer heading text, or None when the row has no ``h3``.
            - 2: The project number and HPA text.
            - 3: The description text, or None when the row has no ``p``.
            - 4: A list of ``[text, href]`` pairs for all anchor tags within the project
                 (the letter and the previous review, if available).
    """
    if url is None:
        # Legacy fallback for callers that relied on a notebook-level `url` variable.
        url = globals().get('url')

    row_class = 'u-trailer rule-line rule-line--below rule-line--heavy views-row views__row'
    number_class = 'field-content u-half-trailer u-text-bold'

    project_info = []
    for project in soup.find_all('div', {'class': row_class}):
        number_div = project.find('div', {'class': number_class})
        if number_div is None or not number_div.get_text(strip=True).startswith("OG"):
            continue

        heading = project.find('h3') # Project Customer
        paragraph = project.find('p') # Description of project
        links = [[anchor.text, anchor.get('href')] for anchor in project.find_all('a')]

        project_info.append([
            url,
            heading.text if heading is not None else None,
            number_div.text, # project number and HPA
            paragraph.text if paragraph is not None else None,
            links,
        ])

    if not project_info:
        print("No letters found.")
        return None

    print(f"Found {len(project_info)} Georgetown letter(s).")
    return pd.DataFrame(project_info)

def query_meeting_page(meeting_url,add_cfa=True,current_driver=None):
    """
    Collects the minutes link, the OG letters and the Old Georgetown appendix for one meeting page.

    Each of the three parts is gathered independently: a missing appendix link no longer
    raises, so the minutes link and letters of that meeting are still returned.

    Args:
        meeting_url (str): The URL of the CFA or OGB meeting page to query.
        add_cfa (bool, optional): If True, prepends "https://www.cfa.gov" to ``meeting_url``. Defaults to True.
        current_driver (WebDriver, optional): Existing driver passed on to ``start_driver``. Defaults to None.
    Returns:
        tuple: Three elements, each of which is None when the part was not found:
            - minute_link (pd.DataFrame): The page URL and meeting minutes link.
            - letters (pd.DataFrame): The extracted OG letter rows.
            - og_appendix (pd.DataFrame): The Old Georgetown Appendix projects.
    """
    driver = start_driver(meeting_url, add_cfa=add_cfa, current_driver=current_driver)
    soup = get_soup(driver)

    try:
        minute_link = get_meeting_minute_link(soup,meeting_url)
    except AttributeError as e:
        print(f"Minutes not found: {e}")
        minute_link = None

    try:
        letters = get_og_letters(soup, meeting_url)
    except AttributeError as e:
        print(f"Letters not found: {e}")
        letters = None

    og_appendix = None
    appendix_anchor = soup.select_one(
        'div.field__items div.view-content.views__content div.views-field-views-conditional-field-2 a'
    )
    # select_one returns None when nothing matches; check before reading the href.
    appendix_link = appendix_anchor.get('href') if appendix_anchor is not None else None
    if appendix_link is None:
        print("Appendix link not found.")
    else:
        try:
            # The appendix is always fetched relative to cfa.gov, independent of `add_cfa`.
            og_appendix = get_appendix(appendix_link, add_cfa=True, current_driver=driver)
        except AttributeError as e:
            print(f"Appendix link not found: {e}")

    return minute_link, letters, og_appendix
    

#### Get main page letters and meeting minute links, then old georgetown appendix

In [7]:
# 20 minutes 48 seconds
# Scrape every CFA meeting page: minutes link, OG letters on the page, and the Old Georgetown appendix.

# Directory to save results
output_dir = "../Data_Raw/meeting_info/cfa_meetings"
os.makedirs(output_dir, exist_ok=True)

# Initialize storage dictionaries (keyed by meeting URL)
minute_links_dict = {}
letters_dict = {}
appendices_dict = {}

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = start_driver("https://www.cfa.gov", options=options)

try:
    # Iterate through URLs
    for url in df.loc[df['meeting_type'] == 'CFA Meeting', 'meeting_link']:
        print(f"\nProcessing URL: {url}")
        try:
            # Call the query_meeting_page function
            minute_link, letters, appendix = query_meeting_page(url,current_driver=driver)

            # Save results to dictionaries
            if minute_link is not None:
                minute_links_dict[url] = minute_link

            if letters is not None:
                letters_dict[url] = letters

            if appendix is not None:
                appendices_dict[url] = appendix
            print("Successful.")
        except Exception as e:
            # Best effort: one failing meeting page must not stop the whole scrape.
            print(f"Error processing URL {url}: {e}")
finally:
    # Always shut the headless browser down, even when the scrape is interrupted.
    driver.quit()

# Convert dictionaries to DataFrames (optional, if you want a combined view).
# pd.concat raises ValueError on an empty collection, so fall back to an empty frame.
minute_links_df = (
    pd.concat(list(minute_links_dict.values()), keys=list(minute_links_dict.keys()))
    if minute_links_dict else pd.DataFrame()
)
letters_df = (
    pd.concat(list(letters_dict.values()), keys=list(letters_dict.keys()))
    if letters_dict else pd.DataFrame()
)
appendices_df = (
    pd.concat(list(appendices_dict.values()), keys=list(appendices_dict.keys()))
    if appendices_dict else pd.DataFrame()
)

# Save combined dataframes to CSV if needed
# NOTE(review): index=False drops the meeting-URL keys added by pd.concat(keys=...) from the
# written files - confirm that losing them (notably for the appendices) is intended.
minute_links_df.to_csv(os.path.join(output_dir, "all_minute_links.csv"), index=False)
letters_df.to_csv(os.path.join(output_dir, "all_letters.csv"), index=False)
appendices_df.to_csv(os.path.join(output_dir, "all_appendices.csv"), index=False)



Processing URL: /records-research/record-cfa-actions/2024/11/cfa-meeting
No meeting minutes found.
No letters found.
Successful.

Processing URL: /records-research/record-cfa-actions/2024/10/cfa-meeting
No letters found.
Successful.

Processing URL: /records-research/record-cfa-actions/2024/09/cfa-meeting
No letters found.
Successful.

Processing URL: /records-research/record-cfa-actions/2024/07/cfa-meeting
No letters found.
Successful.

Processing URL: /records-research/record-cfa-actions/2024/06/cfa-meeting
No letters found.
Successful.

Processing URL: /records-research/record-cfa-actions/2024/05/cfa-meeting
Found these georgetown letters: [<div class="u-trailer rule-line rule-line--below rule-line--heavy views-row views__row"><div class="views-field views-field-field-owner-term"><h3 class="field-content u-text-uppercase u-text-tertiary u-half-trailer">D.C. Department of Buildings</h3></div><span class="views-field views-field-title"><div class="field-content u-half-trailer u-text-

### OGB MEETINGS

In [8]:
# Row 1 of the table is used as the worked example for an OGB meeting page.
i = 1
ogb_meeting = df.loc[i, 'meeting_type'] == 'OGB Meeting'
url = df.loc[i, 'meeting_link']
print(f"OGB: {ogb_meeting}.\nLink: {url}")

#Additionally tested on i = 200 at url = /records-research/record-cfa-actions/2015/09/ogb-meeting

OGB: True.
Link: /records-research/record-cfa-actions/2024/11/ogb-meeting


In [9]:
# Load the example OGB meeting page. No current_driver is passed, so start_driver
# launches a new Chrome instance; the loaded HTML is then parsed into `soup`.
driver = start_driver(url, add_cfa=True)
soup = get_soup(driver)

#### Get links to consent, denial, other submissions calendars.

In [10]:
_review_content = soup.find('div', {'class': 'field__items'}) # All Content After Submission and Reviews
appendices = _review_content.find('div', {'class': 'view-content views__content'}) # Appendix

# Extract one link per conditional field; the three class suffixes are '', '-1' and '-2'.
decision_links = []
for suffix in ['', '-1', '-2']:
    calendar_field = appendices.find('div', {'class': f'views-field views-field-views-conditional-field{suffix}'})
    decision_links.append(calendar_field.find('a')['href'])
decision_links

['/records-research/record-cfa-actions/old-georgetown-board/consent-calendar/45542',
 '/records-research/record-cfa-actions/old-georgetown-board/denial-calendar/45542',
 '/records-research/record-cfa-actions/old-georgetown-board/other-submissions/45542']

#### Get letters information from the page.

In [5]:
def _anchor_with_string(project, label):
    """Return (href, text) of the anchor whose string is exactly ``label``, or None when there is none."""
    anchor = project.find('a', string=label)
    if anchor is None or anchor.get('href') is None:
        return None
    return anchor['href'], anchor.get_text()


def _first_anchor_after(block, label):
    """Return (href, text) of the first anchor in ``block``'s HTML that follows ``label``, or None."""
    html = str(block)
    start = html.find(label)
    if start == -1:
        # The label is in the visible text but not contiguous in the markup.
        return None
    anchor = BeautifulSoup(html[start:], 'html.parser').find('a')
    if anchor is None or anchor.get('href') is None:
        return None
    return anchor['href'], anchor.get_text()


def process_project(project):
    """
    Processes a project to extract the links for "CFA Action" and "Previous Review".

    The action field (``views-field-nothing-1``) is tried first; anything still missing is
    looked up in the two-column grid blocks by taking the first link that follows the label.
    The two labels are handled independently, so a failure to resolve one no longer
    prevents the other from being found, and missing elements yield None instead of raising.

    Args:
        project (BeautifulSoup object): A BeautifulSoup object representing the HTML content of a project.
    Returns:
        tuple: A tuple containing four elements:
            - cfa (str or None): The URL for "CFA Action" if found, otherwise None.
            - cfa_num (str or None): The text of the "CFA Action" link if found, otherwise None.
            - prev (str or None): The URL for "Previous Review" if found, otherwise None.
            - prev_num (str or None): The text of the "Previous Review" link if found, otherwise None.
    """
    cfa = cfa_num = prev = prev_num = None

    action_field = project.find('div', {'class': 'views-field views-field-nothing-1'})
    action = action_field.get_text() if action_field is not None else ""

    if "CFA Action" in action:
        found = _anchor_with_string(project, "CFA Action")
        if found is not None:
            cfa, cfa_num = found
    if "Previous Review" in action:
        found = _anchor_with_string(project, "Previous Review")
        if found is not None:
            prev, prev_num = found

    bottom = project.find_all('div', {'class': 'l-grid l-grid--equal-2 l-grid--condensed u-trailer'})

    for b in bottom:
        block_text = b.get_text()
        if "CFA Action" in block_text and not cfa:
            found = _first_anchor_after(b, "CFA Action")
            if found is not None:
                cfa, cfa_num = found
        if "Previous Review" in block_text and not prev:
            found = _first_anchor_after(b, "Previous Review")
            if found is not None:
                prev, prev_num = found

    return cfa, cfa_num, prev, prev_num

In [6]:
def og_project_reviews(soup):
    """
    Builds a table of the projects reviewed on an OGB meeting page.

    Args:
        soup (BeautifulSoup): Parsed meeting page.
    Returns:
        pd.DataFrame: One row per project row on the page. Columns are positional (0-6):
            - 0: Text of the bold project-number / HPA field.
            - 1: Text of the location / description field.
            - 2, 3: First and second value returned by ``process_project`` (CFA action URL and link text).
            - 4, 5: Third and fourth value returned by ``process_project`` (previous review URL and link text).
            - 6: List of ``href`` values of the PDF documents attached to the project.
    """
    row_class = 'u-trailer rule-line rule-line--below rule-line--heavy views-row views__row'
    pdf_class = 'file file--mime-application-pdf file--application-pdf icon icon--left icon--file file icon--mime-application-pdf icon--application-pdf'

    def _describe(project):
        # Same lookup order as the page layout: number, location, action links, documents.
        number_text = project.find('div',{'class':'field-content u-half-trailer u-text-bold'}).text
        location_text = project.find('div',{'class':'views-field views-field-nothing'}).text
        cfa_url, cfa_text, prev_url, prev_text = process_project(project)
        pdf_links = [span.find('a')['href'] for span in project.find_all('span', {'class': pdf_class})]
        return [number_text, location_text, cfa_url, cfa_text, prev_url, prev_text, pdf_links]

    rows = [_describe(project) for project in soup.find_all('div', {'class': row_class})]
    return pd.DataFrame(rows)

#### Go through the consent, denial, and other pages. Previous function *get_appendix(appendix_url, add_cfa=True)* will work for this.

Note: The most recent meetings do not have recommendations yet; their pages show presentation material from the project (such as slides) instead. Scrape historic meetings to get recommendations.

Be careful: OGB meeting pages can also carry an Old Georgetown appendix. For example, the page https://www.cfa.gov/records-research/record-cfa-actions/2019/12/ogb-meeting links to the appendix https://www.cfa.gov/records-research/record-cfa-actions/appendices/georgetown/41523

In [7]:
def _ogb_calendar_links(soup):
    """Collect (field text, href) for every calendar field on the page that has a link."""
    base = 'div.field__items div.view-content.views__content div.views-field-views-conditional-field'
    links = []
    for suffix in ['', '-1', '-2']:
        field = soup.select_one(f'{base}{suffix}')
        anchor = soup.select_one(f'{base}{suffix} a')
        # Skip calendars that are absent or have no link target instead of raising.
        if field is None or anchor is None or anchor.get('href') is None:
            continue
        links.append((field.text, anchor['href']))
    return links


def query_ogb_meeting_page(meeting_url,add_cfa=True,current_driver=None):
    """
    Extracts the project reviews and the consent / denial / other-submission calendars of an
    Old Georgetown Board (OGB) meeting page.

    A calendar that is missing from the page, or whose page cannot be scraped, is reported
    with a printed message and returned as None; it does not stop the other calendars.

    Args:
        meeting_url (str): The URL of the OGB meeting page to query.
        add_cfa (bool, optional): If True, prepends "https://www.cfa.gov" to the URL. Defaults to True.
        current_driver (WebDriver, optional): Existing driver passed on to ``start_driver``. Defaults to None.
    Returns:
        tuple: Four elements:
            - project_reviews (pd.DataFrame): The project reviews listed on the meeting page.
            - consents (pd.DataFrame or None): Projects on the "Consent Calendar".
            - denials (pd.DataFrame or None): Projects on the "Denial Calendar".
            - others (pd.DataFrame or None): Projects under "Other Submissions".
    """
    driver = start_driver(meeting_url, add_cfa=add_cfa, current_driver=current_driver)
    soup = get_soup(driver)

    #ADJUST THIS SO THAT IT CAN BE USED FOR WHEN THERE IS NO MEETING
    decision_links = _ogb_calendar_links(soup)

    # Parse the meeting page itself before the driver navigates away to the calendars.
    project_reviews = og_project_reviews(soup)

    calendars = []
    for label, noun in [
        ("Consent Calendar", "consents"),
        ("Denial Calendar", "denials"),
        ("Other Submissions", "other submissions"),
    ]:
        href = next((link for text, link in decision_links if label in text), None)
        table = None
        if href is None:
            print(f"No {noun} for {meeting_url}")
        else:
            try:
                table = get_appendix(href, add_cfa=True, current_driver=driver)
            except Exception as e:
                # Best effort per calendar; unlike a bare except this lets interrupts through.
                print(f"Could not read {noun} for {meeting_url}: {e}")
        calendars.append(table)

    consents, denials, others = calendars
    return project_reviews, consents, denials, others

#### Iterate Through to Get Projects, Approval Status

In [None]:
#11 minutes 56 seconds
# Scrape every OGB meeting page: project reviews plus the consent, denial and other-submission calendars.

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Directory to save results
# NOTE(review): the CFA scrape writes under "../Data_Raw/meeting_info/cfa_meetings", while this
# path is relative to the notebook folder - confirm which location is intended.
output_dir = "meeting_info/ogb_meetings"
os.makedirs(output_dir, exist_ok=True)
consent_dir = os.path.join(output_dir, "consents")
denial_dir = os.path.join(output_dir, "denials")
other_dir = os.path.join(output_dir, "others")

os.makedirs(consent_dir, exist_ok=True)
os.makedirs(denial_dir, exist_ok=True)
os.makedirs(other_dir, exist_ok=True)

# Initialize storage dictionaries (keyed by meeting URL)
reviews_dict = {}
consents_dict = {}
denials_dict = {}
others_dict = {}

driver = start_driver("https://www.cfa.gov", options=options)

try:
    # Iterate through URLs
    for url in df.loc[df['meeting_type'] == 'OGB Meeting', 'meeting_link']:
        print(f"\nProcessing URL: {url}")
        try:
            # Call the query page function
            project_reviews, consents, denials, others = query_ogb_meeting_page(url,current_driver=driver)

            # Save results to dictionaries
            if project_reviews is not None:
                reviews_dict[url] = project_reviews

            if consents is not None:
                consents_dict[url] = consents

            if denials is not None:
                denials_dict[url] = denials

            if others is not None:
                others_dict[url] = others

            print("Successful.")
        except Exception as e:
            # Best effort: one failing meeting page must not stop the whole scrape.
            print(f"Error processing URL {url}: {e}")
finally:
    # Always shut the headless browser down, even when the scrape is interrupted.
    driver.quit()

# Convert dictionaries to DataFrames (optional, if you want a combined view).
# pd.concat raises ValueError on an empty collection, so fall back to an empty frame.
reviews_df = (
    pd.concat(list(reviews_dict.values()), keys=list(reviews_dict.keys()))
    if reviews_dict else pd.DataFrame()
)
consents_df = (
    pd.concat(list(consents_dict.values()), keys=list(consents_dict.keys()))
    if consents_dict else pd.DataFrame()
)
denials_df = (
    pd.concat(list(denials_dict.values()), keys=list(denials_dict.keys()))
    if denials_dict else pd.DataFrame()
)
others_df = (
    pd.concat(list(others_dict.values()), keys=list(others_dict.keys()))
    if others_dict else pd.DataFrame()
)

# Save combined dataframes to CSV if needed
# NOTE(review): index=False drops the meeting-URL keys added by pd.concat(keys=...) from the
# written files - confirm that losing them is intended.
reviews_df.to_csv(os.path.join(output_dir, "all_reviews.csv"), index=False)
consents_df.to_csv(os.path.join(consent_dir, "all_consents.csv"), index=False)
denials_df.to_csv(os.path.join(denial_dir, "all_denials.csv"), index=False)
others_df.to_csv(os.path.join(other_dir, "all_others.csv"), index=False)


Processing URL: /records-research/record-cfa-actions/2024/11/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/10/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/09/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/07/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/06/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/05/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/04/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/03/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2024/02/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2023/12/ogb-meeting
No denials for /records-research/record-cfa-actions/2023/12/ogb-meeting
Successful.

Processing URL: /records-research/record-cfa-actions/2023/11/ogb-me

In [9]:
# Write the three calendar tables to their folders again (without the index).
for calendar_df, calendar_dir, calendar_file in [
    (consents_df, consent_dir, "all_consents.csv"),
    (denials_df, denial_dir, "all_denials.csv"),
    (others_df, other_dir, "all_others.csv"),
]:
    calendar_df.to_csv(os.path.join(calendar_dir, calendar_file), index=False)