all_applicant_scraping_data

In [1]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Base URL for funding applications
BASE_URL = "https://dca.georgia.gov"

# Specific funding page
FUNDING_CYCLE_PAGE = f"{BASE_URL}/applications-funding-and-funding-cycle-selections"

# Define years and types of applications
YEARS = list(range(2010, 2025))  # 1996 to 2024
APPLICATION_TYPES = ["4-core", "9-core"]

# Directories for downloads and logs
BASE_DOWNLOAD_DIR = os.path.join(os.getcwd(), "data", "applications_submitted")
LOG_FILE = os.path.join(BASE_DOWNLOAD_DIR, "not_found_pages.txt")

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_experimental_option("prefs", {
    "safebrowsing.enabled": True
})

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)


def scrape_application_pages():
    """Scrapes application pages from 1996 to 2024 and downloads documents."""
    not_found_pages = []

    for year in YEARS:
        for app_type in APPLICATION_TYPES:
            page_url = f"{BASE_URL}/{year}-{app_type}-applications-submitted"

            print(f"Processing: {page_url}")

            try:
                driver.get(page_url)
                time.sleep(5)  # Allow time for page to load

                # Check if page exists
                if "Page not found" in driver.page_source or "The requested page could not be found" in driver.page_source:
                    print(f"Page not found: {page_url}")
                    not_found_pages.append(page_url)
                    continue

                # Find all downloadable document links
                document_links = [
                    link.get_attribute("href")
                    for link in driver.find_elements(By.TAG_NAME, "a")
                    if link.get_attribute("href") and (
                        ".xlsx" in link.get_attribute("href") or
                        ".pdf" in link.get_attribute("href") or
                        ".docx" in link.get_attribute("href") or
                        "download" in link.get_attribute("href")
                    )
                ]

                if not document_links:
                    print(f"No documents found on {page_url}")
                    not_found_pages.append(page_url)  # Log as empty page
                    continue

                print(f"Found {len(document_links)} documents on {page_url}")

                # Create directory only if files exist
                download_dir = os.path.join(BASE_DOWNLOAD_DIR, f"{year}_{app_type}")
                os.makedirs(download_dir, exist_ok=True)

                # Click each download link
                for doc_url in document_links:
                    click_download_link(doc_url)

            except Exception as e:
                print(f"Error processing {page_url}: {e}")

    # Save log of missing pages
    if not_found_pages:
        with open(LOG_FILE, "w", encoding="utf-8") as log:
            log.write("\n".join(not_found_pages))
        print(f"Missing pages recorded in {LOG_FILE}")

    driver.quit()


def scrape_funding_cycle_page():
    """Scrapes the 'Applications for Funding and Funding Cycle Selections' page."""
    print(f"Processing: {FUNDING_CYCLE_PAGE}")

    try:
        driver.get(FUNDING_CYCLE_PAGE)
        time.sleep(8)

        # Find all downloadable document links
        document_links = [
            link.get_attribute("href")
            for link in driver.find_elements(By.TAG_NAME, "a")
            if link.get_attribute("href") and (
                ".xlsx" in link.get_attribute("href") or
                ".pdf" in link.get_attribute("href") or
                ".docx" in link.get_attribute("href") or
                "download" in link.get_attribute("href")
            )
        ]

        if not document_links:
            print(f"No documents found on {FUNDING_CYCLE_PAGE}")
            return

        print(f"Found {len(document_links)} documents on {FUNDING_CYCLE_PAGE}")

        # Create directory only if files exist
        download_dir = os.path.join(BASE_DOWNLOAD_DIR, "funding_cycle_selections")
        os.makedirs(download_dir, exist_ok=True)

        # Click each download link
        for doc_url in document_links:
            click_download_link(doc_url)

    except Exception as e:
        print(f"Error processing {FUNDING_CYCLE_PAGE}: {e}")


def click_download_link(url):
    """Clicks on a download link to trigger file download."""
    try:
        print(f"Downloading: {url}")

        # Open each link in a new tab and trigger download
        driver.execute_script(f"window.open('{url}', '_blank');")
        time.sleep(5)  # Wait longer for download to start

    except Exception as e:
        print(f"Error downloading {url}: {e}")

    # Wait to ensure downloads finish before closing browser
    time.sleep(12)



Processing: https://dca.georgia.gov/2010-4-core-applications-submitted
No documents found on https://dca.georgia.gov/2010-4-core-applications-submitted
Processing: https://dca.georgia.gov/2010-9-core-applications-submitted
Found 72 documents on https://dca.georgia.gov/2010-9-core-applications-submitted
Downloading: https://dca.georgia.gov/document/forms/2010-001washingtonestatesiigoracorexls/download
Downloading: https://dca.georgia.gov/document/forms/2010-002ferrylakeestatesgoracorexls/download
Downloading: https://dca.georgia.gov/document/forms/2010-003brookhaveniiigoracorexls/download
Downloading: https://dca.georgia.gov/document/resources/2010-004gatewaypinesgoracorexls/download
Downloading: https://dca.georgia.gov/document/forms/2010-005walnutsquaregoracorexls/download
Downloading: https://dca.georgia.gov/document/forms/2010-006maplesquaregoracorexls/download
Downloading: https://dca.georgia.gov/document/forms/2010-007sanfordplacegoracorexls/download
Downloading: https://dca.georg

In [None]:

if __name__ == "__main__":
    scrape_application_pages()  # Scrape funding applications (4% & 9% cycles)
    scrape_funding_cycle_page()  # Scrape "Applications for Funding and Funding Cycle Selections"

Move files into correct directories

Continue to scrape funding_cycle_page

In [None]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Base URL for funding applications
BASE_URL = "https://dca.georgia.gov"

# Specific funding page
FUNDING_CYCLE_PAGE = f"{BASE_URL}/applications-funding-and-funding-cycle-selections"

# Directories for downloads and logs
BASE_DOWNLOAD_DIR = os.path.join(os.getcwd(), "data", "applications-funding-and-funding-cycle-selection")
LOG_FILE = os.path.join(BASE_DOWNLOAD_DIR, "not_found_pages.txt")

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_experimental_option("prefs", {
    "safebrowsing.enabled": True
})

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)



def scrape_funding_cycle_page():
    """Scrapes the 'Applications for Funding and Funding Cycle Selections' page."""
    print(f"Processing: {FUNDING_CYCLE_PAGE}")

    try:
        driver.get(FUNDING_CYCLE_PAGE)
        time.sleep(8)

        # Find all downloadable document links
        document_links = [
            link.get_attribute("href")
            for link in driver.find_elements(By.CLASS_NAME, "document-link__content"):
            if link.get_attribute("href") and (
                ".xlsx" in link.get_attribute("href") or
                ".pdf" in link.get_attribute("href") or
                ".docx" in link.get_attribute("href") or
                "download" in link.get_attribute("href")
            )
        ]

        if not document_links:
            print(f"No documents found on {FUNDING_CYCLE_PAGE}")
            return

        print(f"Found {len(document_links)} documents on {FUNDING_CYCLE_PAGE}")

        # Create directory only if files exist
        download_dir = os.path.join(BASE_DOWNLOAD_DIR, "funding_cycle_selections")
        os.makedirs(download_dir, exist_ok=True)

        # Click each download link
        for doc_url in document_links:
            click_download_link(doc_url)

    except Exception as e:
        print(f"Error processing {FUNDING_CYCLE_PAGE}: {e}")


def click_download_link(url):
    """Clicks on a download link to trigger file download."""
    try:
        print(f"Downloading: {url}")

        # Open each link in a new tab and trigger download
        driver.execute_script(f"window.open('{url}', '_blank');")
        time.sleep(5)  # Wait longer for download to start

    except Exception as e:
        print(f"Error downloading {url}: {e}")

    # Wait to ensure downloads finish before closing browser
    time.sleep(12)



In [None]:
scrape_funding_cycle_page()  # Scrape "Applications for Funding and Funding Cycle Selections"

### PastScraping-2023-4

In [None]:
https://dca.georgia.gov/2023-4-core-applications-submitted

In [1]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin


In [3]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urljoin

# Target webpage
BASE_URL = "https://dca.georgia.gov/2023-4-core-applications-submitted"

# Set up local directory to save files
DOWNLOAD_DIR = os.path.join(os.getcwd(), "data", "applications_submitted", "downloaded_files")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": DOWNLOAD_DIR,  # Set download directory
    "download.prompt_for_download": False,       # Auto-download files
    "safebrowsing.enabled": True
})

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the webpage
driver.get(BASE_URL)

# Wait for page elements to load
time.sleep(5)

# Find all document links using Selenium
document_links = []
for link in driver.find_elements(By.TAG_NAME, "a"):
    href = link.get_attribute("href")
    if href and (".xlsx" in href or ".pdf" in href or ".docx" in href or "download" in href):
        document_links.append(href)

if not document_links:
    print("No documents found.")
else:
    print(f"Found {len(document_links)} documents.")

# Click each download link
for doc_url in document_links:
    try:
        print(f"Downloading: {doc_url}")

        # Open each link in a new tab and trigger download
        driver.execute_script(f"window.open('{doc_url}', '_blank');")
        time.sleep(3)  # Wait for the download to start

    except Exception as e:
        print(f"Error downloading {doc_url}: {e}")

# Wait for downloads to complete
time.sleep(10)

# Close the browser
driver.quit()

print(f"All files saved to: {DOWNLOAD_DIR}")

Found 57 documents.
Downloading: https://dca.georgia.gov/document/forms/2023-501nthsidehills4pctcoreseptxlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-502martinhse4pctcoreseptxlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-503parishgrove4pctcorexlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-504oxford4pctcoreseptxlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-505the3504pctcoreseptxlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-506cnlcmns4pctcoreseptxlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-507civicctrr1sr4pctcorexlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-508autumnridgeappxlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-509columbusgrdns4pctcorexlsx/download
Downloading: https://dca.georgia.gov/document/forms/2023-510brdghwl4pctcoreseptxlsx/download
Downloading: https://dca.georgia.gov/document/f

In [2]:

# Target URL
BASE_URL = "https://dca.georgia.gov/2023-4-core-applications-submitted"

# Set up Chrome WebDriver options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the webpage
driver.get(BASE_URL)

# Wait for elements to load
time.sleep(5)

# Get page source and parse with BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Close Selenium browser
driver.quit()

# Find all document links
document_links = []
for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.endswith((".xlsx", ".xls", ".pdf", ".docx", ".csv")) or "download" in href:
        full_url = urljoin(BASE_URL, href)
        document_links.append(full_url)

if not document_links:
    print("No documents found.")
else:
    print(f"Found {len(document_links)} documents to download.")

# Create local directory for downloads
DOWNLOAD_DIR = os.path.join(os.getcwd(), "data", "applications_submitted", "downloaded_files")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Function to download files
def download_files(file_urls):
    for url in file_urls:
        try:
            filename = os.path.basename(url)
            file_path = os.path.join(DOWNLOAD_DIR, filename)

            print(f"Downloading: {filename}...")

            # Request file download
            response = requests.get(url, stream=True)
            response.raise_for_status()

            # Write to file
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            print(f"Downloaded successfully: {file_path}")

        except requests.RequestException as e:
            print(f"Failed to download {url}: {e}")

# Start downloading files
download_files(document_links)

Found 57 documents to download.
Downloading: download...
Downloaded successfully: /Users/sibilz/Desktop/000-MSQTM/QTM-24 fall courses /25spring /550_1 quantitative science project /LIHTC/data/applications_submitted/downloaded_files/download
Downloading: download...
Downloaded successfully: /Users/sibilz/Desktop/000-MSQTM/QTM-24 fall courses /25spring /550_1 quantitative science project /LIHTC/data/applications_submitted/downloaded_files/download
Downloading: download...
Downloaded successfully: /Users/sibilz/Desktop/000-MSQTM/QTM-24 fall courses /25spring /550_1 quantitative science project /LIHTC/data/applications_submitted/downloaded_files/download
Downloading: download...
Downloaded successfully: /Users/sibilz/Desktop/000-MSQTM/QTM-24 fall courses /25spring /550_1 quantitative science project /LIHTC/data/applications_submitted/downloaded_files/download
Downloading: download...
Downloaded successfully: /Users/sibilz/Desktop/000-MSQTM/QTM-24 fall courses /25spring /550_1 quantitative 

In [9]:
# 启动 WebDriver
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# 访问 IMDb 页面（替换为你的 IMDb 网址）
url = "https://dca.georgia.gov/2023-4-core-applications-submitted"
driver.get(url)

# 获取完整 HTML 源代码
page_source = driver.page_source

# 将 HTML 保存到本地文件
with open("2023_applications_page.html", "w", encoding="utf-8") as file:
    file.write(page_source)

print("IMDb 页面 HTML 已保存为 imdb_page.html")

IMDb 页面 HTML 已保存为 imdb_page.html
