In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
import math
import time
import os
import polars as pl
import concurrent.futures

def driversetup():
    options = webdriver.ChromeOptions()
    #run Selenium in headless mode
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    #overcome limited resource problems
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("lang=en")
    #open Browser in maximized mode
    options.add_argument("start-maximized")
    #disable infobars
    options.add_argument("disable-infobars")
    #disable extension
    options.add_argument("--disable-extensions")
#     options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
    return driver

BASE_URL = 'https://www.fpds.gov/ezsearch/search.do?indexName=awardfull&templateName=1.5.3&s=FPDS.GOV&q='
cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
df.head()

special_comp_params = ["Follow On to Competed Action",
                       "Not Available for Competition",
                       "Not Competed"]

In [4]:
import polars as pl
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize the drive

# Define the ranges and data
ranges = (0, 52962)

# Function to process a single row
def process_row(row,driver):
    contract_id = row[0]
    agency_id = row[3]
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    return 'Open' if extent not in special_comp_params else 'Restricted', bid_num

# Function to process a chunk of rows
def process_chunk(chunk, chunk_id):
    driver = driversetup()
    comp_type = []
    bids = []
    for row in tqdm(chunk, desc=f"Thread {chunk_id}", position=chunk_id):
        extent, bid_num = process_row(row, driver)
        comp_type.append(extent)
        bids.append(bid_num)
    return comp_type, bids

# Split the data into chunks for multithreading
num_threads = 10
chunk_size = len(df) // num_threads
chunks = [df[i * chunk_size : (i + 1) * chunk_size] for i in range(num_threads)]

# Ensure the last chunk captures the remaining data
if len(df) % num_threads != 0:
    chunks[-1] = df[(num_threads - 1) * chunk_size :]

# Use ThreadPoolExecutor to process chunks in parallel
comp_type = []
bids = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_chunk, chunk, i) for i, chunk in enumerate(chunks)]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Overall Progress"):
        chunk_comp_type, chunk_bids = future.result()
        comp_type.extend(chunk_comp_type)
        bids.extend(chunk_bids)

# # Combine results
# df_result = pl.DataFrame({
#     "Competition Type": comp_type,
#     "Bids": bids
# })

# # Display the result
# print(df_result)

Overall Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Thread 2:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 1:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 0:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 3:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 4:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 5:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 6:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 8:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 7:   0%|          | 0/5296 [00:00<?, ?it/s]

Thread 9:   0%|          | 0/5305 [00:00<?, ?it/s]

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="competitionInformationForDisplay"]"}
  (Session info: chrome=134.0.6998.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF78545FE45+26629]
	(No symbol) [0x00007FF7853C6010]
	(No symbol) [0x00007FF78525931A]
	(No symbol) [0x00007FF7852AF8E7]
	(No symbol) [0x00007FF7852AFB1C]
	(No symbol) [0x00007FF7853034A7]
	(No symbol) [0x00007FF7852D7AEF]
	(No symbol) [0x00007FF785300169]
	(No symbol) [0x00007FF7852D7883]
	(No symbol) [0x00007FF7852A0550]
	(No symbol) [0x00007FF7852A1803]
	GetHandleVerifier [0x00007FF7857B72BD+3529853]
	GetHandleVerifier [0x00007FF7857CDA22+3621858]
	GetHandleVerifier [0x00007FF7857C24D3+3575443]
	GetHandleVerifier [0x00007FF78552B77A+860474]
	(No symbol) [0x00007FF7853D088F]
	(No symbol) [0x00007FF7853CCBC4]
	(No symbol) [0x00007FF7853CCD66]
	(No symbol) [0x00007FF7853BC2C9]
	BaseThreadInitThunk [0x00007FFF4213E8D7+23]
	RtlUserThreadStart [0x00007FFF432FBF6C+44]


In [11]:
import concurrent.futures
from tqdm import tqdm
from selenium.webdriver.common.by import By
import polars as pl

# Helper function to process a row
def process_row(row, driver):
    contract_id = row[0]
    agency_id = row[3]
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        try:
            extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
        except:
            extent = 'Error'
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    return 'Open' if extent not in special_comp_params else 'Restricted', bid_num

# Function to process a chunk of rows
def process_rows(chunk, driver):
    comp_type = []
    bids = []
    for row in chunk:
        extent, bid_num = process_row(row, driver)
        comp_type.append(extent)
        bids.append(bid_num)
    return comp_type, bids

# Main function to setup concurrent processing
def concurrent_process(df, ranges, num_workers=8):
    comp_type = []
    bids = []

    # Create a pool of drivers
    drivers = [driversetup() for _ in range(num_workers)]

    # Create threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        
        chunk_size = 1000 
        for i in range(ranges[0], ranges[1], chunk_size):
            chunk = df[i:i+chunk_size]
            driver = drivers[i % num_workers]  # Assign a driver from the pool
            futures.append(executor.submit(process_rows, chunk, driver))
        
        # Gather results
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing rows"):
            chunk_comp_type, chunk_bids = future.result()
            comp_type.extend(chunk_comp_type)
            bids.extend(chunk_bids)
    
    # Close all drivers
    for driver in drivers:
        driver.quit()
    
    return comp_type, bids

# Example usage
ranges = (0, 52996)  # Now working with the larger range
comp_type, bids = concurrent_process(df, ranges)

Processing rows: 100%|█████████████████████████████████████████████████████████████████| 53/53 [11:19<00:00, 12.82s/it]


In [11]:
import concurrent.futures
from tqdm import tqdm
from selenium.webdriver.common.by import By
import polars as pl

# Helper function to process a row
def process_row(row, driver):
    contract_id = row[0]
    agency_id = row[3]
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        try:
            extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
        except:
            extent = 'Error'
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    return 'Open' if extent not in special_comp_params else 'Restricted', bid_num

# Function to process a chunk of rows
def process_rows(chunk, driver):
    comp_type = []
    bids = []
    for row in chunk:
        extent, bid_num = process_row(row, driver)
        comp_type.append(extent)
        bids.append(bid_num)
    return comp_type, bids

# Main function to setup concurrent processing
def concurrent_process(df, ranges, num_workers=8):
    comp_type = []
    bids = []

    # Create a pool of drivers
    drivers = [driversetup() for _ in range(num_workers)]

    # Create threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        
        chunk_size = 1000 
        for i in range(ranges[0], ranges[1], chunk_size):
            chunk = df[i:i+chunk_size]
            driver = drivers[i % num_workers]  # Assign a driver from the pool
            futures.append(executor.submit(process_rows, chunk, driver))
        
        # Gather results
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing rows"):
            chunk_comp_type, chunk_bids = future.result()
            comp_type.extend(chunk_comp_type)
            bids.extend(chunk_bids)
    
    # Close all drivers
    for driver in drivers:
        driver.quit()
    
    return comp_type, bids

# Example usage
ranges = (0, 52996)  # Now working with the larger range
comp_type, bids = concurrent_process(df, ranges)

Processing rows: 100%|█████████████████████████████████████████████████████████████████| 53/53 [11:19<00:00, 12.82s/it]


In [3]:
chunks

[shape: (5_296, 10)
 ┌────────────────┬────────┬─────────────┬────────────────┬───┬────────────────┬────────────┬───────────────┬───────────┐
 │ Contract ID    ┆ NAICS  ┆ Date Signed ┆ Contracting    ┆ … ┆ Business       ┆ Action     ┆ Modification  ┆ Total     │
 │ ---            ┆ ---    ┆ ---         ┆ Agency ID      ┆   ┆ Entity ID      ┆ Obligation ┆ Count         ┆ Modified  │
 │ str            ┆ i64    ┆ str         ┆ ---            ┆   ┆ ---            ┆ ($)        ┆ ---           ┆ Action    │
 │                ┆        ┆             ┆ str            ┆   ┆ str            ┆ ---        ┆ i64           ┆ Obligati… │
 │                ┆        ┆             ┆                ┆   ┆                ┆ i64        ┆               ┆ ---       │
 │                ┆        ┆             ┆                ┆   ┆                ┆            ┆               ┆ i64       │
 ╞════════════════╪════════╪═════════════╪════════════════╪═══╪════════════════╪════════════╪═══════════════╪═══════════╡
 │ G

In [14]:
pl.Series(comp_type)

"""Open"""
"""Open"""
"""Open"""
"""Open"""
"""Open"""
…
"""Open"""
"""Open"""
"""Open"""
"""Open"""
"""Open"""


In [12]:
len(comp_type)

530

In [15]:
###
# this works for just one thread
# 
###
driver = driversetup()
ranges = (0,2500)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)

Processing rows: 100%|███████████████████████████████████████████████████████████| 2500/2500 [1:52:09<00:00,  2.69s/it]


In [16]:
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

In [69]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (2500,5000)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

Processing rows: 100%|███████████████████████████████████████████████████████████| 2500/2500 [1:19:06<00:00,  1.90s/it]


In [17]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (5000,7500)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

Processing rows: 100%|███████████████████████████████████████████████████████████| 2500/2500 [1:40:29<00:00,  2.41s/it]


In [72]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (7500,10000)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

Processing rows: 100%|███████████████████████████████████████████████████████████| 2500/2500 [1:49:45<00:00,  2.63s/it]


In [71]:
URL

'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID=9700&PIID=W912DW07C0008&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'

In [3]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (10000,12500)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

Processing rows: 100%|███████████████████████████████████████████████████████████| 2500/2500 [1:31:39<00:00,  2.20s/it]


In [None]:
comp_type = []
bids = []

In [8]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (12500,15000)
# comp_type = []
# bids = []

for row in tqdm(df[ranges[0]+len(comp_type):ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0]-len(comp_type), desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        time.sleep(0.5)
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

Processing rows: 100%|███████████████████████████████████████████████████████████████| 525/525 [56:33<00:00,  6.46s/it]


In [None]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (50000,52970)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]+len(comp_type):ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0]-len(comp_type), desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        time.sleep(0.5)
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

In [None]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (15000,17500)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

In [None]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (17500,20000)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

In [None]:
###
# this works for just one thread
# 
###

cur_dir = os.path.dirname(os.getcwd())
naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))
driver = driversetup()

ranges = (50000,52970)
comp_type = []
bids = []

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.fpds.gov/ezsearch/jsp/viewLinkController.jsp?agencyID={agency_id}&PIID={contract_id}&modNumber=0&transactionNumber=0&idvAgencyID=&idvPIID=&actionSource=searchScreen&actionCode=&documentVersion=1.0&contractType=AWARD&docType=D'
    driver.get(URL)
    # get extent competed and bids
    try:
        extent = driver.find_element(By.CSS_SELECTOR, "#extentCompeted option:checked").text
    except:
        extent = driver.find_element(By.ID, "competitionInformationForDisplay").get_attribute("value")
    bid_num = driver.find_element(By.ID, "numberOfOffersReceived").get_attribute("value")
    # add to lists
    comp_type.append('Open' if extent not in special_comp_params else 'Restricted')
    bids.append(bid_num)
    
df = df[ranges[0]:ranges[1]]
merged_df = df.with_columns([
    pl.Series('Competition Type', comp_type),
    pl.Series('Bids', bids)
])

save_dir = os.path.join(os.getcwd(), f'{ranges[0]}_{ranges[1]}.csv')
merged_df.write_csv(save_dir)

### USA Spending Stuff

In [49]:
def driversetup():
    options = webdriver.ChromeOptions()
    #run Selenium in headless mode
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    #overcome limited resource problems
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("lang=en")
    #open Browser in maximized mode
    options.add_argument("start-maximized")
    #disable infobars
    options.add_argument("disable-infobars")
    #disable extension
    options.add_argument("--disable-extensions")
#     options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
    return driver

# URL = 'https://www.usaspending.gov/search'
# cur_dir = os.path.dirname(os.getcwd())
# naics_data_dir = os.path.join(cur_dir, 'NAICS Processed')
# df = pl.read_csv(os.path.join(naics_data_dir, '236220_second_proc.csv'))

driver = driversetup()

contract_id = 'GS11P14MAP0340'
agency_id = '4740'
agency_id = agency_id if agency_id != '2100' else '9700'
URL = f'https://www.usaspending.gov/award/CONT_AWD_{contract_id}_{agency_id}_-NONE-_-NONE-'
driver.get(URL)
time.sleep(2)
subawards = driver.find_element(By.XPATH, "/html/body/div/div/main/div/div[6]/div/div[2]/div[1]/div[3]/div/div/div/div[2]").text

In [68]:
driver = driversetup()
ranges = (0,2500)

contract_id = row['Contract ID']
agency_id = row['Contracting Agency ID']
agency_id = agency_id if agency_id != '2100' else '9700'
URL = f'https://www.usaspending.gov/award/CONT_AWD_{contract_id}_{agency_id}_-NONE-_-NONE-'
driver.get(URL)

for row in tqdm(df[ranges[0]:ranges[1]].iter_rows(named=True), total=ranges[1]-ranges[0], desc="Processing rows"):
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.usaspending.gov/award/CONT_AWD_{contract_id}_{agency_id}_-NONE-_-NONE-'
    driver.get(URL)
    time.sleep(3)
    try:
        subawards = driver.find_element(By.XPATH, "/html/body/div/div/main/div/div[6]/div/div[2]/div[1]/div[3]/div/div/div/div[2]").text
    except:
            time.sleep(10)
            subawards = driver.find_element(By.XPATH, "/html/body/div/div/main/div/div[5]/div/div[2]/div[1]/div[3]/div/div/div/div[2]").text
    if subawards != '0':
        print(contract_id)
        break

Processing rows:   2%|▉                                                            | 39/2500 [03:59<4:11:42,  6.14s/it]


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div/div/main/div/div[5]/div/div[2]/div[1]/div[3]/div/div/div/div[2]"}
  (Session info: chrome=134.0.6998.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF78545FE45+26629]
	(No symbol) [0x00007FF7853C6010]
	(No symbol) [0x00007FF78525931A]
	(No symbol) [0x00007FF7852AF8E7]
	(No symbol) [0x00007FF7852AFB1C]
	(No symbol) [0x00007FF7853034A7]
	(No symbol) [0x00007FF7852D7AEF]
	(No symbol) [0x00007FF785300169]
	(No symbol) [0x00007FF7852D7883]
	(No symbol) [0x00007FF7852A0550]
	(No symbol) [0x00007FF7852A1803]
	GetHandleVerifier [0x00007FF7857B72BD+3529853]
	GetHandleVerifier [0x00007FF7857CDA22+3621858]
	GetHandleVerifier [0x00007FF7857C24D3+3575443]
	GetHandleVerifier [0x00007FF78552B77A+860474]
	(No symbol) [0x00007FF7853D088F]
	(No symbol) [0x00007FF7853CCBC4]
	(No symbol) [0x00007FF7853CCD66]
	(No symbol) [0x00007FF7853BC2C9]
	BaseThreadInitThunk [0x00007FFF4213E8D7+23]
	RtlUserThreadStart [0x00007FFF432FBF6C+44]


In [66]:
URL

'https://www.usaspending.gov/award/CONT_AWD_IND0407CT66810_1406_-NONE-_-NONE-'

In [55]:
df

Contract ID,NAICS,Date Signed,Contracting Agency ID,PSC,Region,Business Entity ID,Action Obligation ($),Modification Count,Total Modified Action Obligation ($)
str,i64,str,str,str,str,str,i64,i64,i64
"""GS11P14MAP0340""",236220,"""2014-10-16""","""4740""","""Z2AA""","""Southeast""","""RWDWFG6WGRK9""",644325,1,28426
"""IND0407CT66810""",236220,"""2006-12-21""","""1406""","""Y199""","""West""","""U4K9M66MUHR9""",68450,0,0
"""W912LD06C0036""",236220,"""2006-09-25""","""2100""","""Y111""","""Northeast""","""EATDZJL6JFJ5""",12573650,15,556384
"""GS11P06ZGC0339""",236220,"""2006-10-17""","""4740""","""J039""","""Southeast""","""QKCFMFL4MKT6""",11943,1,1504
"""HHSD200200618928C""",236220,"""2006-08-30""","""7523""","""Y111""","""Midwest""","""TTHNY8N2PQR3""",192500,1,38349
…,…,…,…,…,…,…,…,…,…
"""W90U4210C4063""",236220,"""2010-05-05""","""2100""","""Y111""","""Southeast""","""LN9PU5M2YZN5""",271452,1,0
"""FA461013C0017""",236220,"""2013-06-17""","""5700""","""Z2JZ""","""Midwest""","""ZMS1ZNDY2XV6""",60893,0,0
"""697DCK18C00213""",236220,"""2018-05-03""","""6920""","""Y1BC""","""West""","""X9UNMCNK3A64""",347768,2,0
"""N4008006C1049""",236220,"""2006-08-02""","""1700""","""Z199""","""Southeast""","""CEHNA6XZU795""",256297,3,38835


In [None]:
    contract_id = row['Contract ID']
    agency_id = row['Contracting Agency ID']
    agency_id = agency_id if agency_id != '2100' else '9700'
    URL = f'https://www.usaspending.gov/award/CONT_AWD_{contract_id}_{agency_id}_-NONE-_-NONE-'

In [9]:
df.shape

(2500, 10)