In [99]:
import pandas as pd
import os

In [100]:
def process_app_records(file_path):
    """Load the 'APP' records of one pipe-delimited file into a labelled DataFrame.

    The file is parsed without a header row and every value is kept as a string.
    Only rows whose first field equals "APP" are retained; the surviving rows are
    re-indexed from 0 and the columns are given the 52 names listed below.

    Parameters
    ----------
    file_path : str or path-like
        Location of the '|'-delimited text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per 'APP' record, with the 52 named columns.

    Notes
    -----
    The column assignment expects the parsed frame to be exactly 52 columns
    wide; pandas rejects the assignment otherwise.
    """
    # The 52 field names of an APP record, in file order
    app_columns = [
        # Identification
        "Record_Type", "Applicant_FRN", "Auction_ID", "File_Number",
        "Applicant_Name", "First_Name", "Middle_Name", "Last_Name", "Suffix",
        "State_Or_Citizenship", "Applicant_Status", "Legal_Classification",
        "Reserved1", "Reserved2", "Reserved3", "Noncommercial_Status",
        # Address
        "Address1", "Address2", "City", "State", "Zip_Code", "Country",
        # Bidding options and credits
        "Bidding_Option", "Bidding_Credit", "New_Entrant_Credit",
        "Gross_Revenue_Lower", "Gross_Revenue_Upper",
        "Closed_Bidding_Eligibility", "Bidding_Credit_Percentage",
        # Certifier
        "Certifier_First_Name", "Certifier_Middle_Initial",
        "Certifier_Last_Name", "Certifier_Suffix", "Certifier_Title",
        "Prior_Defaulter",
        # Financials
        "Financial_Statement_Type", "Gross_Revenue_Most_Recent",
        "Recent_Year_End", "Gross_Revenue_One_Year_Prior",
        "One_Year_Prior_End", "Gross_Revenue_Two_Years_Prior",
        "Two_Years_Prior_End", "Total_Assets",
        # Aggregates
        "Aggregate_Gross_Revenues", "Aggregate_Gross_Revenues_1",
        "Aggregate_Gross_Revenues_2", "Aggregate_Total_Assets",
        "Aggregate_Total_Assets_1", "Aggregate_Total_Assets_2",
        "Financial_Statement", "Aggregate_Financial_Statement",
        "Aggregate_Credits",
    ]

    # Headerless, '|'-separated, everything read as text
    raw_records = pd.read_csv(file_path, sep="|", header=None, dtype=str)

    # Keep only the rows whose record-type field (column 0) is 'APP'
    is_app_row = raw_records[0].eq("APP")
    app_records = raw_records.loc[is_app_row].reset_index(drop=True)

    # Label the columns
    app_records.columns = app_columns
    return app_records

In [101]:
# Directory holding the pipe-delimited .txt files
folder_path = "/Users/Enzo/Desktop/JMP/form175_data"

# Parse each .txt file in the directory into its own frame of APP records.
# NOTE(review): os.listdir() returns entries in arbitrary order, so the row order
# of final_dataframe depends on the filesystem — confirm nothing downstream
# relies on a particular order.
all_dataframes = []
for file_name in os.listdir(folder_path):
    if not file_name.endswith(".txt"):
        continue  # skip anything that is not a .txt file
    file_path = os.path.join(folder_path, file_name)
    all_dataframes.append(process_app_records(file_path))

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
final_dataframe = pd.concat(all_dataframes, ignore_index=True)

In [102]:
### CLEANING

# Remove exact duplicate rows; copy so the column added below lands on an
# independent frame
bidder_data_form175 = final_dataframe.drop_duplicates().copy()

path_to_data = '/Users/Enzo/Desktop/JMP/114402-V1/Replication-Fox-and-Bajari/data/'
bidder_data = pd.read_csv(path_to_data + 'biddercblk_03_28_2004_pln.csv')

# Rows of bidder_data whose 'co_name' has no exact match among the applicant names
missing_rows_bidder_data = bidder_data[~bidder_data['co_name'].isin(bidder_data_form175['Applicant_Name'])]

# Rows of the applicant table whose 'Applicant_Name' has no exact match in
# bidder_data['co_name']. The mask is computed once and reused below.
unmatched_mask = ~bidder_data_form175['Applicant_Name'].isin(bidder_data['co_name'])
missing_rows_form_175 = bidder_data_form175[unmatched_mask]

# 'co_name' starts out as the applicant name and is overridden by hand for the
# unmatched rows
bidder_data_form175['co_name'] = bidder_data_form175['Applicant_Name']

# Index labels of ALL unmatched rows, in frame order
index_to_update = bidder_data_form175[unmatched_mask].index

# Hand-picked replacement names: the i-th unmatched row receives the i-th entry.
# NOTE(review): this pairing is purely positional, so it depends on the row order
# of final_dataframe — verify the pairing whenever the input files change.
manual_co_names = [
    'Betty A. Gleaton',
    'HARVEY LEONG',
    'William Ingram',
    'ELIZABETH R. GUEST',
    'Mark M. Guest',
    'Vincent  D. McBride',
    'ADILIA M. AGUILAR',
    'Shawn Capistrano',
    'GLENN ISHIHARA',
    'Harold L. Sudbury, Jr.',
]

# Fail with an explicit message instead of an opaque out-of-bounds error when
# there are fewer unmatched rows than manual corrections
if len(index_to_update) < len(manual_co_names):
    raise IndexError(
        f"Expected at least {len(manual_co_names)} unmatched applicant rows, "
        f"found {len(index_to_update)}"
    )

# Only the first len(manual_co_names) unmatched rows are overridden; any further
# unmatched rows keep their Applicant_Name as co_name
bidder_data_form175.loc[index_to_update[:len(manual_co_names)], 'co_name'] = manual_co_names


In [103]:
import os

# Destination CSV, placed inside folder_path
output_file = os.path.join(folder_path, "bidder_data_form175.csv")

# Write the table only when no file is present at that path; an existing
# file is left untouched
if os.path.exists(output_file):
    print(f"File already exists: {output_file}")
else:
    bidder_data_form175.to_csv(output_file, index=False)
    print(f"File saved at: {output_file}")


File already exists: /Users/Enzo/Desktop/JMP/form175_data/bidder_data_form175.csv
