In [1]:
import pandas as pd
from pathlib import Path

from mndot_bid_etl.transform.abstract import read_abstract_csv

#### Step 1.)
Read a CSV file into separate DataFrames (contract, bids, and bidders). Use `data/csv/2022/220002.csv` for testing.

In [2]:
# Resolve the sample abstract CSV to an absolute path (the relative path is
# interpreted against the notebook's working directory).
csv_file = Path("../data/csv/2022/220002.csv").resolve()
# Parse the CSV; the returned object exposes contract_df, bid_df and bidder_df,
# which the cells below inspect.
abstract = read_abstract_csv(csv_file)

In [3]:
# Contract-level metadata parsed from the abstract; inspect its columns and dtypes.
contract_df = abstract.contract_df
contract_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Letting Date     1 non-null      object
 1   Job Description  1 non-null      object
 2   Contract Id      1 non-null      object
 3   SP Number        1 non-null      object
 4   District         1 non-null      object
 5   County           1 non-null      object
dtypes: object(6)
memory usage: 176.0+ bytes


In [4]:
# Spot-check: contract id stored in the first row of contract_df.
contract_df.at[0, "Contract Id"]

'220002'

In [5]:
# Line-item bid table parsed from the abstract; inspect its columns and dtypes.
bid_df = abstract.bid_df
bid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ContractId                   142 non-null    object
 1   SectionDescription           142 non-null    object
 2   LineNumber                   142 non-null    object
 3   ItemNumber                   142 non-null    object
 4   ItemDescription              142 non-null    object
 5   UnitPrice                    142 non-null    object
 6   Quantity                     142 non-null    object
 7   UnitName                     142 non-null    object
 8   Engineers (Unit Price)       142 non-null    object
 9   Engineers (Extended Amount)  142 non-null    object
 10  0000198793 (Unit Price)      142 non-null    object
 11  0000198793 (Extension)       142 non-null    object
 12  0000210000 (Unit Price)      142 non-null    object
 13  0000210000 (Extension)       142 no

In [6]:
# Spot-check: contract id carried on the first bid row.
bid_df.at[0, "ContractId"]

'220002'

In [7]:
# Bidder table parsed from the abstract; inspect its columns and dtypes.
bidder_df = abstract.bidder_df
bidder_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Bidder Number  3 non-null      object
 1   Bidder Name    3 non-null      object
 2   Bidder Amount  3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes


In [8]:
# Spot-check: bidder number stored in the first row of bidder_df.
bidder_df.at[0, "Bidder Number"]

'0000198793'

#### Step 2.)
Transform the bid_df for serialization into initial bid objects.

The bid object should contain: `item_id`, `bidder_id`, `quantity`, `unit_price`

In [9]:
# Print the raw column labels. Printing the Index (rather than relying on
# .info()) exposes the exact label text, including the trailing spaces on the
# per-bidder "(Unit Price) " / "(Extension)  " columns.
bid_columns = bid_df.columns
print(bid_columns)

Index(['ContractId', 'SectionDescription', 'LineNumber', 'ItemNumber',
       'ItemDescription', 'UnitPrice', 'Quantity', 'UnitName',
       'Engineers (Unit Price)', 'Engineers (Extended Amount)',
       '0000198793 (Unit Price) ', '0000198793 (Extension)  ',
       '0000210000 (Unit Price) ', '0000210000 (Extension)  ',
       '0000207897 (Unit Price) ', '0000207897 (Extension)  '],
      dtype='object')


In [10]:
def get_column_names_containing_string(df: pd.DataFrame, search_strings: list[str]) -> list[str]:
    """Return the column names of ``df`` that contain any of ``search_strings``.

    Args:
        df: Frame whose column labels are searched.
        search_strings: Substrings to look for in each column label.

    Returns:
        Matching column names in the frame's column order. Each column appears
        at most once, even when several search strings match it. An empty
        ``search_strings`` yields an empty list.
    """
    # Search the frame that was passed in, not a notebook-level global, so the
    # helper works for any DataFrame.
    return [
        column_name
        for column_name in df.columns.to_list()
        if any(substring in column_name for substring in search_strings)
    ]


In [11]:
# Substrings identifying the columns that are not needed for the bid records.
bid_search_strings = [
    "ContractId",
    "SectionDescription",
    "LineNumber",
    "ItemDescription",
    # No space: matches the plain "UnitPrice" column but not the per-bidder
    # "... (Unit Price)" columns, which are kept.
    "UnitPrice",
    "UnitName",
    # Matches both "(Extended Amount)" and "(Extension)" columns.
    "Ext"
]

drop_columns = get_column_names_containing_string(bid_df, bid_search_strings)

# drop() returns a new frame; bid_df itself is left unchanged.
bid_df_dropped = bid_df.drop(columns=drop_columns)
bid_df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ItemNumber                142 non-null    object
 1   Quantity                  142 non-null    object
 2   Engineers (Unit Price)    142 non-null    object
 3   0000198793 (Unit Price)   142 non-null    object
 4   0000210000 (Unit Price)   142 non-null    object
 5   0000207897 (Unit Price)   142 non-null    object
dtypes: object(6)
memory usage: 6.8+ KB


In [12]:
def rename_bid_columns(column_name: str) -> str:
    """Translate a raw abstract column label into its bid-record field name.

    ``"ItemNumber"`` maps to ``"item_id"`` and ``"Quantity"`` to its lower-cased
    form. Any other label is reduced to the lower-cased text that precedes its
    first space, e.g. ``"Engineers (Unit Price)"`` -> ``"engineers"``.
    """
    if column_name == "ItemNumber":
        return "item_id"
    if column_name == "Quantity":
        return column_name.lower()
    # Everything before the first space (the whole label when there is none).
    leading_token = column_name.partition(" ")[0]
    return leading_token.lower()

    # if column_name == "ItemNumber":
    #     return "item_id"
    # elif column_name == "Quantity":
    #     return column_name.lower()
    # else:
    #     return column_name.split(" ")[0].lower()

# rename() calls rename_bid_columns on each column label and returns a new frame.
bid_df_renamed = bid_df_dropped.rename(columns=rename_bid_columns)
bid_df_renamed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   item_id     142 non-null    object
 1   quantity    142 non-null    object
 2   engineers   142 non-null    object
 3   0000198793  142 non-null    object
 4   0000210000  142 non-null    object
 5   0000207897  142 non-null    object
dtypes: object(6)
memory usage: 6.8+ KB


In [13]:
def format_item_id(id: str) -> str:
    """Insert a ``"."`` after the first four characters of an item number.

    Example: ``"2011601/01000"`` -> ``"2011.601/01000"``. Inputs shorter than
    four characters simply get the dot appended.
    """
    prefix, remainder = id[:4], id[4:]
    return ".".join((prefix, remainder))

def format_quantity(quantity: str) -> float:
    """Parse a quantity string from the abstract into a float.

    Surrounding whitespace is ignored and thousands separators (``","``) are
    removed, mirroring how ``format_price`` cleans its input, so a value such
    as ``" 1,400.5 "`` parses instead of raising.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    cleaned_str = quantity.strip().replace(",", "")
    return float(cleaned_str)

def format_price(price: str) -> int:
    """Convert a dollar-amount string into an integer number of cents.

    Surrounding whitespace, ``"$"`` signs and thousands separators (``","``)
    are removed before parsing.

    Args:
        price: Price text such as ``" $1,234.56 "``.

    Returns:
        The price in cents, rounded to the nearest cent.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    cleaned_str = price.strip().replace("$", "").replace(",", "")
    # Round rather than truncate: binary floats cannot represent most decimal
    # fractions exactly (19.99 * 100 == 1998.9999999999998), so int() alone
    # would drop a cent from such prices.
    return round(float(cleaned_str) * 100)


In [14]:
# Convert the raw string columns to typed values. Every column other than
# item_id and quantity holds unit prices (the engineer's estimate plus one
# column per bidder), so the price columns are derived from the frame instead
# of hard-coding the bidder numbers of one particular contract.
price_columns = [
    column_name
    for column_name in bid_df_renamed.columns
    if column_name not in ("item_id", "quantity")
]

# Work on a copy so bid_df_renamed stays unchanged and this cell can be re-run.
bid_df_formatted = bid_df_renamed.copy()
bid_df_formatted["item_id"] = bid_df_formatted["item_id"].apply(format_item_id)
bid_df_formatted["quantity"] = bid_df_formatted["quantity"].apply(format_quantity)
for price_column in price_columns:
    bid_df_formatted[price_column] = bid_df_formatted[price_column].apply(format_price)

bid_df_formatted.head()

Unnamed: 0,item_id,quantity,engineers,0000198793,0000210000,0000207897
0,2011.601/01000,1.0,1500000,980000,1200000,980000
1,2021.501/00010,1.0,25000000,20900000,31700000,21600000
2,2051.501/00010,1.0,100000,100,100,100
3,2101.505/00020,1.4,800000,350000,350000,800000
4,2101.505/00030,1.4,400000,370000,370000,550000


In [15]:
# Reshape from wide to long: item_id and quantity are kept as identifiers, and
# each remaining column becomes rows whose former column label is stored in
# bidder_id and whose value is stored in unit_price.
bid_df_melted = bid_df_formatted.melt(
    id_vars=["item_id", "quantity"], var_name="bidder_id", value_name="unit_price"
)
# Raise the pandas display limit to 500 rows. This is a session-wide option,
# so it affects every frame displayed after this cell runs.
pd.set_option("display.max_rows", 500)


In [16]:
# Check the row count and dtypes of the long-format frame.
bid_df_melted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   item_id     568 non-null    object 
 1   quantity    568 non-null    float64
 2   bidder_id   568 non-null    object 
 3   unit_price  568 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 17.9+ KB


In [17]:
# Read the identifiers from the parsed abstract instead of hard-coding the
# values of one sample file, so this cell works for other contracts too.
contract_id = contract_df.at[0, "Contract Id"]
# NOTE(review): assumes the first row of bidder_df is the winning bidder. That
# matches the previously hard-coded "0000198793" for this file; confirm it
# holds for every abstract.
winning_bidder_id = bidder_df.at[0, "Bidder Number"]

# assign() returns a new frame, leaving bid_df_melted unchanged.
bid_df_final = bid_df_melted.assign(
    contract_id=contract_id,
    is_winning_bid=bid_df_melted["bidder_id"] == winning_bidder_id,
)

bid_df_final.head()

Unnamed: 0,item_id,quantity,bidder_id,unit_price,contract_id,is_winning_bid
0,2011.601/01000,1.0,engineers,1500000,220002,False
1,2021.501/00010,1.0,engineers,25000000,220002,False
2,2051.501/00010,1.0,engineers,100000,220002,False
3,2101.505/00020,1.4,engineers,800000,220002,False
4,2101.505/00030,1.4,engineers,400000,220002,False


In [18]:
# orient="records" yields a list with one dict per row.
bid_dict = bid_df_final.to_dict(orient="records")

# Preview up to the first ten records; slicing avoids an IndexError when the
# frame has fewer than ten rows.
for record in bid_dict[:10]:
    print(record)


{'item_id': '2011.601/01000', 'quantity': 1.0, 'bidder_id': 'engineers', 'unit_price': 1500000, 'contract_id': '220002', 'is_winning_bid': False}
{'item_id': '2021.501/00010', 'quantity': 1.0, 'bidder_id': 'engineers', 'unit_price': 25000000, 'contract_id': '220002', 'is_winning_bid': False}
{'item_id': '2051.501/00010', 'quantity': 1.0, 'bidder_id': 'engineers', 'unit_price': 100000, 'contract_id': '220002', 'is_winning_bid': False}
{'item_id': '2101.505/00020', 'quantity': 1.4, 'bidder_id': 'engineers', 'unit_price': 800000, 'contract_id': '220002', 'is_winning_bid': False}
{'item_id': '2101.505/00030', 'quantity': 1.4, 'bidder_id': 'engineers', 'unit_price': 400000, 'contract_id': '220002', 'is_winning_bid': False}
{'item_id': '2101.524/00050', 'quantity': 4.0, 'bidder_id': 'engineers', 'unit_price': 12500, 'contract_id': '220002', 'is_winning_bid': False}
{'item_id': '2104.502/00120', 'quantity': 67.0, 'bidder_id': 'engineers', 'unit_price': 18641, 'contract_id': '220002', 'is_winn