In [1]:
import pandas as pd
from pathlib import Path

from mndot_bid_etl.transform.item_list import generate_item_list
from mndot_bid_etl.transform.abstract import read_abstract_csv
from mndot_bid_etl.transform.transform_bid import transform_bid_df

#### Load Data

In [2]:
# Resolve the item list CSV for each spec year (2016, 2018, 2020) and hand
# all three to generate_item_list to build the shared item list object.
item_list_csv_paths = [
    Path("../data/csv/item_list_2016.csv").resolve(),
    Path("../data/csv/item_list_2018.csv").resolve(),
    Path("../data/csv/item_list_2020.csv").resolve(),
]
item_list_2016_csv, item_list_2018_csv, item_list_2020_csv = item_list_csv_paths

item_list = generate_item_list(*item_list_csv_paths)

#### Analyze Spec Year of Contract Id 220002

In [3]:
# Resolve the abstract CSV for contract 220002 (2022 folder) and parse it
# with read_abstract_csv; later cells read contract_df, bid_df, letting_date
# and contract_id from the returned object.
abstract_220002_csv = Path("../data/csv/2022/220002.csv").resolve()
abstract_220002 = read_abstract_csv(abstract_220002_csv)

In [4]:
# Show the schema (columns, non-null counts, dtypes) of the contract-level frame.
abstract_220002.contract_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Letting Date     1 non-null      object
 1   Job Description  1 non-null      object
 2   Contract Id      1 non-null      object
 3   SP Number        1 non-null      object
 4   District         1 non-null      object
 5   County           1 non-null      object
dtypes: object(6)
memory usage: 176.0+ bytes


In [5]:
# Display the contract's letting date (presumably the input for deciding
# which spec year applies, per this section's heading -- TODO confirm).
abstract_220002.letting_date

datetime.date(2022, 1, 28)

In [6]:
# Run the abstract's bid frame through transform_bid_df for this contract id,
# then preview the first rows of the result.
bid_df = transform_bid_df(abstract_220002.bid_df, abstract_220002.contract_id)
bid_df.head()

Unnamed: 0,item_id,long_description,quantity,bidder_id,unit_price,contract_id
0,2011.601/01000,AS BUILT,1.0,engineers,1500000,220002
1,2021.501/00010,MOBILIZATION,1.0,engineers,25000000,220002
2,2051.501/00010,MAINT AND RESTORATION OF HAUL ROADS,1.0,engineers,100000,220002
3,2101.505/00020,CLEARING,1.4,engineers,800000,220002
4,2101.505/00030,GRUBBING,1.4,engineers,400000,220002


In [7]:
def generate_compound_ids(df: pd.DataFrame) -> pd.Series:
    """Build one compound id per row as ``<item_id>_<long_description>``.

    Args:
        df: Frame holding string-valued ``item_id`` and ``long_description``
            columns.

    Returns:
        A Series on ``df``'s index containing the two columns joined by an
        underscore.
    """
    item_ids = df["item_id"]
    descriptions = df["long_description"]
    # Series.add is the method form of "+", so strings are concatenated.
    return item_ids.add("_").add(descriptions)

In [8]:
# Attach the compound id column; assign() calls the function on the frame
# itself, so the ids are derived from bid_df's own columns.
bid_df_compound_id = bid_df.assign(compound_id=generate_compound_ids)
bid_df_compound_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   item_id           568 non-null    object 
 1   long_description  568 non-null    object 
 2   quantity          568 non-null    float64
 3   bidder_id         568 non-null    object 
 4   unit_price        568 non-null    int64  
 5   contract_id       568 non-null    object 
 6   compound_id       568 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 31.2+ KB


In [9]:
# Inspect the item list's compound-id matrix (its index and columns).
item_list.compound_id_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14426 entries, 2011.601/00003_CONSTRUCTION SURVEYING to 2582.503/80312_12" DOTTED LINE PREFORM THERMO GROUND IN CONT ESR
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  14426 non-null  bool 
 1   is_in_2018  14426 non-null  bool 
 2   is_in_2020  14426 non-null  bool 
dtypes: bool(3)
memory usage: 155.0+ KB


In [12]:
# This should succeed: a label lookup for a compound id whose description
# contains a comma. .loc raises KeyError if the label is missing from the index.
item_list.compound_id_matrix.loc["2360.509/14500_TYPE SP 9.5 WEARING COURSE MIXTURE (4,E)"]

is_in_2016    False
is_in_2018     True
is_in_2020     True
Name: 2360.509/14500_TYPE SP 9.5 WEARING COURSE MIXTURE (4,E), dtype: bool

In [13]:
# Dump the compound-id matrix to CSV for manual inspection, quoting fields
# with a single quote instead of the default double quote.
compound_id_matrix_dump = Path("../data/csv/compound_id_matrix_dump.csv").resolve()
# Pass the path rather than a text handle from open(..., "w"): pandas then
# manages newline translation and encoding itself. A text handle opened
# without newline="" yields doubled line endings on Windows and uses the
# locale's default encoding.
item_list.compound_id_matrix.to_csv(compound_id_matrix_dump, quotechar="'")

In [14]:
# Left-join the bids onto the compound-id matrix (keyed by its index) so every
# bid row keeps its place and gains the is_in_<year> columns.
bid_df_matrix = pd.merge(
    bid_df_compound_id,
    item_list.compound_id_matrix,
    how="left",
    left_on="compound_id",
    right_index=True,
)
bid_df_matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   item_id           568 non-null    object 
 1   long_description  568 non-null    object 
 2   quantity          568 non-null    float64
 3   bidder_id         568 non-null    object 
 4   unit_price        568 non-null    int64  
 5   contract_id       568 non-null    object 
 6   compound_id       568 non-null    object 
 7   is_in_2016        568 non-null    bool   
 8   is_in_2018        568 non-null    bool   
 9   is_in_2020        568 non-null    bool   
dtypes: bool(3), float64(1), int64(1), object(5)
memory usage: 32.9+ KB


In [15]:
# Spot-check a single merged row by position (row 49).
bid_df_matrix.iloc[49]

item_id                                                2360.509/14500
long_description             TYPE SP 9.5 WEARING COURSE MIXTURE (4,E)
quantity                                                       4585.0
bidder_id                                                   engineers
unit_price                                                       6919
contract_id                                                    220002
compound_id         2360.509/14500_TYPE SP 9.5 WEARING COURSE MIXT...
is_in_2016                                                      False
is_in_2018                                                       True
is_in_2020                                                       True
Name: 49, dtype: object

In [16]:
# Rows whose compound id found no match in the matrix carry NaN in the
# is_in_<year> columns after the left merge; list them (empty means all matched).
unmatched_mask = bid_df_matrix["is_in_2016"].isna()
bid_df_matrix.loc[unmatched_mask]

Unnamed: 0,item_id,long_description,quantity,bidder_id,unit_price,contract_id,compound_id,is_in_2016,is_in_2018,is_in_2020
