In [30]:
import sys
sys.path.append("..")

from pathlib import Path
import pandas as pd

from mndot_bid_etl.reader.abstract import create_abstract_data_from_csv
from mndot_bid_etl.reader.item import create_item_data_from_csv
from mndot_bid_etl.transform.functional.transform_bid import transform_bid_df
from mndot_bid_etl.transform.functional.transform_bidder import transform_bidder_df
from mndot_bid_etl.transform.functional.transform_contract import transform_contract_df

In [31]:
# All input CSVs live under one directory; each path is resolved to an
# absolute path before it is handed to the reader.
csv_dir = Path("../data/csv")

abstract_data = create_abstract_data_from_csv((csv_dir / "2021" / "210022.csv").resolve())

# One item list per spec year, loaded in the order 2020, 2018, 2016.
item_data_2020, item_data_2018, item_data_2016 = (
    create_item_data_from_csv((csv_dir / f"item_list_{year}.csv").resolve())
    for year in (2020, 2018, 2016)
)

In [32]:
from mndot_bid_etl.transform import item

# Apply the same item transform to each year's item list frame.
df_2020, df_2018, df_2016 = (
    item.transform_item(item_data.df)
    for item_data in (item_data_2020, item_data_2018, item_data_2016)
)

In [33]:
from mndot_bid_etl.transform import spec_year

# Build the spec-year matrix from the three transformed item lists. In the
# recorded output it is indexed by merged_id with one True/False column per
# year (2016, 2018, 2020).
matrix = spec_year.create_spec_year_matrix([df_2016, df_2018, df_2020])
matrix.head()

Unnamed: 0_level_0,2016,2018,2020
merged_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011_601_00003_CONSTRUCTION SURVEYING,True,True,True
2011_601_00010_VIBRATION MONITORING,True,True,True
2011_601_00015_SCOUR MONITORING,True,True,True
2011_601_00020_REVISED BRIDGE PLANS,True,True,True
2011_601_00040_DESIGN,True,False,False


In [34]:
# Short alias for the bid table of the abstract loaded above (same object, not a copy).
df = abstract_data.bid_df

In [35]:
# Preview the first rows of the bid table to see the ItemNumber / ItemDescription layout.
df.head()

Unnamed: 0,ContractId,SectionDescription,LineNumber,ItemNumber,ItemDescription,UnitPrice,Quantity,UnitName,Engineers (Unit Price),Engineers (Extended Amount),0000203941 (Unit Price),0000203941 (Extension),0000211900 (Unit Price),0000211900 (Extension)
0,210022,0001 - SP 8825-837 (90% NHPP/ 10% STATE),5,2011601/01000,AS BUILT,7500.0,1,LS,7500.0,7500.0,"$8,000.00","$8,000.00","$8,750.00","$8,750.00"
1,210022,0001 - SP 8825-837 (90% NHPP/ 10% STATE),10,2021501/00010,MOBILIZATION,45000.0,1,LS,45000.0,45000.0,"$50,000.00","$50,000.00","$75,000.00","$75,000.00"
2,210022,0001 - SP 8825-837 (90% NHPP/ 10% STATE),15,2102503/00010,PAVEMENT MARKING REMOVAL,0.84833,7000,L F,0.84833,5938.31,$1.50,"$10,500.00",$1.00,"$7,000.00"
3,210022,0001 - SP 8825-837 (90% NHPP/ 10% STATE),20,2104502/01240,REMOVE SIGN TYPE C,61.31021,5,EACH,61.31021,306.55105,$85.00,$425.00,$75.00,$375.00
4,210022,0001 - SP 8825-837 (90% NHPP/ 10% STATE),25,2104502/01270,REMOVE SIGN TYPE EO,200.0,1,EACH,200.0,200.0,$100.00,$100.00,$300.00,$300.00


In [36]:
# (rows, columns) of the bid table; the row count is the number of bid items in this abstract.
df.shape

(78, 14)

In [37]:
import pandas as pd

def format_long_description(long_description: str) -> str:
    """Normalize an item description string.

    Leading and trailing whitespace is removed, then every pair of
    consecutive single quotes ('') is replaced with one double quote (").

    Args:
        long_description: Raw description text.

    Returns:
        The trimmed description with '' pairs collapsed to ".
    """
    trimmed = long_description.strip()
    return trimmed.replace("''", '"')


item_number = df["ItemNumber"]

# merged_id layout: <chars 0-3>_<chars 4-6>_<chars 8 onward>_<description>.
# Character 7 of ItemNumber (the "/" in the previewed rows) is skipped.
merged_id = (
    item_number.str[:4]
    + "_"
    + item_number.str[4:7]
    + "_"
    + item_number.str[8:]
    + "_"
    + df["ItemDescription"].apply(format_long_description)
)

# Index-only frame keyed by merged_id, built without in-place mutation so the
# cell gives the same result every time it is re-run.
abstract_items_df = merged_id.to_frame(name="merged_id").set_index("merged_id")
abstract_items_df.head()

2011_601_01000_AS BUILT
2021_501_00010_MOBILIZATION
2102_503_00010_PAVEMENT MARKING REMOVAL
2104_502_01240_REMOVE SIGN TYPE C
2104_502_01270_REMOVE SIGN TYPE EO


In [51]:
# Summarize the index-only frame. The original cell ended in a dangling
# attribute access ("abstract_items_df."), which is a SyntaxError; the recorded
# output below it is the DataFrame.info() summary, so that call is restored.
abstract_items_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78 entries, 2011_601_01000_AS BUILT to 2582_518_04070_PAVEMENT MESSAGE PREFORM THERMOPLASTIC GROUND IN CONTRAST ENHANCED SKID RESISTANCE
Empty DataFrame


In [52]:
# Left-join the abstract's items onto the spec-year matrix by merged_id.
# indicator=True adds a `_merge` column that marks items with no matrix row
# as "left_only".
merged_df = pd.merge(
    left=abstract_items_df,
    right=matrix,
    how="left",
    on="merged_id",
    indicator=True,
)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78 entries, 2011_601_01000_AS BUILT to 2582_518_04070_PAVEMENT MESSAGE PREFORM THERMOPLASTIC GROUND IN CONTRAST ENHANCED SKID RESISTANCE
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   2016    77 non-null     object  
 1   2018    77 non-null     object  
 2   2020    77 non-null     object  
 3   _merge  78 non-null     category
dtypes: category(1), object(3)
memory usage: 2.6+ KB


In [54]:
# Items from the abstract that found no row in the spec-year matrix.
merged_df.loc[merged_df["_merge"].ne("both")]

Unnamed: 0_level_0,2016,2018,2020,_merge
merged_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2211_507_00200_AGGREGATE BASE (CV) CLASS 5Q,,,,left_only


In [40]:
# Items explicitly flagged False for 2018. Rows with a missing (NaN) flag
# are not selected.
merged_df.loc[merged_df["2018"].eq(False)]

Unnamed: 0_level_0,2016,2018,2020
merged_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [41]:
# The year column labels are strings ("2016", not 2016), which is why the
# lookups in this notebook use merged_df["2018"].
# NOTE(review): the recorded output has no `_merge` column, so it appears to
# predate the indicator=True merge — re-run from a fresh kernel to refresh it.
merged_df.columns.to_list()

['2016', '2018', '2020']

In [42]:
# Number of abstract items flagged True for 2016. Summing an element-wise
# comparison returns 0 when nothing matches, whereas value_counts()[True]
# raises KeyError in that case. NaN flags (items with no matrix row) count as
# non-matches.
count_2016 = merged_df["2016"].eq(True).sum()
count_2016

6

In [43]:
# Number of abstract items flagged True for 2018. Summing an element-wise
# comparison returns 0 when nothing matches, whereas value_counts()[True]
# raises KeyError in that case. NaN flags (items with no matrix row) count as
# non-matches.
count_2018 = merged_df["2018"].eq(True).sum()
count_2018

77

In [44]:
# Number of abstract items flagged True for 2020. Summing an element-wise
# comparison returns 0 when nothing matches, whereas value_counts()[True]
# raises KeyError in that case. NaN flags (items with no matrix row) count as
# non-matches.
count_2020 = merged_df["2020"].eq(True).sum()
count_2020

76

In [45]:
# Scratch check of how max() breaks ties: when two keys share the largest
# value, the first one in insertion order ("2018" here) is returned.
test = dict(zip(("2016", "2018", "2020"), (100, 101, 101)))
max(test, key=lambda year: test[year])

'2018'

In [46]:
# Ask the library helper to estimate the contract's spec year from the matrix
# and the abstract's bid table (the same object bound to `df` earlier).
# NOTE(review): the recorded output is a string repr of per-year match counts,
# not a year — confirm this is the intended return value.
estimated_spec_year = spec_year.determine_contract_spec_year(matrix, abstract_data.bid_df)
estimated_spec_year

"{'item_count': 78, '2016_matches': 6, '2018_matches': 77, '2020_matches': 76}"