In [1]:
import sys
from pathlib import Path

import pandas as pd

# Add the parent directory to the import path before the project imports below.
sys.path.append("..")

from mndot_bid_etl.reader.abstract import create_abstract_data_from_csv
from mndot_bid_etl.reader.item import create_item_data_from_csv
from mndot_bid_etl.transform import item, spec_year
from mndot_bid_etl.transform.functional.transform_bid import transform_bid_df
from mndot_bid_etl.transform.functional.transform_bidder import transform_bidder_df
from mndot_bid_etl.transform.functional.transform_contract import transform_contract_df

In [2]:
# Resolve the notebook-relative CSV locations first, then read each file.
abstract_csv = Path("../data/csv/2022/220002.csv").resolve()
item_csv_2020 = Path("../data/csv/item_list_2020.csv").resolve()
item_csv_2018 = Path("../data/csv/item_list_2018.csv").resolve()
item_csv_2016 = Path("../data/csv/item_list_2016.csv").resolve()

# One abstract file plus one item list per spec year.
abstract_data = create_abstract_data_from_csv(abstract_csv)
item_data_2020 = create_item_data_from_csv(item_csv_2020)
item_data_2018 = create_item_data_from_csv(item_csv_2018)
item_data_2016 = create_item_data_from_csv(item_csv_2016)

In [3]:
# Run the project's item transform on each spec year's raw item frame.
df_2020 = item.transform_item(item_data_2020.df)
df_2018 = item.transform_item(item_data_2018.df)
df_2016 = item.transform_item(item_data_2016.df)

In [4]:
# Show the column names, non-null counts and dtypes of the transformed 2020 item frame.
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   spec_year          8034 non-null   string
 1   spec_code          8034 non-null   string
 2   unit_code          8034 non-null   string
 3   item_code          8034 non-null   string
 4   short_description  8034 non-null   string
 5   long_description   8034 non-null   string
 6   unit               8034 non-null   string
 7   unit_abbreviation  8034 non-null   string
dtypes: string(8)
memory usage: 502.2 KB


In [5]:
def generate_in_spec_year_df(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-column membership frame for a single spec year's item list.

    Each row of ``df`` is reduced to a ``merged_id`` string made of its
    ``spec_code``, ``unit_code``, ``item_code`` and ``long_description``
    values joined with underscores. The result is indexed by ``merged_id``
    and holds a single column, named after the frame's spec year, that is
    ``True`` on every row.

    Args:
        df: Item frame with ``spec_year``, ``spec_code``, ``unit_code``,
            ``item_code`` and ``long_description`` columns. The spec year is
            read from the first row only; presumably every row shares the
            same value (this is not checked here).

    Returns:
        A DataFrame indexed by ``merged_id`` with one all-``True`` column
        whose name is the spec year value.

    Raises:
        ValueError: If ``df`` has no rows, so no spec year can be read.
    """
    if df.empty:
        raise ValueError("Cannot determine spec_year from an empty DataFrame")

    # Positional lookup: a label lookup of 0 fails on frames whose index was
    # filtered or re-labelled and no longer contains 0.
    spec_year = df["spec_year"].iloc[0]

    id_parts = ["spec_code", "unit_code", "item_code", "long_description"]
    merged_id = df[id_parts[0]]
    for part in id_parts[1:]:
        merged_id = merged_id + "_" + df[part]

    # The scalar True is broadcast to one flag per merged_id.
    out_df = pd.DataFrame({"merged_id": merged_id, spec_year: True})
    return out_df.set_index("merged_id")

In [6]:
# Build the per-year membership frames in one pass, oldest spec year first.
in_2016, in_2018, in_2020 = (
    generate_in_spec_year_df(year_df) for year_df in (df_2016, df_2018, df_2020)
)

In [7]:
# Outer-join the per-year flags on merged_id; an id missing from a year comes
# back as a missing value in that year's column.
# Going through the nullable "boolean" dtype before filling keeps the result a
# real bool column without relying on fillna's deprecated object-dtype
# downcasting.
spec_year_matrix = (
    in_2016.join(other=[in_2018, in_2020], how="outer")
    .astype("boolean")
    .fillna(value=False)
    .astype(bool)
)
spec_year_matrix.head()

Unnamed: 0_level_0,2016,2018,2020
merged_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011_601_00003,True,True,True
2011_601_00010,True,True,True
2011_601_00015,True,True,True
2011_601_00020,True,True,True
2011_601_00040,True,False,False


In [8]:
# Show the row count and per-column dtypes of the joined membership matrix.
spec_year_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14338 entries, 2011_601_00003 to 2582_503_80312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   2016    14338 non-null  bool 
 1   2018    14338 non-null  bool 
 2   2020    14338 non-null  bool 
dtypes: bool(3)
memory usage: 154.0 KB


In [9]:
# Build the matrix from the same three frames with the project's
# create_spec_year_matrix helper.
matrix = spec_year.create_spec_year_matrix([df_2016, df_2018, df_2020])
matrix.head()

Unnamed: 0_level_0,2016,2018,2020
merged_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011_601_00003_CONSTRUCTION SURVEYING,True,True,True
2011_601_00010_VIBRATION MONITORING,True,True,True
2011_601_00015_SCOUR MONITORING,True,True,True
2011_601_00020_REVISED BRIDGE PLANS,True,True,True
2011_601_00040_DESIGN,True,False,False


In [10]:
# Show the row count and per-column dtypes of the matrix returned by the project helper.
matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14426 entries, 2011_601_00003_CONSTRUCTION SURVEYING to 2582_503_80312_12" DOTTED LINE PREFORM THERMO GROUND IN CONT ESR
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   2016    14426 non-null  bool 
 1   2018    14426 non-null  bool 
 2   2020    14426 non-null  bool 
dtypes: bool(3)
memory usage: 155.0 KB
