In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from mndot_bid_etl.transform.item_list import generate_item_list

#### Load Data

In [2]:
# Raw item-list exports, one CSV per spec year.
item_list_2016_csv, item_list_2018_csv, item_list_2020_csv = (
    Path("../data/csv/item_list_2016.csv"),
    Path("../data/csv/item_list_2018.csv"),
    Path("../data/csv/item_list_2020.csv"),
)

# Build the ItemList object that the rest of the notebook inspects.
item_list = generate_item_list(
    item_list_2016_csv,
    item_list_2018_csv,
    item_list_2020_csv,
)

In [3]:
# Destinations for the transformed item lists, one CSV per spec year.
transformed_item_list_2016_csv = Path("../data/csv/transformed_item_list_2016.csv").resolve()
transformed_item_list_2018_csv = Path("../data/csv/transformed_item_list_2018.csv").resolve()
transformed_item_list_2020_csv = Path("../data/csv/transformed_item_list_2020.csv").resolve()

# Give pandas the path rather than a text handle from open(path, "w"): a text handle
# must be opened with newline="" or the csv writer's line endings get translated a
# second time on Windows (blank rows), and its encoding follows the machine locale.
# With a path, pandas opens the file itself with newline="" and UTF-8.
# quotechar="'" is kept as-is so the file format does not change.
item_list.item_2016_df.to_csv(transformed_item_list_2016_csv, quotechar="'")
item_list.item_2018_df.to_csv(transformed_item_list_2018_csv, quotechar="'")
item_list.item_2020_df.to_csv(transformed_item_list_2020_csv, quotechar="'")

In [4]:
# Column names, non-null counts and dtypes of the 2016 item list.
item_list.item_2016_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7713 entries, 0 to 7712
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                7713 non-null   string
 1   description       7713 non-null   string
 2   long_description  7713 non-null   string
 3   unit              7713 non-null   string
 4   unit_description  7713 non-null   string
 5   spec_year         7713 non-null   string
dtypes: string(6)
memory usage: 361.7 KB


In [5]:
# Column names, non-null counts and dtypes of the 2018 item list.
item_list.item_2018_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8032 entries, 0 to 8031
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                8032 non-null   string
 1   description       8032 non-null   string
 2   long_description  8032 non-null   string
 3   unit              8032 non-null   string
 4   unit_description  8032 non-null   string
 5   spec_year         8032 non-null   string
dtypes: string(6)
memory usage: 376.6 KB


In [6]:
# Column names, non-null counts and dtypes of the 2020 item list.
item_list.item_2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                8034 non-null   string
 1   description       8034 non-null   string
 2   long_description  8034 non-null   string
 3   unit              8034 non-null   string
 4   unit_description  8034 non-null   string
 5   spec_year         8034 non-null   string
dtypes: string(6)
memory usage: 376.7 KB


#### ID Matrix

In [7]:
# Membership frame for 2020: indexed by item id, with a single always-True flag column.
item_2020_ids = pd.DataFrame(index=item_list.item_2020_df["id"])
item_2020_ids["is_in_2020"] = True
item_2020_ids.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8034 entries, 2011.601/00003 to 2999.501/00010
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2020  8034 non-null   bool 
dtypes: bool(1)
memory usage: 70.6 KB


In [8]:
# Membership frame for 2018: indexed by item id, with a single always-True flag column.
item_2018_ids = pd.DataFrame(index=item_list.item_2018_df["id"])
item_2018_ids["is_in_2018"] = True
item_2018_ids.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8032 entries, 2011.601/00003 to 2999.501/00010
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2018  8032 non-null   bool 
dtypes: bool(1)
memory usage: 70.6 KB


In [9]:
# Membership frame for 2016: indexed by item id, with a single always-True flag column.
item_2016_ids = pd.DataFrame(index=item_list.item_2016_df["id"])
item_2016_ids["is_in_2016"] = True
item_2016_ids.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7713 entries, 2011.601/00003 to 2999.509/00001
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  7713 non-null   bool 
dtypes: bool(1)
memory usage: 67.8 KB


In [10]:
# Outer-join the per-year membership flags on item id. An id that is absent from a
# year comes back as NaN in that year's column, so each joined column holds only
# True or NaN.
#
# notna() converts "present / missing" into genuine bool columns. The previous
# fillna(value=False, inplace=True) only ended up as bool because pandas silently
# downcast the object-dtype result; that downcast is deprecated in recent pandas and
# the columns would stay object dtype once it is removed. Building the frame in one
# chain also avoids mutating it in place, so re-running the cell is idempotent.
item_id_matrix = (
    item_2016_ids
    .join(other=[item_2018_ids, item_2020_ids], how="outer")
    .notna()
    .sort_index()
)
item_id_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14338 entries, 2011.601/00003 to 2999.509/00001
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  14338 non-null  bool 
 1   is_in_2018  14338 non-null  bool 
 2   is_in_2020  14338 non-null  bool 
dtypes: bool(3)
memory usage: 154.0+ KB


In [11]:
# Keep the first row for every item id and drop any later repeats of the same id.
item_id_matrix_reindex = item_id_matrix.loc[
    ~item_id_matrix.index.duplicated(keep="first")
]
item_id_matrix_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14338 entries, 2011.601/00003 to 2999.509/00001
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  14338 non-null  bool 
 1   is_in_2018  14338 non-null  bool 
 2   is_in_2020  14338 non-null  bool 
dtypes: bool(3)
memory usage: 154.0+ KB


In [12]:
# Dump the id matrix for inspection. The path goes straight to pandas instead of a
# text handle from open(path, "w"): pandas then opens the file with newline="" and
# UTF-8 itself, which avoids doubled line endings on Windows and a locale-dependent
# encoding.
item_id_matrix.to_csv(Path("../data/csv/item_id_matrix_dump.csv"))

#### Long Description Matrix

In [13]:
# Membership frame for 2020 keyed by long description (repeated descriptions are kept here).
item_2020_long_desc = pd.DataFrame(index=item_list.item_2020_df["long_description"])
item_2020_long_desc["is_in_2020"] = True
item_2020_long_desc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8034 entries, CONSTRUCTION SURVEYING to END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2020  8034 non-null   bool 
dtypes: bool(1)
memory usage: 70.6 KB


In [14]:
# Keep the first row for every long description and drop later repeats.
item_2020_long_desc_reindex = item_2020_long_desc.loc[
    ~item_2020_long_desc.index.duplicated(keep="first")
]
item_2020_long_desc_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7459 entries, CONSTRUCTION SURVEYING to END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2020  7459 non-null   bool 
dtypes: bool(1)
memory usage: 65.6 KB


In [15]:
# Membership frame for 2018 keyed by long description (repeated descriptions are kept here).
item_2018_long_desc = pd.DataFrame(index=item_list.item_2018_df["long_description"])
item_2018_long_desc["is_in_2018"] = True
item_2018_long_desc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8032 entries, CONSTRUCTION SURVEYING to END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2018  8032 non-null   bool 
dtypes: bool(1)
memory usage: 70.6 KB


In [16]:
# Keep the first row for every long description and drop later repeats.
item_2018_long_desc_reindex = item_2018_long_desc.loc[
    ~item_2018_long_desc.index.duplicated(keep="first")
]
item_2018_long_desc_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7468 entries, CONSTRUCTION SURVEYING to END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2018  7468 non-null   bool 
dtypes: bool(1)
memory usage: 65.6 KB


In [17]:
# Membership frame for 2016 keyed by long description (repeated descriptions are kept here).
item_2016_long_desc = pd.DataFrame(index=item_list.item_2016_df["long_description"])
item_2016_long_desc["is_in_2016"] = True
item_2016_long_desc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7713 entries, CONSTRUCTION SURVEYING to END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  7713 non-null   bool 
dtypes: bool(1)
memory usage: 67.8 KB


In [18]:
# Keep the first row for every long description and drop later repeats.
item_2016_long_desc_reindex = item_2016_long_desc.loc[
    ~item_2016_long_desc.index.duplicated(keep="first")
]
item_2016_long_desc_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7201 entries, CONSTRUCTION SURVEYING to END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  7201 non-null   bool 
dtypes: bool(1)
memory usage: 63.3 KB


In [19]:
# Outer-join the per-year flags on long description. These inputs still contain
# repeated descriptions, so matching keys multiply rows in the join; the result is
# deduplicated in the next cell.
#
# notna() converts the True/NaN join result into genuine bool columns. The previous
# fillna(value=False, inplace=True) relied on pandas silently downcasting the
# object-dtype columns to bool, which is deprecated in recent pandas, and it mutated
# the frame in place.
item_long_desc_matrix = (
    item_2016_long_desc
    .join(other=[item_2018_long_desc, item_2020_long_desc], how="outer")
    .notna()
)
item_long_desc_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164344 entries, 0.5 CU YD SHOVEL to YELLOW TUBE DELINEATORS
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   is_in_2016  164344 non-null  bool 
 1   is_in_2018  164344 non-null  bool 
 2   is_in_2020  164344 non-null  bool 
dtypes: bool(3)
memory usage: 1.7 MB


In [20]:
# Collapse the joined matrix to the first row per long description.
item_long_desc_matrix_reindex = item_long_desc_matrix.loc[
    ~item_long_desc_matrix.index.duplicated(keep="first")
]
item_long_desc_matrix_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7988 entries, 0.5 CU YD SHOVEL to YELLOW TUBE DELINEATORS
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  7988 non-null   bool 
 1   is_in_2018  7988 non-null   bool 
 2   is_in_2020  7988 non-null   bool 
dtypes: bool(3)
memory usage: 85.8 KB


In [21]:
# Same outer join as above, but on the already-deduplicated per-year frames, so each
# long description appears exactly once in the result.
#
# notna() converts the True/NaN join result into genuine bool columns. The previous
# fillna(value=False, inplace=True) relied on pandas silently downcasting the
# object-dtype columns to bool, which is deprecated in recent pandas, and it mutated
# the frame in place.
item_long_desc_matrix2 = (
    item_2016_long_desc_reindex
    .join(other=[item_2018_long_desc_reindex, item_2020_long_desc_reindex], how="outer")
    .notna()
)
item_long_desc_matrix2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7988 entries, CONSTRUCTION SURVEYING to 12" DOTTED LINE PREFORM THERMO GROUND IN CONT ESR
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  7988 non-null   bool 
 1   is_in_2018  7988 non-null   bool 
 2   is_in_2020  7988 non-null   bool 
dtypes: bool(3)
memory usage: 85.8+ KB


#### Combined ID & Long Description Matrix

In [22]:

# Compound key for 2020: item id and long description joined by a single space.
compound_id_2020 = (
    item_list.item_2020_df["id"]
    + " "
    + item_list.item_2020_df["long_description"]
)
# Membership frame keyed by the compound id, with a single always-True flag column.
compound_id_2020_df = pd.DataFrame(index=compound_id_2020)
compound_id_2020_df["is_in_2020"] = True
compound_id_2020_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8034 entries, 2011.601/00003 CONSTRUCTION SURVEYING to 2999.501/00010 END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2020  8034 non-null   bool 
dtypes: bool(1)
memory usage: 70.6 KB


In [23]:
# Compound key for 2018: item id and long description joined by a single space.
compound_id_2018 = (
    item_list.item_2018_df["id"]
    + " "
    + item_list.item_2018_df["long_description"]
)
# Membership frame keyed by the compound id, with a single always-True flag column.
compound_id_2018_df = pd.DataFrame(index=compound_id_2018)
compound_id_2018_df["is_in_2018"] = True
compound_id_2018_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8032 entries, 2011.601/00003 CONSTRUCTION SURVEYING to 2999.501/00010 END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2018  8032 non-null   bool 
dtypes: bool(1)
memory usage: 70.6 KB


In [24]:
# Compound key for 2016: item id and long description joined by a single space.
compound_id_2016 = (
    item_list.item_2016_df["id"]
    + " "
    + item_list.item_2016_df["long_description"]
)
# Membership frame keyed by the compound id, with a single always-True flag column.
compound_id_2016_df = pd.DataFrame(index=compound_id_2016)
compound_id_2016_df["is_in_2016"] = True
compound_id_2016_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7713 entries, 2011.601/00003 CONSTRUCTION SURVEYING to 2999.509/00001 END TAPE
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  7713 non-null   bool 
dtypes: bool(1)
memory usage: 67.8 KB


In [25]:
# Outer-join the per-year flags on the compound "id long_description" key. A key
# that is absent from a year comes back as NaN in that year's column.
#
# notna() converts the True/NaN join result into genuine bool columns. The previous
# fillna(value=False, inplace=True) relied on pandas silently downcasting the
# object-dtype columns to bool, which is deprecated in recent pandas, and it mutated
# the frame in place. The query() cells below compare these columns against
# True/False, so they need real booleans.
compound_id_matrix = (
    compound_id_2016_df
    .join(other=[compound_id_2018_df, compound_id_2020_df], how="outer")
    .notna()
)
compound_id_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14426 entries, 2011.601/00003 CONSTRUCTION SURVEYING to 2582.503/80312 12" DOTTED LINE PREFORM THERMO GROUND IN CONT ESR
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  14426 non-null  bool 
 1   is_in_2018  14426 non-null  bool 
 2   is_in_2020  14426 non-null  bool 
dtypes: bool(3)
memory usage: 155.0+ KB


In [26]:
# Keep the first row for every compound id and drop any later repeats.
compound_id_matrix_reindex = compound_id_matrix.loc[
    ~compound_id_matrix.index.duplicated(keep="first")
]
compound_id_matrix_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14426 entries, 2011.601/00003 CONSTRUCTION SURVEYING to 2582.503/80312 12" DOTTED LINE PREFORM THERMO GROUND IN CONT ESR
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   is_in_2016  14426 non-null  bool 
 1   is_in_2018  14426 non-null  bool 
 2   is_in_2020  14426 non-null  bool 
dtypes: bool(3)
memory usage: 155.0+ KB


In [27]:
# Compound ids flagged for 2020 and for neither 2016 nor 2018.
only_2020_filter = "is_in_2016 == False & is_in_2018 == False & is_in_2020 == True"
unique_2020_items = compound_id_matrix.query(only_2020_filter).index.to_list()
len(unique_2020_items)


592

In [28]:
# Compound ids flagged for 2018 and for neither 2016 nor 2020.
only_2018_filter = "is_in_2016 == False & is_in_2018 == True & is_in_2020 == False"
unique_2018_items = compound_id_matrix.query(only_2018_filter).index.to_list()
len(unique_2018_items)

446

In [29]:
# Compound ids flagged for 2016 and for neither 2018 nor 2020.
only_2016_filter = "is_in_2016 == True & is_in_2018 == False & is_in_2020 == False"
unique_2016_items = compound_id_matrix.query(only_2016_filter).index.to_list()
len(unique_2016_items)

5800

#### Test ItemList Compound Matrix & Unique Compound Id Sets

In [30]:
# The library-built matrix should have the same shape as the hand-built
# compound_id_matrix above (14426 compound ids x 3 spec years). The message reports
# the actual shape so a failure is diagnosable without re-running anything.
assert item_list.compound_id_matrix.shape == (14426, 3), (
    f"unexpected compound_id_matrix shape: {item_list.compound_id_matrix.shape}"
)

In [31]:
# Should equal the count of 2016-only compound ids derived by hand above (5800).
# The message reports the actual count so a failure is diagnosable.
assert len(item_list.unique_2016_compound_ids) == 5800, (
    f"unexpected number of 2016-only compound ids: {len(item_list.unique_2016_compound_ids)}"
)

In [32]:
# Should equal the count of 2018-only compound ids derived by hand above (446).
# The message reports the actual count so a failure is diagnosable.
assert len(item_list.unique_2018_compound_ids) == 446, (
    f"unexpected number of 2018-only compound ids: {len(item_list.unique_2018_compound_ids)}"
)

In [33]:
# Should equal the count of 2020-only compound ids derived by hand above (592).
# The message reports the actual count so a failure is diagnosable.
assert len(item_list.unique_2020_compound_ids) == 592, (
    f"unexpected number of 2020-only compound ids: {len(item_list.unique_2020_compound_ids)}"
)

In [34]:
# Preview the first rows of the library-built compound id matrix.
item_list.compound_id_matrix.head()

Unnamed: 0,is_in_2016,is_in_2018,is_in_2020
2011.601/00003_CONSTRUCTION SURVEYING,True,True,True
2011.601/00010_VIBRATION MONITORING,True,True,True
2011.601/00015_SCOUR MONITORING,True,True,True
2011.601/00020_REVISED BRIDGE PLANS,True,True,True
2011.601/00040_DESIGN,True,False,False


In [35]:
# Spot-check: the row label (compound id) stored at integer position 10315.
item_list.compound_id_matrix.index[10315]

'2501.502/36015_15" CS SAFETY APRON'