In [1]:
import sys
sys.path.append("..")

from pathlib import Path

from mndot_bid_etl.reader.abstract import create_abstract_data_from_csv
from mndot_bid_etl.reader.item import create_item_data_from_csv
from mndot_bid_etl.transform.functional.transform_bid import transform_bid_df
from mndot_bid_etl.transform.functional.transform_bidder import transform_bidder_df
from mndot_bid_etl.transform.functional.transform_contract import transform_contract_df

In [2]:
# Resolve the CSV locations (relative to the working directory) up front,
# then hand the absolute paths to the project readers.
abstract_csv = Path("../data/csv/2022/220002.csv").resolve()
item_list_2020_csv = Path("../data/csv/item_list_2020.csv").resolve()

abstract_data = create_abstract_data_from_csv(abstract_csv)
item_data_2020 = create_item_data_from_csv(item_list_2020_csv)

# Item Transformer (Object)

In [3]:
# Run the existing object-based item transformer over the raw 2020 item list
# and preview the first rows of its output for reference.
from mndot_bid_etl.transform.recipes import item_transformer

transformed_item_2020_df = item_transformer.apply(item_data_2020.df)
transformed_item_2020_df.head()

Unnamed: 0,id,description,long_description,unit,unit_description,spec_year
0,2011.601/00003,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LS,LUMP SUM,2020
1,2011.601/00010,VIBRATION MONITORING,VIBRATION MONITORING,LS,LUMP SUM,2020
2,2011.601/00015,SCOUR MONITORING,SCOUR MONITORING,LS,LUMP SUM,2020
3,2011.601/00018,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LS,LUMP SUM,2020
4,2011.601/00020,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LS,LUMP SUM,2020


In [4]:
# Inspect column names, dtypes and null counts of the existing transformer's output.
transformed_item_2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                8034 non-null   string
 1   description       8034 non-null   string
 2   long_description  8034 non-null   string
 3   unit              8034 non-null   string
 4   unit_description  8034 non-null   string
 5   spec_year         8034 non-null   string
dtypes: string(6)
memory usage: 376.7 KB


In [5]:
# Spot-check one row (position 1110) where the short and long descriptions
# differ and the unit abbreviation contains an inner space ("C Y").
transformed_item_2020_df.iloc[1110]

id                                              2118.507/00050
description              AGGREGATE SURFACING (LV), CLASS 5 MOD
long_description    AGGREGATE SURFACING (LV), CLASS 5 MODIFIED
unit                                                       C Y
unit_description                                         CU YD
spec_year                                                 2020
Name: 1110, dtype: string

# New transformation logic to match API

In [6]:
# Start from the untransformed 2020 item frame (original column names);
# the cells below derive the new output columns from it.
df = item_data_2020.df
df.head()

Unnamed: 0,Item Number,Short Description,Long Description,Unit Name,Plan Unit Description,Spec Year,Unnamed: 6
0,2011.601/00003,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LS,LUMP SUM,20,
1,2011.601/00010,VIBRATION MONITORING,VIBRATION MONITORING,LS,LUMP SUM,20,
2,2011.601/00015,SCOUR MONITORING,SCOUR MONITORING,LS,LUMP SUM,20,
3,2011.601/00018,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LS,LUMP SUM,20,
4,2011.601/00020,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LS,LUMP SUM,20,


In [7]:
# Inspect the raw schema. Note the trailing "Unnamed: 6" column, which the
# listing below reports as entirely null.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Item Number            8034 non-null   string
 1   Short Description      8034 non-null   string
 2   Long Description       8034 non-null   string
 3   Unit Name              8034 non-null   string
 4   Plan Unit Description  8034 non-null   string
 5   Spec Year              8034 non-null   string
 6   Unnamed: 6             0 non-null      string
dtypes: string(7)
memory usage: 439.5 KB


In [8]:
import pandas as pd

In [9]:
# Begin with an empty frame; each following cell adds one output column
# derived from the raw item frame `df`.
out_df = pd.DataFrame()
out_df.head()

In [23]:
# "Spec Year" holds a two-digit year string ("20" in the preview above);
# prefix the century to get the four-digit form ("2020").
out_df["spec_year"] = "20" + df["Spec Year"]
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code
0,2020,2011,601,3
1,2020,2011,601,10
2,2020,2011,601,15
3,2020,2011,601,18
4,2020,2011,601,20


In [24]:
# Spec code = the first four characters of the item number
# (the "2011" in "2011.601/00003").
out_df["spec_code"] = df["Item Number"].str[:4]
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code
0,2020,2011,601,3
1,2020,2011,601,10
2,2020,2011,601,15
3,2020,2011,601,18
4,2020,2011,601,20


In [25]:
# Unit code = the three characters after the dot
# (the "601" in "2011.601/00003").
out_df["unit_code"] = df["Item Number"].str[5:8]
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code
0,2020,2011,601,3
1,2020,2011,601,10
2,2020,2011,601,15
3,2020,2011,601,18
4,2020,2011,601,20


In [26]:
# Item code = the five characters after the slash
# (the "00003" in "2011.601/00003").
out_df["item_code"] = df["Item Number"].str[9:14]
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code
0,2020,2011,601,3
1,2020,2011,601,10
2,2020,2011,601,15
3,2020,2011,601,18
4,2020,2011,601,20


In [27]:
# Trim surrounding whitespace and swap every semicolon for a comma.
# regex=False pins a literal replacement, so the result does not depend on
# the installed pandas version's default for `regex` (True before 2.0,
# False from 2.0 onward).
out_df["short_description"] = (
    df["Short Description"].str.strip().str.replace(";", ",", regex=False)
)
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code,short_description
0,2020,2011,601,3,CONSTRUCTION SURVEYING
1,2020,2011,601,10,VIBRATION MONITORING
2,2020,2011,601,15,SCOUR MONITORING
3,2020,2011,601,18,SETTLEMENT MONITORING
4,2020,2011,601,20,REVISED BRIDGE PLANS


In [28]:
# Trim surrounding whitespace and swap every semicolon for a comma.
# regex=False pins a literal replacement, so the result does not depend on
# the installed pandas version's default for `regex` (True before 2.0,
# False from 2.0 onward).
out_df["long_description"] = (
    df["Long Description"].str.strip().str.replace(";", ",", regex=False)
)
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code,short_description,long_description
0,2020,2011,601,3,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING
1,2020,2011,601,10,VIBRATION MONITORING,VIBRATION MONITORING
2,2020,2011,601,15,SCOUR MONITORING,SCOUR MONITORING
3,2020,2011,601,18,SETTLEMENT MONITORING,SETTLEMENT MONITORING
4,2020,2011,601,20,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS


In [30]:
# In the new layout `unit` carries the spelled-out plan unit description
# (e.g. "LUMP SUM"); the abbreviation goes into `unit_abbreviation` instead.
# This differs from the older transformer output above, where `unit` held
# the abbreviation ("LS").
out_df["unit"] = df["Plan Unit Description"].str.strip()
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code,short_description,long_description,unit
0,2020,2011,601,3,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LUMP SUM
1,2020,2011,601,10,VIBRATION MONITORING,VIBRATION MONITORING,LUMP SUM
2,2020,2011,601,15,SCOUR MONITORING,SCOUR MONITORING,LUMP SUM
3,2020,2011,601,18,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LUMP SUM
4,2020,2011,601,20,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LUMP SUM


In [31]:
# Trim the unit name and drop inner spaces so abbreviations such as "C Y"
# collapse to "CY". regex=False pins a literal replacement, so the result
# does not depend on the installed pandas version's default for `regex`.
out_df["unit_abbreviation"] = (
    df["Unit Name"].str.strip().str.replace(" ", "", regex=False)
)
out_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code,short_description,long_description,unit,unit_abbreviation
0,2020,2011,601,3,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LUMP SUM,LS
1,2020,2011,601,10,VIBRATION MONITORING,VIBRATION MONITORING,LUMP SUM,LS
2,2020,2011,601,15,SCOUR MONITORING,SCOUR MONITORING,LUMP SUM,LS
3,2020,2011,601,18,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LUMP SUM,LS
4,2020,2011,601,20,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LUMP SUM,LS


In [32]:
# Check that the hand-built frame has the eight expected columns and no nulls.
out_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   spec_year          8034 non-null   string
 1   spec_code          8034 non-null   string
 2   unit_code          8034 non-null   string
 3   item_code          8034 non-null   string
 4   short_description  8034 non-null   string
 5   long_description   8034 non-null   string
 6   unit               8034 non-null   string
 7   unit_abbreviation  8034 non-null   string
dtypes: string(8)
memory usage: 502.2 KB


In [33]:
# Run the packaged transform on the same raw frame and preview it, so its
# output can be compared against the hand-built `out_df` from the cells above.
from mndot_bid_etl.transform import item

transformed_item_df = item.transform_item(item_data_2020.df)
transformed_item_df.head()

Unnamed: 0,spec_year,spec_code,unit_code,item_code,short_description,long_description,unit,unit_abbreviation
0,2020,2011,601,3,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LUMP SUM,LS
1,2020,2011,601,10,VIBRATION MONITORING,VIBRATION MONITORING,LUMP SUM,LS
2,2020,2011,601,15,SCOUR MONITORING,SCOUR MONITORING,LUMP SUM,LS
3,2020,2011,601,18,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LUMP SUM,LS
4,2020,2011,601,20,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LUMP SUM,LS


In [34]:
# Check that the packaged transform yields the same columns and dtypes as `out_df`.
transformed_item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   spec_year          8034 non-null   string
 1   spec_code          8034 non-null   string
 2   unit_code          8034 non-null   string
 3   item_code          8034 non-null   string
 4   short_description  8034 non-null   string
 5   long_description   8034 non-null   string
 6   unit               8034 non-null   string
 7   unit_abbreviation  8034 non-null   string
dtypes: string(8)
memory usage: 502.2 KB
