In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Relative path to the item list CSV; it is resolved against the kernel's
# current working directory when the file is opened.
csv_file = Path("../data/csv/item_list_2020.csv")

def read_item_csv(csv_file: Path) -> pd.DataFrame:
    """Load the item list CSV into a DataFrame without any type inference.

    Args:
        csv_file: Location of the CSV file to read.

    Returns:
        A DataFrame whose columns all use the ``object`` dtype, so values such
        as item numbers keep their exact textual form.
    """
    # The file is opened in text mode here and the open handle is passed to
    # pandas; the handle is closed as soon as parsing is done.
    with open(csv_file, "r") as handle:
        items = pd.read_csv(handle, dtype="object")
    return items

# Raw item list exactly as read from disk; later cells derive new frames from it.
item_df = read_item_csv(csv_file)

In [3]:
# Preview the first rows of the raw frame.
item_df.head()

Unnamed: 0,Item Number,Short Description,Long Description,Unit Name,Plan Unit Description,Spec Year,Unnamed: 6
0,2011.601/00003,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LS,LUMP SUM,20,
1,2011.601/00010,VIBRATION MONITORING,VIBRATION MONITORING,LS,LUMP SUM,20,
2,2011.601/00015,SCOUR MONITORING,SCOUR MONITORING,LS,LUMP SUM,20,
3,2011.601/00018,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LS,LUMP SUM,20,
4,2011.601/00020,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LS,LUMP SUM,20,


In [4]:
# Inspect column names, non-null counts and dtypes of the raw frame.
# NOTE(review): "Unnamed: 6" has no non-null values — presumably the CSV rows
# end with a trailing delimiter; confirm against the source file.
item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Item Number            8034 non-null   object
 1   Short Description      8034 non-null   object
 2   Long Description       8034 non-null   object
 3   Unit Name              8034 non-null   object
 4   Plan Unit Description  8034 non-null   object
 5   Spec Year              8034 non-null   object
 6   Unnamed: 6             0 non-null      object
dtypes: object(7)
memory usage: 439.5+ KB


In [6]:
def filter_item_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the item columns used by the rest of the notebook.

    Args:
        df: Item frame carrying the original CSV header names.

    Returns:
        A new DataFrame restricted to the wanted columns, in the order listed
        below. ``DataFrame.filter`` skips labels that are not present in
        ``df`` instead of raising.
    """
    wanted = (
        "Item Number",
        "Short Description",
        "Long Description",
        "Unit Name",
        "Plan Unit Description",
        "Spec Year",
    )
    selected = df.filter(items=list(wanted), axis="columns")
    return selected

# Derive a new frame rather than overwriting item_df, so this cell can be re-run safely.
item_df_filtered = filter_item_columns(item_df)
item_df_filtered.head()

Unnamed: 0,Item Number,Short Description,Long Description,Unit Name,Plan Unit Description,Spec Year
0,2011.601/00003,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LS,LUMP SUM,20
1,2011.601/00010,VIBRATION MONITORING,VIBRATION MONITORING,LS,LUMP SUM,20
2,2011.601/00015,SCOUR MONITORING,SCOUR MONITORING,LS,LUMP SUM,20
3,2011.601/00018,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LS,LUMP SUM,20
4,2011.601/00020,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LS,LUMP SUM,20


In [7]:
def rename_item_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Translate the CSV header names into short snake_case column names.

    Args:
        df: Item frame carrying the original CSV header names.

    Returns:
        A new DataFrame with the mapped columns renamed. Columns that are not
        listed in the mapping keep their current name.
    """
    new_names = dict(
        [
            ("Item Number", "id"),
            ("Short Description", "description"),
            ("Long Description", "long_description"),
            ("Unit Name", "unit"),
            ("Plan Unit Description", "unit_description"),
            ("Spec Year", "spec_year"),
        ]
    )
    renamed = df.rename(new_names, axis="columns")
    return renamed

# Rename on top of the filtered frame; item_df_filtered itself is left untouched.
item_df_renamed = rename_item_columns(item_df_filtered)
item_df_renamed.head()

Unnamed: 0,id,description,long_description,unit,unit_description,spec_year
0,2011.601/00003,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LS,LUMP SUM,20
1,2011.601/00010,VIBRATION MONITORING,VIBRATION MONITORING,LS,LUMP SUM,20
2,2011.601/00015,SCOUR MONITORING,SCOUR MONITORING,LS,LUMP SUM,20
3,2011.601/00018,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LS,LUMP SUM,20
4,2011.601/00020,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LS,LUMP SUM,20


In [8]:
def format_item_values(df: pd.DataFrame, century: str = "20") -> pd.DataFrame:
    """Return a copy of ``df`` with ``spec_year`` expanded to a full integer year.

    The source data stores the specification year as a two-digit string
    (e.g. ``"20"``). This prepends ``century`` and converts the column to
    ``int64``. Values that are already four-digit years are converted as-is,
    so calling the function on its own output is a no-op instead of
    producing values such as ``202020``.

    Args:
        df: Item frame containing a ``spec_year`` column. The input frame is
            not modified. The column is assumed to have no missing values.
        century: Prefix placed in front of values that are not yet a
            four-digit year. Defaults to ``"20"``.

    Returns:
        A new DataFrame in which ``spec_year`` has dtype ``int64``.

    Raises:
        ValueError: If a value cannot be parsed as an integer after prefixing.
    """
    formatted_df = df.copy()
    # Work on stripped text so that both raw CSV strings and already-converted
    # integers are accepted.
    spec_year = formatted_df["spec_year"].astype(str).str.strip()
    # A value that is already a full four-digit year must not be prefixed again.
    already_full = spec_year.str.fullmatch(r"\d{4}")
    expanded = spec_year.where(already_full, century + spec_year)
    formatted_df["spec_year"] = expanded.astype("int64")
    return formatted_df

item_df_formatted = format_item_values(item_df_renamed)
# End the cell on the preview expression so that a fresh run reproduces the
# table stored as this cell's output.
item_df_formatted.head()


Unnamed: 0,id,description,long_description,unit,unit_description,spec_year
0,2011.601/00003,CONSTRUCTION SURVEYING,CONSTRUCTION SURVEYING,LS,LUMP SUM,2020
1,2011.601/00010,VIBRATION MONITORING,VIBRATION MONITORING,LS,LUMP SUM,2020
2,2011.601/00015,SCOUR MONITORING,SCOUR MONITORING,LS,LUMP SUM,2020
3,2011.601/00018,SETTLEMENT MONITORING,SETTLEMENT MONITORING,LS,LUMP SUM,2020
4,2011.601/00020,REVISED BRIDGE PLANS,REVISED BRIDGE PLANS,LS,LUMP SUM,2020


In [9]:
# Confirm the result of the formatting step: spec_year should now be an integer column.
item_df_formatted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                8034 non-null   object
 1   description       8034 non-null   object
 2   long_description  8034 non-null   object
 3   unit              8034 non-null   object
 4   unit_description  8034 non-null   object
 5   spec_year         8034 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 376.7+ KB
