In [1]:
# extract.py

# import relevant modules
import pandas as pd

# extract data
def extract_data(filepath: object) -> pd.DataFrame:
    """
       Simple Extract Function in Python with Error Handling
       :param filepath: str, file path to CSV data
       :output: pandas dataframe, extracted from CSV data
    """
    try:
        # Read the CSV file and store it in a dataframe
        df = pd.read_csv(filepath)

    # Handle exception if any of the files are missing
    except FileNotFoundError as e:
        print(f"Error: {e}")

    # Handle any other exceptions
    except Exception as e:
        print(f"Error: {e}")

    return df

In [5]:
# transform.py

# import modules
import pandas as pd

# transform data
def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and transforms crash data.
    :param df: pandas dataframe, extracted data
    :return: pandas dataframe, transformed data
    """
    
    # Drop duplicate rows
    df = df.drop_duplicates()

    # Convert mixed-type columns (strings & numbers) properly
    mixed_columns = ['crash_date_est_i', 'report_type', 'intersection_related_i',
                     'private_property_i', 'hit_and_run_i', 'photos_taken_i',
                     'statements_taken_i', 'work_zone_i', 'work_zone_type', 
                     'workers_present_i', 'most_severe_injury', 'location']

    for col in mixed_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")  # Convert to float, replacing bad values with NaN

    # Replace missing values in numeric columns with the mean
    df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)

    # Replace missing values in categorical columns with the mode
    for col in df.select_dtypes(include=['object']).columns:
        if not df[col].mode().empty:
            df[col].fillna(df[col].mode()[0], inplace=True)

    # Convert CRASH_DATE to datetime
    if 'CRASH_DATE' in df.columns:
        df['CRASH_DATE'] = pd.to_datetime(df['CRASH_DATE'], errors='coerce')

    # Convert integer-like columns properly
    if 'POSTED_SPEED_LIMIT' in df.columns:
        df['POSTED_SPEED_LIMIT'] = pd.to_numeric(df['POSTED_SPEED_LIMIT'], errors='coerce').astype('Int32')
    
    # convert every columns name to uppercase
    df.columns = df.columns.str.upper()

    return df

In [6]:
# load.py

# import relevant modules
import psycopg2

# establish connection to the Postgresql database
conn = psycopg2.connect(
    database="chicago_dmv",
    user="myuser",
    password="mypassword",
    host="localhost",
    port="5432"
)

# create a cursor object for running SQL queries
cur = conn.cursor()
print('successful creation of cursor object.')


# suggested continued learning: this function can be modified to be fully dynamic
def load_data(df: object, postgre_table: object, postgre_schema: object) -> object:
    """
    Load transformed data into respective PostgreSQL Table
    :param df: DataFrame with the data to insert
    :param postgre_table: The name of the target table
    :param postgre_schema: The schema in which the table resides
    :return: cursor object
    """
    insert_query = f"INSERT INTO {postgre_table} {postgre_schema};"

    for index, row in df.iterrows():
        if postgre_table == 'chicago_dmv.Crash':

            """
            crash_insert_PSQL: '''(CRASH_UNIT_ID,
                      CRASH_ID,
                      PERSON_ID,
                      VEHICLE_ID,
                      NUM_UNITS,
                      TOTAL_INJURIES)
                      VALUES (%s, %s, %s, %s, %s, %s)'''
            
            Columns in crashes_transformed_df: Index(['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE',
                'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
                'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
                'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
                'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
                'PRIVATE_PROPERTY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
                'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
                'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
                'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
                'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
                'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
                'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
                'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
                'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
                'LATITUDE', 'LONGITUDE', 'LOCATION'],
                dtype='object')
            """
            """
            insert_values = (row['CRASH_RECORD_ID'],
                             # clearly, i was supposed to merge this table with the other two to get this data
                             # but it doesn't matter anyway, i'll just skip
                             # row['CRASH_ID'],
                             # row['PERSON_ID'],
                             # row['VEHICLE_ID'],
                             row['NUM_UNITS'],
                             row['INJURIES_TOTAL'])
            """
            insert_values = (
                row.get('CRASH_RECORD_ID', None), 
                None, 
                None, 
                None, 
                row.get('NUM_UNITS', None), 
                row.get('TOTAL_INJURIES', None)
            )

        elif postgre_table == 'chicago_dmv.Vehicle':
            """
            vehicle_insert_PSQL: '''(CRASH_UNIT_ID,
                        CRASH_ID,
                        CRASH_DATE,
                        VEHICLE_ID,
                        VEHICLE_MAKE,
                        VEHICLE_MODEL,
                        VEHICLE_YEAR,
                        VEHICLE_TYPE)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'''
            
            Columns in vehicles_transformed_df: Index(['CRASH_UNIT_ID', 'CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE', 'UNIT_NO',
                'UNIT_TYPE', 'NUM_PASSENGERS', 'VEHICLE_ID', 'CMRC_VEH_I', 'MAKE',
                'MODEL', 'LIC_PLATE_STATE', 'VEHICLE_YEAR', 'VEHICLE_DEFECT',
                'VEHICLE_TYPE', 'VEHICLE_USE', 'TRAVEL_DIRECTION', 'MANEUVER',
                'TOWED_I', 'FIRE_I', 'OCCUPANT_CNT', 'EXCEED_SPEED_LIMIT_I', 'TOWED_BY',
                'TOWED_TO', 'AREA_00_I', 'AREA_01_I', 'AREA_02_I', 'AREA_03_I',
                'AREA_04_I', 'AREA_05_I', 'AREA_06_I', 'AREA_07_I', 'AREA_08_I',
                'AREA_09_I', 'AREA_10_I', 'AREA_11_I', 'AREA_12_I', 'AREA_99_I',
                'FIRST_CONTACT_POINT', 'CMV_ID', 'USDOT_NO', 'CCMC_NO', 'ILCC_NO',
                'COMMERCIAL_SRC', 'GVWR', 'CARRIER_NAME', 'CARRIER_STATE',
                'CARRIER_CITY', 'HAZMAT_PLACARDS_I', 'HAZMAT_NAME', 'UN_NO',
                'HAZMAT_PRESENT_I', 'HAZMAT_REPORT_I', 'HAZMAT_REPORT_NO',
                'MCS_REPORT_I', 'MCS_REPORT_NO', 'HAZMAT_VIO_CAUSE_CRASH_I',
                'MCS_VIO_CAUSE_CRASH_I', 'IDOT_PERMIT_NO', 'WIDE_LOAD_I',
                'TRAILER1_WIDTH', 'TRAILER2_WIDTH', 'TRAILER1_LENGTH',
                'TRAILER2_LENGTH', 'TOTAL_VEHICLE_LENGTH', 'AXLE_CNT', 'VEHICLE_CONFIG',
                'CARGO_BODY_TYPE', 'LOAD_TYPE', 'HAZMAT_OUT_OF_SERVICE_I',
                'MCS_OUT_OF_SERVICE_I', 'HAZMAT_CLASS'],
                dtype='object')
            """
            insert_values = (row['CRASH_UNIT_ID'],
                             row['CRASH_RECORD_ID'],  # Corresponds to CRASH_ID in the table
                             row['CRASH_DATE'],
                             row['VEHICLE_ID'],
                             row['MAKE'],
                             row['MODEL'],
                             row['VEHICLE_YEAR'],
                             row['VEHICLE_TYPE'])
        elif postgre_table == 'chicago_dmv.Person':

            """
            person_insert_PSQL: '''(PERSON_ID,
                        CRASH_ID,
                        CRASH_DATE,
                        PERSON_TYPE,
                        VEHICLE_ID,
                        PERSON_SEX,
                        PERSON_AGE)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)'''
            
            Columns in people_transformed_df: Index(['PERSON_ID', 'PERSON_TYPE', 'CRASH_RECORD_ID', 'RD_NO', 'VEHICLE_ID',
                'CRASH_DATE', 'SEAT_NO', 'CITY', 'STATE', 'ZIPCODE', 'SEX', 'AGE',
                'DRIVERS_LICENSE_STATE', 'DRIVERS_LICENSE_CLASS', 'SAFETY_EQUIPMENT',
                'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'HOSPITAL',
                'EMS_AGENCY', 'EMS_RUN_NO', 'DRIVER_ACTION', 'DRIVER_VISION',
                'PHYSICAL_CONDITION', 'PEDPEDAL_ACTION', 'PEDPEDAL_VISIBILITY',
                'PEDPEDAL_LOCATION', 'BAC_RESULT', 'BAC_RESULT_VALUE',
                'CELL_PHONE_USE'],
                dtype='object')
            """
            insert_values = (row['PERSON_ID'],
                             row['CRASH_RECORD_ID'],
                             row['CRASH_DATE'],
                             row['PERSON_TYPE'],
                             row['VEHICLE_ID'],
                             row['SEX'],
                             row['AGE'])
        else:
            raise ValueError(f'Postgre Data Table {postgre_table} does not exist in this pipeline.')

        print(f"Insert query: {insert_query}")
        print(f"Insert values: {insert_values}")
        
        # Execute insert query
        cur.execute(insert_query, insert_values)

    # Commit changes
    conn.commit()


def close_conn(cur):
    """
    Closing Postgre connection
    :param cur: posgre cursor object
    :return: none
    """

    # Close the cursor and database connection
    cur.close()
    conn.close()
    print('successful closing of cursor object.')

successful creation of cursor object.


In [7]:
# pipeline.py

import yaml

# import pipeline configuration
with open('config.yaml', 'r') as file:
    config_data = yaml.safe_load(file)

def run_pipeline():
    # Step 1: Extract data
    crashes_df = extract_data(config_data['crash_filepath'])
    vehicle_df = extract_data(config_data['vehicle_filepath'])
    people_df = extract_data(config_data['people_filepath'])

    # print("crashes_df:\n", crashes_df.dtypes, "\n"*2)
    # print("vehicle_df:\n", vehicle_df.dtypes, "\n"*2)
    # print("people_df:\n", crashes_df.dtypes, "\n"*2)

    # for col in crashes_df.columns:
    #     types = crashes_df[col].apply(type).value_counts()
    #     print(f"Column: {col}\n{types}\n")

    # Step 2: Transform data
    crashes_transformed_df = transform_data(crashes_df)
    vehicle_transformed_df = transform_data(vehicle_df)
    people_transformed_df = transform_data(people_df)

    # print("Columns in crashes_transformed_df:", crashes_transformed_df.columns)
    # print("Columns in vehicles_transformed_df:", vehicle_transformed_df.columns)
    # print("Columns in people_transformed_df:", people_transformed_df.columns)


    # Step 3: Load data
    load_data(df=crashes_transformed_df,
                   postgre_table=config_data['crash_table_PSQL'],
                   postgre_schema=config_data['crash_insert_PSQL'])
    load_data(df=vehicle_transformed_df,
                   postgre_table=config_data['vehicle_table_PSQL'],
                   postgre_schema=config_data['vehicle_insert_PSQL'])
    load_data(df=people_transformed_df,
                   postgre_table=config_data['person_table_PSQL'],
                   postgre_schema=config_data['person_insert_PSQL'])


if __name__ == "__main__":
    run_pipeline()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

Insert query: INSERT INTO chicago_dmv.Crash (CRASH_UNIT_ID, CRASH_ID, PERSON_ID, VEHICLE_ID, NUM_UNITS, TOTAL_INJURIES) VALUES (%s, %s, %s, %s, %s, %s);
Insert values: ('530411c8611eb0ccb9b25f16b2955cd21761fa1928dcaa71279438be032526d11c08ca75075f176d2dc0085665b171fd57eca2c7cf88221d1191d351867bec81', None, None, None, 2, None)
Insert query: INSERT INTO chicago_dmv.Crash (CRASH_UNIT_ID, CRASH_ID, PERSON_ID, VEHICLE_ID, NUM_UNITS, TOTAL_INJURIES) VALUES (%s, %s, %s, %s, %s, %s);
Insert values: ('305b06235b250aa0029c07313c84f969f4bc13c1cc371592780886815ca51411acb80530c99edd0cd57329d1e808ca56923fb2d4a8ba93bd7d0b272eda9d769f', None, None, None, 3, None)
Insert query: INSERT INTO chicago_dmv.Crash (CRASH_UNIT_ID, CRASH_ID, PERSON_ID, VEHICLE_ID, NUM_UNITS, TOTAL_INJURIES) VALUES (%s, %s, %s, %s, %s, %s);
Insert values: ('444221c2a9bc82fc4f301062ab22b482d7d661cf88fcdf3b7b7e04e3675673f651dc99513069ca0ff83fc0bd852790324463419e52ba713a25703c94d15a156b', None, None, None, 2, None)
Insert query: IN

> ⚠️
> There's future warnings