In [None]:
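# Merge rule: an existing event's disaster_internal_displacements is overwritten with the IDMC value
# when the two records' GLIDE arrays share at least one identifier (a "partial match").
# An IDMC record is inserted as a new event only when its GLIDE array shares no identifier with an existing event.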


In [None]:
from getpass import getpass

import psycopg2


def connect_to_db():
    """Prompt for credentials and open a psycopg2 connection.

    The password is requested first (via ``getpass``), then the host (via
    ``input``). The database name and user are fixed to ``merge`` and
    ``postgres``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # Prompt order matters to the person running the cell: password, then host.
    db_password = getpass("Enter the database password: ")
    db_host = input("Enter the database host: ")

    connection_args = {
        "dbname": "merge",
        "user": "postgres",
        "password": db_password,
        "host": db_host,
    }
    return psycopg2.connect(**connection_args)


In [None]:
def create_events_table(cur):
    """Create the merged ``events`` table if it does not already exist.

    Args:
        cur: A database cursor exposing ``execute(sql)``; the DDL below is
            sent through it as a single statement.

    Returns:
        None. Nothing is committed here; the caller owns the transaction.
    """
    # Identifier columns (USGS, GLIDE, DFO, ...) and the admin level 1/2 columns
    # are TEXT[] arrays; admin_level_0 is a single VARCHAR value.
    events_ddl = """
    CREATE TABLE IF NOT EXISTS events (
        event_id SERIAL PRIMARY KEY,
        event_name VARCHAR(255),
        disaster_group VARCHAR(50),
        disaster_subgroup VARCHAR(50),
        disaster_type VARCHAR(50),
        disaster_subtype VARCHAR(50),
        iso3_code CHAR(3),
        admin_level_0 VARCHAR(100),
        admin_level_1 TEXT[],
        admin_level_2 TEXT[],
        start_date DATE,
        end_date DATE,
        total_deaths INTEGER,
        number_injured INTEGER,
        number_affected INTEGER,
        number_homeless INTEGER,
        total_affected INTEGER,
        total_damage_adjusted FLOAT,
        reconstruction_costs_adjusted FLOAT,
        aid_contribution FLOAT,
        disaster_internal_displacements INTEGER,
        source VARCHAR(50),
        metadata JSONB,
        USGS TEXT[],
        GLIDE TEXT[],
        DFO TEXT[],
        local_Identifier TEXT[],
        IFRC_Appeal_ID TEXT[],
        Government_Assigned_Identifier TEXT[]
    )
    """
    cur.execute(events_ddl)


# def create_indexes(cur):
#     # Create indexes for specified columns
#     index_columns = [
#         "disaster_type",
#         "disaster_subtype",
#         "iso3_code",
#         "admin_level_0",
#         "start_date",
#         "GLIDE"
#     ]

#     for column in index_columns:
#         index_name = f"idx_{column.lower().replace(' ', '_')}"
#         cur.execute(f"""
#         CREATE INDEX IF NOT EXISTS {index_name}
#         ON events ({column})
#         """)

#     # Create special indexes for array columns
#     cur.execute("""
#     CREATE INDEX IF NOT EXISTS idx_admin_level_1
#     ON events USING GIN (admin_level_1)
#     """)

#     cur.execute("""
#     CREATE INDEX IF NOT EXISTS idx_admin_level_2
#     ON events USING GIN (admin_level_2)
#     """)


In [None]:
def merge_and_insert_events(cur):
    """Fill ``events`` from ``events_emdat`` and then merge in ``events_idmc``.

    Two statements are executed on ``cur``:

    1. Every row of ``events_emdat`` is copied into ``events``.
    2. For each ``events_idmc`` row, any ``events`` row whose GLIDE array
       shares at least one element with it has its
       ``disaster_internal_displacements`` overwritten with the IDMC value.
       IDMC rows whose GLIDE array shares no element with any updated event
       are inserted as new ``events`` rows.

    GLIDE matching uses PostgreSQL's array overlap operator (``&&``) rather
    than an ``unnest``/``JOIN`` pair: it yields the same matches (NULL arrays
    and NULL elements never match in either form) and can be served by a GIN
    index on ``events.GLIDE``.

    Args:
        cur: A database cursor exposing ``execute(sql)``.

    Returns:
        None. Nothing is committed here; the caller owns the transaction.
    """
    # Step 1: copy all EM-DAT events.
    # NOTE(review): this is a plain INSERT ... SELECT, so running the function
    # again against a populated table appends the EM-DAT rows a second time.
    cur.execute("""
    INSERT INTO events (
        event_name, disaster_group, disaster_subgroup, disaster_type, disaster_subtype,
        iso3_code, admin_level_0, admin_level_1, admin_level_2, start_date, end_date,
        total_deaths, number_injured, number_affected, number_homeless, total_affected,
        total_damage_adjusted, reconstruction_costs_adjusted, aid_contribution,
        disaster_internal_displacements, source, metadata, USGS, GLIDE, DFO
    )
    SELECT 
        event_name, disaster_group, disaster_subgroup, disaster_type, disaster_subtype,
        iso3_code, admin_level_0, admin_level_1, admin_level_2, start_date, end_date,
        total_deaths, number_injured, number_affected, number_homeless, total_affected,
        total_damage_adjusted, reconstruction_costs_adjusted, aid_contribution,
        disaster_internal_displacements, source, metadata, USGS, GLIDE, DFO
    FROM events_emdat
    """)

    # Step 2: update-or-insert from IDMC in a single statement. The "updated"
    # CTE's RETURNING output is how the INSERT learns which GLIDE arrays were
    # matched; only GLIDE is returned because that is all the INSERT reads.
    # The ::text[] cast keeps "&&" resolvable if events_idmc.GLIDE is declared
    # as another text-like array type (e.g. varchar[]) -- its schema is not
    # visible here.
    # NOTE(review): when several IDMC rows overlap the same event, PostgreSQL
    # applies only one of them in UPDATE ... FROM and does not define which;
    # confirm whether an aggregate (SUM/MAX) is wanted instead.
    cur.execute("""
    WITH idmc_data AS (
        SELECT * FROM events_idmc
    ), updated AS (
        UPDATE events e
        SET disaster_internal_displacements = i.disaster_internal_displacements
        FROM idmc_data i
        WHERE e.GLIDE && i.GLIDE::text[]
        RETURNING e.GLIDE
    )
    INSERT INTO events (
        event_name, disaster_type, disaster_subtype, iso3_code, admin_level_0,
        admin_level_1, admin_level_2, start_date, disaster_internal_displacements,
        source, metadata, GLIDE, local_Identifier, IFRC_Appeal_ID, Government_Assigned_Identifier
    )
    SELECT 
        i.event_name, i.disaster_type, i.disaster_subtype, i.iso3_code, i.admin_level_0,
        i.admin_level_1, i.admin_level_2, i.start_date, i.disaster_internal_displacements,
        i.source, i.metadata, i.GLIDE, i.local_Identifier, i.IFRC_Appeal_ID, i.Government_Assigned_Identifier
    FROM idmc_data i
    WHERE NOT EXISTS (
        SELECT 1 FROM updated u
        WHERE u.GLIDE && i.GLIDE::text[]
    )
    """)


def main():
    """Connect, create the ``events`` table, merge the source tables, and commit.

    Any exception raised while connecting or while running the statements is
    reported with ``print`` rather than re-raised. The cursor and connection
    are always closed before returning.

    Returns:
        None.
    """
    # Initialised up front so the finally block can tell what was actually opened.
    conn = None
    cur = None

    try:
        # Connecting happens inside the try so that connection failures reach
        # the handler below, as its message promises.
        conn = connect_to_db()
        cur = conn.cursor()

        # Create the new events table
        create_events_table(cur)

        # Merge and insert data
        merge_and_insert_events(cur)

        # Index creation is currently disabled (create_indexes is commented out),
        # so the success message below does not claim it.
        # create_indexes(cur)

        # Commit the changes
        conn.commit()
        print("Data merged and inserted successfully.")

    except Exception as error:
        # psycopg2.Error derives from Exception, so this covers database errors too.
        # No explicit rollback: the connection is closed in the finally block,
        # and closing without a commit discards the pending changes.
        print(
            "Error while connecting to PostgreSQL or executing query:", error
        )

    finally:
        # Close whatever was opened, cursor first.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
            print("PostgreSQL connection is closed")


if __name__ == "__main__":
    main()


In [None]:
# Reference SQL for indexing the merged events table. It is only held as a
# string literal here; nothing in this cell sends it to the database.
# GLIDE is a TEXT[] column, so it is indexed with GIN like the other array
# columns: a default (B-tree) index on an array only helps whole-array
# comparisons, not element/overlap lookups.
# If a B-tree idx_glide was already created from an earlier version of this
# SQL, drop it first; IF NOT EXISTS would otherwise skip the new definition.
"""
-- Create indexes for specified columns
CREATE INDEX IF NOT EXISTS idx_disaster_type ON events (disaster_type);
CREATE INDEX IF NOT EXISTS idx_disaster_subtype ON events (disaster_subtype);
CREATE INDEX IF NOT EXISTS idx_iso3_code ON events (iso3_code);
CREATE INDEX IF NOT EXISTS idx_admin_level_0 ON events (admin_level_0);
CREATE INDEX IF NOT EXISTS idx_start_date ON events (start_date);

-- Create special indexes for array columns
CREATE INDEX IF NOT EXISTS idx_glide ON events USING GIN (GLIDE);
CREATE INDEX IF NOT EXISTS idx_admin_level_1 ON events USING GIN (admin_level_1);
CREATE INDEX IF NOT EXISTS idx_admin_level_2 ON events USING GIN (admin_level_2);
"""


In [None]:
# Flatten the admin areas.
#
# Reference SQL, held only as a string literal; nothing in this cell sends it
# to the database. It builds events_flattened_admin with every events column
# plus two extra columns, admin_level and admin_name:
#   - admin_level 0: one row per event, admin_name taken from admin_level_0.
#   - admin_level 1 / 2: one row per element of admin_level_1 / admin_level_2,
#     restricted to events where array_length(<column>, 1) > 0.
# The statements use plain CREATE TABLE / CREATE INDEX (no IF NOT EXISTS), so
# they are written for a database where these objects do not exist yet.

"""
CREATE TABLE events_flattened_admin AS
SELECT
    e.*,
    0 AS admin_level,
    e.admin_level_0 AS admin_name
FROM events e

UNION ALL

SELECT
    e.*,
    1 AS admin_level,
    unnest(e.admin_level_1) AS admin_name
FROM events e
WHERE array_length(e.admin_level_1, 1) > 0

UNION ALL

SELECT
    e.*,
    2 AS admin_level,
    unnest(e.admin_level_2) AS admin_name
FROM events e
WHERE array_length(e.admin_level_2, 1) > 0;

-- Create indexes to improve query performance
CREATE INDEX idx_flattened_events_event_id ON events_flattened_admin (event_id);
CREATE INDEX idx_flattened_events_admin_level ON events_flattened_admin (admin_level);
CREATE INDEX idx_flattened_events_admin_name ON events_flattened_admin (admin_name);
CREATE INDEX idx_flattened_events_iso3_code ON events_flattened_admin (iso3_code);
"""


In [None]:
# Reference SQL, held only as a string literal; nothing in this cell sends it
# to the database. It defines the view events_with_geometry: rows of
# events_flattened_admin joined to gadm_combined where
#   (a) admin_level 0/1/2 corresponds to gc.admin_level 'admin0'/'admin1'/'admin2',
#   (b) the admin names are equal, and
#   (c) the ISO3 codes are equal or the event's iso3_code is NULL.
# The trailing "gc.id IS NOT NULL" filter removes the rows the LEFT JOIN left
# unmatched, so the view only contains events that found a gadm_combined row.
# NOTE(review): gadm_combined is not defined in the visible part of this
# notebook; its columns (id, centroid_lat, centroid_long, geojson_polygon,
# ISO3, admin_level, admin_name) are assumed from how they are used here.
"""
CREATE OR REPLACE VIEW events_with_geometry AS
SELECT
    fe.event_id,
    fe.event_name,
    fe.disaster_group,
    fe.disaster_subgroup,
    fe.disaster_type,
    fe.disaster_subtype,
    fe.iso3_code,
    fe.admin_level,
    fe.admin_name,
    fe.start_date,
    fe.end_date,
    fe.total_deaths,
    fe.number_injured,
    fe.number_affected,
    fe.number_homeless,
    fe.total_affected,
    fe.total_damage_adjusted,
    fe.reconstruction_costs_adjusted,
    fe.aid_contribution,
    fe.disaster_internal_displacements,
    fe.source,
    fe.metadata,
    fe.USGS,
    fe.GLIDE,
    fe.DFO,
    fe.local_Identifier,
    fe.IFRC_Appeal_ID,
    fe.Government_Assigned_Identifier,
    gc.id AS gid,
    gc.centroid_lat,
    gc.centroid_long,
    gc.geojson_polygon
FROM
    events_flattened_admin fe
LEFT JOIN
    gadm_combined gc ON CASE
        WHEN fe.admin_level = 0 THEN gc.admin_level = 'admin0'
        WHEN fe.admin_level = 1 THEN gc.admin_level = 'admin1'
        WHEN fe.admin_level = 2 THEN gc.admin_level = 'admin2'
    END
    AND fe.admin_name = gc.admin_name
    AND (fe.iso3_code = gc.ISO3 OR fe.iso3_code IS NULL)
WHERE
    gc.id IS NOT NULL;
"""
