In [None]:
# Disable SettingWithCopyWarning
import warnings

# Silence ALL Python warnings for the rest of the session. filterwarnings("ignore")
# is called without a category or module filter, so this hides far more than
# pandas' SettingWithCopyWarning (deprecation and runtime warnings disappear too).
# Note: Disabling warnings should be done cautiously. Ensure all DataFrame modifications are intentional.
warnings.filterwarnings("ignore")


In [None]:
import pandas as pd

# Ask interactively for the location of the input workbook. Because of input(),
# this cell blocks until the user answers and cannot run unattended.
file_path = input("Enter the path to the Excel file: ")

# Load the workbook into a DataFrame (no sheet is specified, so pandas' default applies).
df = pd.read_excel(file_path)


In [None]:
# Columns of the raw export that the rest of the notebook relies on.
selected_columns = [
    "ISO3",
    "Country / Territory",
    "Year",
    "Event Name",
    "Date of Event (start)",
    "Disaster Internal Displacements",
    "Hazard Type",
    "Hazard Sub Type",
    "Event Codes (Code:Type)",
]

# Keep only the selected columns. Take an explicit copy: later cells add and
# overwrite columns on df_filtered, and doing that on a bare slice of `df` is
# exactly what triggers pandas' SettingWithCopyWarning.
df_filtered = df[selected_columns].copy()


- Extract the GLIDE number from the `Event Codes (Code:Type)` column


In [None]:
# Preview the first rows whose event-codes column is populated.
df_filtered[df_filtered["Event Codes (Code:Type)"].notnull()].head()


In [None]:
# Extract Event Codes and create new columns
def extract_event_codes(event_codes):
    """
    Parse an event-codes cell into a mapping of code type -> list of code values.

    The cell is expected to look like "value:type; value:type; ...". Entries are
    separated by "; " and each entry is split on its first colon, except entries
    ending in ":IFRC Appeal ID", whose value is everything before that marker
    (so the value itself may contain colons).

    Args:
    event_codes (str): The Event Codes string to process (NaN/None allowed).

    Returns:
    dict: Code types as keys, lists of values (in order of appearance) as values.
          Empty when the input is missing; entries without a colon are skipped.
    """
    ifrc_marker = ":IFRC Appeal ID"
    grouped = {}

    if pd.isna(event_codes):
        return grouped

    for entry in event_codes.split("; "):
        if ifrc_marker in entry:
            # Special handling for IFRC Appeal ID
            value, kind = entry.split(ifrc_marker)[0], "IFRC Appeal ID"
        else:
            value, separator, kind = entry.partition(":")
            if not separator:
                # No colon at all: unexpected format, ignore this entry
                continue
        grouped.setdefault(kind, []).append(value)

    return grouped


# Parse every row's event-code string into a {code type: [values]} dictionary
df_filtered["EventCodes"] = df_filtered["Event Codes (Code:Type)"].apply(
    extract_event_codes
)

# Collect the code types seen anywhere in the data. Sorting makes the order of
# the new columns (and therefore of the exported file) reproducible; iterating a
# set of strings directly can yield a different order in every kernel session.
event_code_types = sorted(
    {
        code_type
        for code_dict in df_filtered["EventCodes"]
        for code_type in code_dict
    }
)

# One column per code type, holding that row's list of values (None when absent).
# `key=code_type` binds the current loop value inside the lambda.
for code_type in event_code_types:
    df_filtered[code_type] = df_filtered["EventCodes"].apply(
        lambda codes, key=code_type: codes.get(key)
    )

# Drop the temporary column
df_filtered = df_filtered.drop(columns=["EventCodes"])

# Display the first few rows to verify the changes
print("\nFirst few rows after Event Code extraction:")
display(df_filtered[["Event Codes (Code:Type)"] + event_code_types].head())

# Count the number of non-null values in each new column
print("\nNumber of non-null values in each new Event Code column:")
display(df_filtered[event_code_types].count())

# Display rows where Glide Number is not null
# (raises KeyError if no row carried a "Glide Number" code)
print("\nRows where Glide Number is not null:")
display(df_filtered[df_filtered["Glide Number"].notna()].head())


In [None]:
# Function to process Glide Number
def process_glide_number(glide_number):
    """
    Strip the trailing "-<suffix>" from one Glide Number or from each in a list.

    Args:
    glide_number (str or list): A Glide Number string, a list of them, or a
        missing value.

    Returns:
    tuple: (processed, valid). For a list input, `processed` is a list of the
        individually processed numbers and `valid` is True only if every
        element was judged valid. For any other input the result of
        process_single_glide is returned unchanged.
    """
    if not isinstance(glide_number, list):
        # Scalar (or missing) value: delegate directly
        return process_single_glide(glide_number)

    outcomes = [process_single_glide(item) for item in glide_number]
    stripped = [glide for glide, _ in outcomes]
    return stripped, all(is_valid for _, is_valid in outcomes)


def process_single_glide(glide_number):
    """
    Split one Glide Number at its last dash and validate the trailing piece.

    Args:
    glide_number: The Glide Number; anything that is missing or not a string
        is treated as "no Glide Number".

    Returns:
    tuple: (processed, valid)
        - (None, True) for missing / non-string input
        - (glide_number, True) when the string contains no dash
        - otherwise (text before the last dash, flag), where the flag is True
          only if the text after the last dash is exactly three uppercase
          letters (the shape of an ISO3 code). The suffix is removed even when
          the flag is False.
    """
    if pd.isna(glide_number) or not isinstance(glide_number, str):
        return None, True

    head, dash, suffix = glide_number.rpartition("-")
    if not dash:
        # Nothing to strip
        return glide_number, True

    looks_like_iso3 = (
        len(suffix) == 3 and suffix.isupper() and suffix.isalpha()
    )
    return head, looks_like_iso3


# Apply the function to the Glide Number column.
# process_glide_number returns a (processed, valid) pair per row; zip(*...)
# transposes those pairs into two sequences, one per new column.
# NOTE(review): unpacking into two targets fails if the DataFrame has no rows.
df_filtered["Processed_Glide_Number"], df_filtered["Valid_ISO3"] = zip(
    *df_filtered["Glide Number"].apply(process_glide_number)
)

# Print Glide Numbers with invalid ISO3 codes
# (rows where at least one Glide Number did not end in a three-uppercase-letter suffix)
invalid_glide_numbers = df_filtered[~df_filtered["Valid_ISO3"]]
if not invalid_glide_numbers.empty:
    print("\nGlide Numbers with potentially invalid ISO3 codes:")
    for _, row in invalid_glide_numbers.iterrows():
        print(
            f"Original Glide Number: {row['Glide Number']}, Processed: {row['Processed_Glide_Number']}"
        )
else:
    print("\nAll Glide Numbers have valid ISO3 codes.")

# Update the Glide Number column with processed values.
# This overwrites the originals, including those flagged as invalid above.
df_filtered["Glide Number"] = df_filtered["Processed_Glide_Number"]

# Drop temporary columns
df_filtered = df_filtered.drop(
    columns=["Processed_Glide_Number", "Valid_ISO3"]
)

# Display the first few rows to verify the changes
print("\nFirst few rows after Glide Number processing:")
display(df_filtered[["Glide Number"]].head())

# Count the number of non-null Glide Numbers
print(
    f"\nNumber of non-null Glide Numbers: {df_filtered['Glide Number'].count()}"
)

# Display rows where Glide Number is not null
print("\nRows where Glide Number is not null:")
display(df_filtered[df_filtered["Glide Number"].notna()].head())


- Extract location names from `Event Name`


In [None]:
# Modify the ISO of IDMC to match GADM
# Function to modify ISO3 and Country/Territory to match GADM
def modify_iso_and_country(row):
    """
    Remap a row's ISO3 code and country name for the special-case territories.

    Args:
    row (Series): A row providing "ISO3" and "Country / Territory".

    Returns:
    Series: "ISO3" and "Country / Territory" after remapping. Codes without a
        special case are returned unchanged. "SCG" keeps its code but the
        country name is tagged "(flagged for review)".
    """
    # ISO3 in the data -> (replacement ISO3, replacement country name)
    overrides = {
        "HKG": ("CHN", "China"),
        "MAC": ("CHN", "China"),
        "ANT": ("NLD", "Netherlands"),
        # Serbia and Montenegro cannot be resolved automatically; keep and flag
        "SCG": ("SCG", "Serbia and Montenegro (flagged for review)"),
        "AB9": ("SDN", "Sudan"),
        "XKX": ("XKO", "Kosovo"),
    }

    current_iso = row["ISO3"]
    current_country = row["Country / Territory"]
    new_iso, new_country = overrides.get(
        current_iso, (current_iso, current_country)
    )
    return pd.Series({"ISO3": new_iso, "Country / Territory": new_country})


# Apply the function to modify ISO3 and Country/Territory.
# modify_iso_and_country returns a two-entry Series per row, so the row-wise
# apply yields a two-column frame that overwrites both columns at once.
df_filtered[["ISO3", "Country / Territory"]] = df_filtered.apply(
    modify_iso_and_country, axis=1
)

# Display the first few rows to verify the changes
print("\nFirst few rows after ISO3 and Country/Territory modification:")
display(df_filtered[["ISO3", "Country / Territory"]].head())

# Count the number of rows for each unique ISO3 code
print("\nNumber of rows for each unique ISO3 code:")
display(df_filtered["ISO3"].value_counts())


In [None]:
# Sanity check: modify_iso_and_country rewrites "XKX" to "XKO", so no rows
# should still carry the "XKX" code at this point.
df_filtered[df_filtered["ISO3"] == "XKX"]


In [None]:
# Read the IDMC admin area mapping Excel file
import pandas as pd

# Load the admin-area mapping workbook. The path is relative, so the file must
# sit in the notebook's working directory.
idmc_mapping_df = pd.read_excel("./IDMC_admin_area_mapping.xlsx")


In [None]:
# Narrow the mapping to the ISO3 key plus every column whose name starts with
# "GADM_", preserving the original column order.
gadm_columns = []
for column_name in idmc_mapping_df.columns:
    if column_name.startswith("GADM_") or column_name == "ISO3":
        gadm_columns.append(column_name)

filtered_idmc_mapping_df = idmc_mapping_df[gadm_columns]
filtered_idmc_mapping_df.head()


In [None]:
# Inspect the mapping rows filed under "XKO", the code that "XKX" is remapped to.
filtered_idmc_mapping_df[filtered_idmc_mapping_df["ISO3"] == "XKO"]


In [None]:
# Function to combine location names from GADM columns and group by admin levels
def combine_locations(group):
    """
    Collapse one ISO3 group of mapping rows into de-duplicated name lists.

    Every string cell in a recognised GADM_* column is split on "|", each piece
    is stripped, empty pieces are dropped, and the remainder is added to the
    set for that column's admin level.

    Args:
    group (DataFrame): Mapping rows that share one ISO3 code.

    Returns:
    Series: "country_list", "admin1_list" and "admin2_list"; each is a list of
        unique names (in no particular order) or None when nothing was found.
    """
    # GADM column name -> key of the output list it feeds
    level_of_column = {
        "GADM_Country": "country_list",
        "GADM_Country_Alt": "country_list",
        "GADM_Admin1": "admin1_list",
        "GADM_Admin1_Alt": "admin1_list",
        "GADM_Admin1_Local": "admin1_list",
        "GADM_Admin2": "admin2_list",
        "GADM_Admin2_Alt": "admin2_list",
        "GADM_Admin2_Local": "admin2_list",
    }
    collected = {
        "country_list": set(),
        "admin1_list": set(),
        "admin2_list": set(),
    }

    for _, record in group.iterrows():
        for column in record.index:
            cell = record[column]
            if not (column.startswith("GADM_") and isinstance(cell, str)):
                continue
            target = level_of_column.get(column)
            if target is None:
                # Other GADM_* columns are not part of any list
                continue
            pieces = (piece.strip() for piece in cell.split("|"))
            collected[target].update(piece for piece in pieces if piece)

    return pd.Series(
        {
            key: (list(names) if names else None)
            for key, names in collected.items()
        }
    )


# Group by ISO3 and apply the combine_locations function.
# combine_locations returns a three-entry Series per group, so the result has
# one row per ISO3 code; reset_index turns the group key back into a column.
grouped_df = (
    filtered_idmc_mapping_df.groupby("ISO3")
    .apply(combine_locations)
    .reset_index()
)

# Rename columns
# NOTE(review): assumes the frame above has exactly these four columns in this order.
grouped_df.columns = ["ISO3", "country_list", "admin1_list", "admin2_list"]

# Remove rows where all location lists are None
grouped_df = grouped_df.dropna(
    subset=["country_list", "admin1_list", "admin2_list"], how="all"
)

# Display the first few rows of the transformed DataFrame
print("\nFirst few rows of the transformed DataFrame:")
display(grouped_df.head())

# Print the shape of the resulting DataFrame
print(f"\nShape of the resulting DataFrame: {grouped_df.shape}")

# Print the number of non-null values for each location list column
print("\nNumber of non-null values in each location list column:")
print(grouped_df[["country_list", "admin1_list", "admin2_list"]].notna().sum())

# Print an example of the combined lists for a specific country (e.g., Afghanistan)
print("\nExample of combined lists for Afghanistan:")
display(grouped_df[grouped_df["ISO3"] == "AFG"])


In [None]:
# Confirm that the grouped location lists contain an entry for "XKO".
grouped_df[grouped_df["ISO3"] == "XKO"]


In [None]:
import re


def find_location_mentions(input_string, iso3, location_df):
    """
    Find location mentions in the input string for a specific ISO3 code.

    Known names for the country (country, admin1 and admin2 lists taken from
    `location_df`) are matched case-insensitively as whole words/phrases.

    Special ISO3 handling:
    - "HKG"/"MAC" are searched under "CHN"; when the text mentions Hong Kong or
      Macau, both names are added as admin1 candidates for this call only.
    - "ANT" is searched under "NLD", "XKX" under "XKO".
    - "SCG" is searched under both "SRB" and "MNE" and the results concatenated.
    - "AB9" always returns [("Abyei Area", "admin2")].

    Args:
    input_string (str): The input string to search for location mentions.
    iso3 (str): The ISO3 code to filter locations.
    location_df (DataFrame): One row per ISO3 with the columns "ISO3",
        "country_list", "admin1_list" and "admin2_list" (lists or None).
        It is never modified.

    Returns:
    list: Unique (location, admin_level) tuples for the matched locations,
        longest name first. Empty when either argument is not a string (e.g.
        NaN from a missing cell), when the ISO3 code is unknown, or when
        nothing matches.
    """
    # Missing cells arrive from pandas as NaN/None; there is nothing to search.
    if not isinstance(input_string, str) or not isinstance(iso3, str):
        return []

    iso3 = iso3.strip()
    # Handle special cases
    if iso3 in ("HKG", "MAC"):
        iso3 = "CHN"
    elif iso3 == "ANT":
        iso3 = "NLD"
    elif iso3 == "SCG":
        # For Serbia and Montenegro, we'll check both countries
        srb_locations = find_location_mentions(
            input_string, "SRB", location_df
        )
        mne_locations = find_location_mentions(
            input_string, "MNE", location_df
        )
        return srb_locations + mne_locations
    elif iso3 == "AB9":
        return [("Abyei Area", "admin2")]
    elif iso3 == "XKX":
        # Exact comparison: `iso3 in "XKX"` would be a substring test and also
        # accept "", "X", "K", "XK" and "KX".
        iso3 = "XKO"

    # Get location lists for the specified ISO3 code
    country_row = location_df[location_df["ISO3"] == iso3]
    if country_row.empty:
        print(f"No matching ISO3 code found for {iso3}")
        return []  # Return an empty list if no matching ISO3 is found
    country_row = country_row.iloc[0]

    def _as_list(value):
        """Copy a stored name list; None/NaN/anything else becomes an empty list."""
        return list(value) if isinstance(value, (list, tuple, set)) else []

    # Work on copies so the lists stored inside location_df are never mutated.
    country_list = _as_list(country_row["country_list"])
    admin1_list = _as_list(country_row["admin1_list"])
    admin2_list = _as_list(country_row["admin2_list"])

    # For HKG and MAC, add them to admin1 list
    lowered_text = input_string.lower()
    if iso3 == "CHN" and (
        "hong kong" in lowered_text or "macau" in lowered_text
    ):
        admin1_list.extend(["Hong Kong", "Macau"])

    # Combine all locations
    all_locations = (
        [(loc, "country") for loc in country_list]
        + [(loc, "admin1") for loc in admin1_list]
        + [(loc, "admin2") for loc in admin2_list]
    )
    if not all_locations:
        return []

    # Sort locations by length (longest first) to ensure longer matches take precedence
    sorted_locations = sorted(
        all_locations, key=lambda x: len(x[0]), reverse=True
    )

    # Create a regex pattern that matches any of the locations as whole words/phrases
    pattern = (
        r"\b(?:"
        + "|".join(re.escape(loc[0]) for loc in sorted_locations)
        + r")\b"
    )

    # Perform case-insensitive search; lower-case the hits once for fast lookup
    matched_names = {
        hit.lower() for hit in re.findall(pattern, input_string, re.IGNORECASE)
    }

    # Return unique matches with their admin level, in a deterministic order
    unique_matches = []
    already_added = set()
    for loc in sorted_locations:
        if loc[0].lower() in matched_names and loc not in already_added:
            already_added.add(loc)
            unique_matches.append(loc)
    return unique_matches


# Function to extract location names from Event Name
def extract_locations(event_name, iso3, location_df):
    """
    Look up which known locations of a country are mentioned in an event name.

    Thin wrapper that forwards its arguments to find_location_mentions.

    Args:
    event_name (str): The Event Name to search for locations.
    iso3 (str): The ISO3 code to filter locations.
    location_df (DataFrame): DataFrame containing location lists for each ISO3 code.

    Returns:
    list: Whatever find_location_mentions returns, i.e. (location, admin_level) tuples.
    """
    mentions = find_location_mentions(event_name, iso3, location_df)
    return mentions


# Tag every event with the locations mentioned in its name
def _locations_for_row(row):
    """Run the extraction for one event row against the grouped location lists."""
    return extract_locations(row["Event Name"], row["ISO3"], grouped_df)


df_filtered["Extracted_Locations"] = df_filtered.apply(
    _locations_for_row, axis=1
)

# Show a sample to verify the result
print("\nFirst few rows after location extraction:")
display(df_filtered[["ISO3", "Event Name", "Extracted_Locations"]].head(10))

# How many events had at least one location recognised?
events_with_locations = (
    df_filtered["Extracted_Locations"].map(len).gt(0).sum()
)
print(f"\nNumber of events with extracted locations: {events_with_locations}")
print(
    f"Percentage of events with extracted locations: {events_with_locations / len(df_filtered) * 100:.2f}%"
)

# Worked example on a hand-written event name
example_iso3 = "USA"
example_event = "Earthquake - North - Los Angeles 20100101"
example_locations = extract_locations(example_event, example_iso3, grouped_df)
print(
    f"\nExample extraction for ISO3 '{example_iso3}' and event '{example_event}':"
)
print(example_locations)

# ISO3 codes present in the events but absent from the location lists
missing_iso3 = set(df_filtered["ISO3"]) - set(grouped_df["ISO3"])
print("\nISO3 codes in df_filtered but not in grouped_df:")
print(missing_iso3)


In [None]:
# Re-check after location extraction: rows still carrying the "XKX" code
# (expected to be none, since the code was remapped to "XKO" earlier).
df_filtered[df_filtered["ISO3"] == "XKX"]


- Transform ISO3, Admin name, and Admin level to GADM's standard


In [None]:
# # Function to transform location names and expand admin levels
# def transform_locations(extracted_locations, iso3, filtered_idmc_mapping_df):
#     """
#     Transform location names to GADM's main names and expand admin levels.

#     Args:
#     extracted_locations (list): List of tuples containing (location, admin_level).
#     iso3 (str): The ISO3 code of the country.
#     filtered_idmc_mapping_df (DataFrame): DataFrame containing GADM location mappings.

#     Returns:
#     dict: A dictionary with transformed locations for each admin level.
#     """
#     country_row = filtered_idmc_mapping_df[filtered_idmc_mapping_df['ISO3'] == iso3]
#     if country_row.empty:
#         return {'admin_level_0': [], 'admin_level_1': [], 'admin_level_2': []}

#     result = {'admin_level_0': [], 'admin_level_1': [], 'admin_level_2': []}

#     for location, admin_level in extracted_locations:
#         if admin_level == 'country':
#             country_name = country_row['GADM_Country'].iloc[0]
#             if country_name not in result['admin_level_0']:
#                 result['admin_level_0'].append(country_name)
#         elif admin_level == 'admin1':
#             admin1_match = country_row[country_row['GADM_Admin1'].str.contains(location, case=False, na=False) |
#                                        country_row['GADM_Admin1_Alt'].str.contains(location, case=False, na=False) |
#                                        country_row['GADM_Admin1_Local'].str.contains(location, case=False, na=False)]
#             if not admin1_match.empty:
#                 admin1_name = admin1_match['GADM_Admin1'].iloc[0]
#                 if admin1_name not in result['admin_level_1']:
#                     result['admin_level_1'].append(admin1_name)
#         elif admin_level == 'admin2':
#             admin2_match = country_row[country_row['GADM_Admin2'].str.contains(location, case=False, na=False) |
#                                        country_row['GADM_Admin2_Alt'].str.contains(location, case=False, na=False) |
#                                        country_row['GADM_Admin2_Local'].str.contains(location, case=False, na=False)]
#             if not admin2_match.empty:
#                 admin2_name = admin2_match['GADM_Admin2'].iloc[0]
#                 if admin2_name not in result['admin_level_2']:
#                     result['admin_level_2'].append(admin2_name)

#     return result

# # Apply the transformation to the DataFrame
# df_filtered[['admin_level_0', 'admin_level_1', 'admin_level_2']] = df_filtered.apply(
#     lambda row: pd.Series(transform_locations(row['Extracted_Locations'], row['ISO3'], filtered_idmc_mapping_df)),
#     axis=1
# )

# # Display the first few rows to verify the changes
# print("\nFirst few rows after location transformation:")
# display(df_filtered[['ISO3', 'Event Name', 'Extracted_Locations', 'admin_level_0', 'admin_level_1', 'admin_level_2']].head(10))

# # Count the number of events with transformed locations
# events_with_transformed_locations = df_filtered[['admin_level_0', 'admin_level_1', 'admin_level_2']].apply(lambda x: x.apply(len) > 0).any(axis=1).sum()
# print(f"\nNumber of events with transformed locations: {events_with_transformed_locations}")
# print(f"Percentage of events with transformed locations: {events_with_transformed_locations / len(df_filtered) * 100:.2f}%")

# # Print the number of non-empty lists for each admin level column
# print("\nNumber of non-empty lists in each admin level column:")
# print(df_filtered[['admin_level_0', 'admin_level_1', 'admin_level_2']].apply(lambda x: x.apply(len) > 0).sum())

# # Check for rows with more than one admin_level_0 name
# rows_with_multiple_admin0 = df_filtered[df_filtered['admin_level_0'].apply(len) > 1]
# print(f"\nNumber of rows with more than one admin_level_0 name: {len(rows_with_multiple_admin0)}")

# if len(rows_with_multiple_admin0) > 0:
#     print("\nSample rows with multiple admin_level_0 names:")
#     display(rows_with_multiple_admin0[['ISO3', 'Event Name', 'Extracted_Locations', 'admin_level_0']].head())

# Function to transform location names and expand admin levels
def transform_locations(
    original_country, extracted_locations, iso3, filtered_idmc_mapping_df
):
    """
    Transform location names to GADM's main names and expand admin levels.

    Each extracted admin1/admin2 name is looked up (case-insensitive, literal
    substring match) in the main, "_Alt" and "_Local" GADM columns of that
    level; the main GADM name of the first matching row is recorded.
    Entries with any other admin level (e.g. "country") are ignored.

    Args:
    original_country (str): Country name currently stored for the event. It is
        replaced by the mapping's GADM_Country when the two differ (the
        difference is printed).
    extracted_locations (list): List of tuples containing (location, admin_level).
    iso3 (str): The ISO3 code of the country.
    filtered_idmc_mapping_df (DataFrame): DataFrame containing GADM location
        mappings ("ISO3", "GADM_Country", "GADM_Admin1[_Alt|_Local]",
        "GADM_Admin2[_Alt|_Local]").

    Returns:
    dict: "Country / Territory" (None when the ISO3 code is not in the
        mapping), "admin_level_1" and "admin_level_2" (lists of unique GADM
        names in order of first appearance).
    """
    country_row = filtered_idmc_mapping_df[
        filtered_idmc_mapping_df["ISO3"] == iso3
    ]
    if country_row.empty:
        return {
            "Country / Territory": None,
            "admin_level_1": [],
            "admin_level_2": [],
        }

    gadm_country = country_row["GADM_Country"].iloc[0]
    country = original_country
    if original_country != gadm_country:
        print(f"original: {original_country}, GADM: {gadm_country}")
        country = gadm_country

    result = {
        "Country / Territory": country,
        "admin_level_1": [],
        "admin_level_2": [],
    }

    # admin level tag -> (main GADM column, key in the result dict)
    level_targets = {
        "admin1": ("GADM_Admin1", "admin_level_1"),
        "admin2": ("GADM_Admin2", "admin_level_2"),
    }

    def _mentions(column, location):
        """Boolean mask of rows whose `column` contains `location` literally."""
        # regex=False: location names may contain characters such as "(" or "."
        # that must not be interpreted as regular-expression syntax.
        return country_row[column].str.contains(
            location, case=False, na=False, regex=False
        )

    for location, admin_level in extracted_locations:
        if admin_level not in level_targets:
            continue
        main_column, result_key = level_targets[admin_level]
        matching_rows = country_row[
            _mentions(main_column, location)
            | _mentions(f"{main_column}_Alt", location)
            | _mentions(f"{main_column}_Local", location)
        ]
        if matching_rows.empty:
            continue
        gadm_name = matching_rows[main_column].iloc[0]
        if gadm_name not in result[result_key]:
            result[result_key].append(gadm_name)

    return result


# Apply the transformation to the DataFrame.
# transform_locations returns a dict with the three keys below; wrapping it in
# pd.Series makes the row-wise apply produce one column per key.
# Note that "Country / Territory" is overwritten, and becomes None for rows
# whose ISO3 code is missing from the mapping.
df_filtered[["Country / Territory", "admin_level_1", "admin_level_2"]] = (
    df_filtered.apply(
        lambda row: pd.Series(
            transform_locations(
                row["Country / Territory"],
                row["Extracted_Locations"],
                row["ISO3"],
                filtered_idmc_mapping_df,
            )
        ),
        axis=1,
    )
)

# Display the first few rows to verify the changes
print("\nFirst few rows after location transformation:")
display(
    df_filtered[
        [
            "ISO3",
            "Event Name",
            "Extracted_Locations",
            "admin_level_1",
            "admin_level_2",
        ]
    ].head(10)
)

# Count the number of events with transformed locations
# (rows where at least one of the two admin-level lists is non-empty)
events_with_transformed_locations = (
    df_filtered[["admin_level_1", "admin_level_2"]]
    .apply(lambda x: x.apply(len) > 0)
    .any(axis=1)
    .sum()
)
print(
    f"\nNumber of events with transformed locations: {events_with_transformed_locations}"
)
print(
    f"Percentage of events with transformed locations: {events_with_transformed_locations / len(df_filtered) * 100:.2f}%"
)

# Print the number of non-empty lists for each admin level column
print("\nNumber of non-empty lists in each admin level column:")
print(
    df_filtered[["admin_level_1", "admin_level_2"]]
    .apply(lambda x: x.apply(len) > 0)
    .sum()
)


In [None]:
# Rows left without a country name, i.e. whose ISO3 code was not found in the
# mapping by transform_locations.
df_filtered[df_filtered["Country / Territory"].isna()]


In [None]:
# Spot check: all rows with ISO3 code "TWN".
df_col = df_filtered[df_filtered["ISO3"] == "TWN"]
df_col


In [None]:
# Rows for which no location could be extracted from the event name.
# No .head() is applied, so every such row is displayed.
df_filtered[df_filtered["Extracted_Locations"].apply(lambda x: len(x) == 0)]


- Map Hazard classification to EMDAT's classification


In [None]:
# Read the disaster hazard type mapping Excel file
import pandas as pd

# Read the hazard-type mapping workbook (relative path: it must sit in the
# notebook's working directory).
disaster_hazard_type_mapping_df = pd.read_excel(
    "./Disaster Hazard Type Map.xlsx"
)
# Filter rows where either "Hazard Type" or "Hazard Sub Type" is not null
# (rows with both missing cannot be used as a lookup key).
disaster_hazard_type_mapping_df = disaster_hazard_type_mapping_df[
    disaster_hazard_type_mapping_df["Hazard Type"].notna()
    | disaster_hazard_type_mapping_df["Hazard Sub Type"].notna()
]
disaster_hazard_type_mapping_df


In [None]:
def _hazard_key(hazard_type, hazard_sub_type):
    """
    Build the lookup key for a (hazard type, hazard sub-type) pair.

    Missing values are replaced by None. NaN never compares equal to itself, so
    a key containing NaN would only be found when the very same float object is
    used for the lookup; None makes the match reliable.
    """
    return (
        None if pd.isna(hazard_type) else hazard_type,
        None if pd.isna(hazard_sub_type) else hazard_sub_type,
    )


# Create a dictionary for mapping Hazard Type and Sub Type to Disaster Type and Subtype.
# If the mapping sheet repeats a pair, the last row wins.
hazard_to_disaster_map = {}
for _, row in disaster_hazard_type_mapping_df.iterrows():
    hazard_key = _hazard_key(row["Hazard Type"], row["Hazard Sub Type"])
    disaster_value = (row["Disaster Type"], row["Disaster Subtype"])
    hazard_to_disaster_map[hazard_key] = disaster_value


# Function to map hazard types to disaster types
def map_hazard_to_disaster(hazard_type, hazard_sub_type):
    """
    Maps hazard type and sub-type to disaster type and subtype.

    Args:
    hazard_type (str): The hazard type (may be missing).
    hazard_sub_type (str): The hazard sub-type (may be missing).

    Returns:
    tuple: The mapped (disaster type, disaster subtype). When the pair is not
        in the mapping, the inputs are returned unchanged.
    """
    return hazard_to_disaster_map.get(
        _hazard_key(hazard_type, hazard_sub_type),
        (hazard_type, hazard_sub_type),
    )


# Translate each row's hazard pair into a disaster type / subtype pair
def _disaster_pair(row):
    """Return the mapped (type, subtype) of one row as a two-entry Series."""
    return pd.Series(
        map_hazard_to_disaster(row["Hazard Type"], row["Hazard Sub Type"])
    )


df_filtered[["Disaster Type", "Disaster Subtype"]] = df_filtered.apply(
    _disaster_pair, axis=1
)

# Show the source and mapped columns side by side to verify the result
hazard_check_columns = [
    "Hazard Type",
    "Hazard Sub Type",
    "Disaster Type",
    "Disaster Subtype",
]
print(df_filtered[hazard_check_columns].head())


In [None]:
# Display the whole working frame (no .head(); the notebook's display limits apply).
df_filtered


- Rename columns


In [None]:
# Source column name -> snake_case name used from here on
column_mapping = {
    "Disaster Type": "disaster_type",
    "Disaster Subtype": "disaster_subtype",
    "ISO3": "iso3_code",
    "Country / Territory": "admin_level_0",
    "Disaster Internal Displacements": "disaster_internal_displacements",
    "Date of Event (start)": "start_date",
}

# Apply the renaming; columns not listed above keep their names
df_filtered = df_filtered.rename(columns=column_mapping)

# Show the renamed columns to verify the change
print("First few rows after renaming columns:")
renamed_columns = list(column_mapping.values())
display(df_filtered[renamed_columns].head())

# List every column name for confirmation
print("\nUpdated column names:")
print(df_filtered.columns.tolist())


In [None]:
# # Merge admin_level_0_original column with admin_level_0
# df_filtered['admin_level_0'] = df_filtered.apply(
#     lambda row: row['admin_level_0'] if pd.notnull(row['admin_level_0']) else row['admin_level_0_original'],
#     axis=1
# )

# # Drop the original column as it's no longer needed
# df_filtered.drop('admin_level_0_original', axis=1, inplace=True)

# # Display the first few rows to verify changes
# print("First few rows after merging admin_level_0 columns:")
# display(df_filtered[['iso3_code', 'admin_level_0']].head())


In [None]:
import json

# Columns whose values are folded into the per-row metadata JSON
metadata_columns = [
    "Event Name",
    "Event Codes (Code:Type)",
    "Year",
    "Hazard Type",
    "Hazard Sub Type",
]


# Function to create metadata JSON
def create_metadata(row):
    """
    Serialise the metadata columns of one row to a JSON object string.

    Only columns listed in `metadata_columns` are read, in that order, and
    entries whose value is missing (NaN/None) are left out.

    Args:
    row (pandas.Series): A row from the DataFrame.

    Returns:
    str: JSON string containing metadata.
    """
    metadata = {}
    for column in metadata_columns:
        value = row[column]
        if pd.notna(value):
            metadata[column] = value
    return json.dumps(metadata)


# Create the metadata column (one JSON string per row, built by create_metadata)
df_filtered["metadata"] = df_filtered.apply(create_metadata, axis=1)

# Remove the original columns that are now in metadata.
# After this, their values are only available inside the JSON string.
df_filtered = df_filtered.drop(columns=metadata_columns)

print("\nUpdated column names after removing metadata columns:")
print(df_filtered.columns.tolist())


In [None]:
# Add 'source' column with value 'IDMC' (same constant for every row)
df_filtered["source"] = "IDMC"

# Display the first few rows to verify the new column.
# "source" was appended last, so columns[:-1] is every other column; the
# selection below simply shows "source" first.
print("First few rows after adding 'source' column:")
display(df_filtered[["source"] + df_filtered.columns[:-1].tolist()].head())

# Confirm the new column is added
print("\nUpdated column names:")
print(df_filtered.columns.tolist())


In [None]:
# Give the identifier columns their final names: 'Glide Number' becomes 'GLIDE',
# the others swap spaces for underscores. Names not present are left alone.
identifier_renames = {
    "Glide Number": "GLIDE",
    "Local Identifier": "Local_Identifier",
    "IFRC Appeal ID": "IFRC_Appeal_ID",
    "Government Assigned Identifier": "Government_Assigned_Identifier",
}
df_filtered = df_filtered.rename(columns=identifier_renames)

# The raw extraction result is no longer needed
df_filtered = df_filtered.drop(columns=["Extracted_Locations"])

# Show a sample to verify the changes
print(
    "First few rows after renaming 'Glide Number' to 'GLIDE' and dropping 'Extracted_Locations':"
)
display(df_filtered.head())

# List every column name for confirmation
print("\nUpdated column names:")
print(df_filtered.columns.tolist())


In [None]:
import uuid


def generate_short_uuid():
    """Return the first 8 hexadecimal characters of a freshly generated random UUID (uuid4)."""
    return uuid.uuid4().hex[:8]


def create_event_name(row):
    """
    Build the event name "<iso3>_<type>_<subtype>_<YYYYMMDD>_<short uuid>".

    Missing fields are replaced by "UNKNOWN-..." placeholders. The trailing
    short UUID is random, so calling this twice on the same row gives two
    different names.

    Args:
    row (pandas.Series): A row providing "iso3_code", "disaster_type",
        "disaster_subtype" and "start_date" (the latter must support
        .strftime when it is not missing).

    Returns:
    str: Event name string.
    """

    def _value_or(column, placeholder):
        """Return the row's value for `column`, or `placeholder` when it is missing."""
        value = row[column]
        return value if pd.notna(value) else placeholder

    iso3 = _value_or("iso3_code", "UNKNOWN-ISO3-CODE")
    disaster_type = _value_or("disaster_type", "UNKNOWN-DISASTER-TYPE")
    disaster_subtype = _value_or(
        "disaster_subtype", "UNKNOWN-DISASTER-SUBTYPE"
    )

    # Format the start date as YYYYMMDD when present
    if pd.notna(row["start_date"]):
        start_time = row["start_date"].strftime("%Y%m%d")
    else:
        start_time = "UNKNOWN-START-DATE"

    short_uuid = generate_short_uuid()

    return f"{iso3}_{disaster_type}_{disaster_subtype}_{start_time}_{short_uuid}"


# Create the event_name column
df_filtered["event_name"] = df_filtered.apply(create_event_name, axis=1)

# Display the first few rows to verify the new event_name column
print("First few rows after adding event_name column:")
display(df_filtered[["event_name"] + df_filtered.columns[:-1].tolist()].head())

# Confirm the new column is added
print("\nUpdated column names:")
print(df_filtered.columns.tolist())


In [None]:
# Save the final df_filtered to a CSV file in the working directory.
# index=False leaves the DataFrame index out of the file. List-valued columns
# are written using their string representation.
csv_filename = "preprocessed_idmc.csv"
df_filtered.to_csv(csv_filename, index=False)
print(f"DataFrame saved to {csv_filename}")


In [None]:
from getpass import getpass

import psycopg2
from psycopg2.extras import execute_values


def insert_idmc_data(df):
    """
    Insert the preprocessed IDMC data into the events_idmc table.

    Connects to the "merge" database as user "postgres" (password and host are
    requested interactively), bulk-inserts one record per DataFrame row and
    commits. Rows that conflict on (event_name, iso3_code, start_date) are
    skipped by the database. Any error is caught and printed rather than
    raised, so callers cannot tell from the return value whether the insert
    succeeded.

    Args:
    df (pandas.DataFrame): The preprocessed IDMC data. Must contain the scalar
        columns used below; the identifier columns (GLIDE, Local_Identifier,
        IFRC_Appeal_ID, Government_Assigned_Identifier) are optional and are
        stored as NULL when absent or not a list.

    Returns:
    None
    """
    # Bound before the try so the finally block can test them even when
    # connect() itself fails (otherwise an UnboundLocalError hides the cause).
    conn = None
    cur = None

    def _list_or_none(value):
        """Identifier cells hold lists of codes; anything else (None/NaN) becomes NULL."""
        return value if isinstance(value, list) else None

    try:
        # Connect to the local PostgreSQL database
        conn = psycopg2.connect(
            dbname="merge",
            user="postgres",
            password=getpass("Enter the database password: "),
            host=input("Enter the database host: "),
        )

        # Create a cursor object
        cur = conn.cursor()

        # Prepare the data for insertion
        data = df.to_dict("records")

        # SQL query for inserting data
        insert_query = """
        INSERT INTO events_idmc (
            event_name, disaster_type, disaster_subtype, iso3_code, admin_level_0,
            admin_level_1, admin_level_2, start_date, disaster_internal_displacements,
            source, metadata, GLIDE, local_Identifier, IFRC_Appeal_ID, Government_Assigned_Identifier
        ) VALUES %s
        ON CONFLICT (event_name, iso3_code, start_date) DO NOTHING
        """

        # Convert data to tuple format for psycopg2, handling potential integer overflow
        values = [
            (
                row["event_name"],
                row["disaster_type"],
                row["disaster_subtype"],
                row["iso3_code"],
                # NaN is truthy, so test for missing explicitly before truthiness
                row["admin_level_0"]
                if pd.notna(row["admin_level_0"]) and row["admin_level_0"]
                else None,
                row["admin_level_1"],
                row["admin_level_2"],
                row["start_date"],
                int(row["disaster_internal_displacements"])
                if pd.notna(row["disaster_internal_displacements"])
                else None,
                row["source"],
                row["metadata"],
                # .get(): these columns exist only if that code type occurred in the data
                _list_or_none(row.get("GLIDE")),
                _list_or_none(row.get("Local_Identifier")),
                _list_or_none(row.get("IFRC_Appeal_ID")),
                _list_or_none(row.get("Government_Assigned_Identifier")),
            )
            for row in data
        ]

        # Execute the insert query
        execute_values(cur, insert_query, values)

        # Commit the changes
        conn.commit()

        print("Data inserted successfully into events_idmc table.")

    except (Exception, psycopg2.Error) as error:
        print(f"Error inserting data into events_idmc table: {error}")

    finally:
        # Close the cursor and connection (only those that were actually opened)
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


# Call the function to insert the data (prompts for the database password and host)
insert_idmc_data(df_filtered)

# Print a message to confirm the operation is complete.
# This line runs even when the insert failed: insert_idmc_data prints its
# errors instead of raising them.
print("IDMC data insertion process completed.")


In [None]:
# Display information about the filtered DataFrame (dtypes and non-null counts)
print("Filtered DataFrame Info:")
df_filtered.info()

# Display the first few rows of the filtered DataFrame
print("\nFirst few rows of the filtered DataFrame:")
display(df_filtered.head())

# Display summary statistics for the filtered DataFrame
# (describe() is called with its defaults)
print("\nSummary statistics for the filtered DataFrame:")
display(df_filtered.describe())

# Count non-null values for each column in the filtered DataFrame
print("\nNon-null value counts for each column:")
display(df_filtered.count())
