In [1]:
import pandas as pd
import os

# Load all datasets
# The data directory can be overridden through the JAARVIS_DATA_DIR
# environment variable so the notebook is not tied to a single machine;
# the original absolute path is kept as the fallback.
base_path = os.environ.get("JAARVIS_DATA_DIR", "D:/Final_year_Project/JAARVIS")
filenames = {
    "landslide": "landslide_dataset.csv",
    "glof": "glof_dataset.csv",
    "earthquake": "earthquake_dataset.csv",
    "liquefaction": "liquefaction_dataset.csv",
    "sinkhole": "sinkhole_dataset.csv",
    "tsunami": "tsunami_dataset.csv",
    "lahar": "lahar_dataset.csv"
}

# Load and store all dataframes in a dictionary keyed by hazard name
datasets_raw = {}
for hazard, file in filenames.items():
    path = os.path.join(base_path, file)
    df = pd.read_csv(path)
    datasets_raw[hazard] = df

# Display a sample (first three rows) of each dataset before cleaning
sample_pre_cleaning = {hazard: df.head(3) for hazard, df in datasets_raw.items()}
sample_pre_cleaning


{'landslide':          Date   Latitude  Longitude  Rainfall_mm  Slope_deg  Soil_Moisture_%  \
 0  2024-02-22  28.287154  83.951316       162.07      47.99            32.79   
 1  2024-02-20  28.290252  83.958072       131.97      58.43            69.67   
 2  2024-02-01  28.284380  84.001968       141.13      22.84            37.44   
 
    Landslide_Occurred  
 0                   0  
 1                   0  
 2                   0  ,
 'glof':          Date   Latitude  Longitude  Glacier_Volume_million_m3  Lake_Level_m  \
 0  2024-01-07  28.257317  83.944020                       2.24         26.44   
 1  2024-02-02  28.227553  83.995725                       1.68         31.18   
 2  2024-02-11  28.241009  83.937658                       1.53         66.84   
 
    Temperature_C  GLOF_Occurred  
 0          -2.31              0  
 1           9.81              1  
 2          -4.30              0  ,
 'earthquake':          Date   Latitude  Longitude  Magnitude  Depth_km  \
 0  2024-0

In [2]:
import pandas as pd

# Read the landslide data, parse the dates (unparseable values become NaT),
# then discard duplicate rows and every row that has a missing value.
df = (
    pd.read_csv("landslide_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

print(df.head())


        Date   Latitude  Longitude  Rainfall_mm  Slope_deg  Soil_Moisture_%  \
0 2024-02-22  28.287154  83.951316       162.07      47.99            32.79   
1 2024-02-20  28.290252  83.958072       131.97      58.43            69.67   
2 2024-02-01  28.284380  84.001968       141.13      22.84            37.44   
3 2024-01-20  28.283597  83.970322       156.58      59.62            32.85   
4 2024-01-26  28.245279  83.934194        95.59      38.91            64.16   

   Landslide_Occurred  
0                   0  
1                   0  
2                   0  
3                   1  
4                   0  


In [3]:
import pandas as pd

# Read the GLOF data, parse the dates (unparseable values become NaT),
# then discard duplicate rows and every row that has a missing value.
df = (
    pd.read_csv("glof_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

print(df.head())


        Date   Latitude  Longitude  Glacier_Volume_million_m3  Lake_Level_m  \
0 2024-01-07  28.257317  83.944020                       2.24         26.44   
1 2024-02-02  28.227553  83.995725                       1.68         31.18   
2 2024-02-11  28.241009  83.937658                       1.53         66.84   
3 2024-02-09  28.287152  83.947804                       4.80         37.58   
4 2024-01-31  28.251320  83.954450                       1.33         37.37   

   Temperature_C  GLOF_Occurred  
0          -2.31              0  
1           9.81              1  
2          -4.30              0  
3           5.48              0  
4           8.82              1  


In [4]:
import pandas as pd

# Read the earthquake data, parse the dates (unparseable values become NaT),
# then discard duplicate rows and every row that has a missing value.
df = (
    pd.read_csv("earthquake_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

print(df.head())


        Date   Latitude  Longitude  Magnitude  Depth_km  \
0 2024-01-04  28.222805  83.929838        5.6     33.43   
1 2024-03-29  28.244305  83.928089        5.6     63.30   
2 2024-03-10  28.233592  83.960235        5.0     65.53   
3 2024-03-02  28.304343  83.945386        5.9     35.00   
4 2024-03-11  28.222581  84.008844        4.0     38.82   

   Ground_Acceleration_g  Quake_Occurred  
0                  0.482               1  
1                  0.274               1  
2                  0.058               1  
3                  0.178               0  
4                  0.226               1  


In [5]:
import pandas as pd

# Read the liquefaction data, parse the dates (unparseable values become
# NaT), then discard duplicate rows and rows with a missing value.
df = (
    pd.read_csv("liquefaction_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

# One-hot encode the soil type; drop_first=True omits the first category
df = pd.get_dummies(df, columns=["Soil_Type"], drop_first=True)

# Ordinal-encode the risk label. Any label that is not a key of risk_map
# becomes NaN, and this happens after the dropna above.
risk_map = {"Low": 0, "Medium": 1, "High": 2}
df = df.assign(Liquefaction_Risk=lambda frame: frame["Liquefaction_Risk"].map(risk_map))

print(df.head())


        Date   Latitude  Longitude  Water_Table_Depth_m  \
0 2024-03-12  28.220866  83.934677                 5.24   
1 2024-03-11  28.222613  83.988876                 4.63   
2 2024-04-04  28.219996  83.927190                 7.94   
3 2024-03-25  28.267698  83.989333                 5.55   
4 2024-01-31  28.301847  84.009018                 8.91   

   Peak_Ground_Acceleration  Liquefaction_Risk  Soil_Type_Gravel  \
0                      0.38                  1             False   
1                      0.38                  2             False   
2                      0.34                  0             False   
3                      0.13                  0             False   
4                      0.14                  2             False   

   Soil_Type_Sand  Soil_Type_Silt  
0           False           False  
1            True           False  
2           False           False  
3            True           False  
4            True           False  


In [6]:
import pandas as pd

# Read the sinkhole data, parse the dates (unparseable values become NaT),
# then discard duplicate rows and every row that has a missing value.
df = (
    pd.read_csv("sinkhole_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

print(df.head())


        Date   Latitude  Longitude  Subsurface_Void_m3  Groundwater_Level_m  \
0 2024-01-29  28.278711  84.011229               63.17                12.01   
1 2024-02-02  28.231991  84.007159               39.88                 5.76   
2 2024-04-05  28.268093  83.926963              252.33                 9.37   
3 2024-01-18  28.303128  83.978624              394.70                 7.70   
4 2024-04-07  28.275757  83.949988              144.89                12.09   

   Sinkhole_Reported  
0                  1  
1                  0  
2                  0  
3                  1  
4                  0  


In [7]:
import pandas as pd

# Read the tsunami data, parse the dates (unparseable values become NaT),
# then discard duplicate rows and every row that has a missing value.
df = (
    pd.read_csv("tsunami_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

print(df.head())


        Date   Latitude  Longitude  Wave_Height_m  Distance_from_Coast_km  \
0 2024-02-08  28.281295  83.950056           5.41                    9.35   
1 2024-03-22  28.237885  83.964126           5.94                    5.48   
2 2024-01-11  28.241824  84.012953           4.02                    3.19   
3 2024-02-03  28.313800  83.974275           3.37                    4.82   
4 2024-01-26  28.292009  83.969402           6.94                    9.27   

   Coastal_Erosion_m  Alert_Issued  
0               3.07             0  
1               0.37             1  
2               1.68             1  
3               4.32             0  
4               2.06             1  


In [8]:
import pandas as pd

# Read the lahar data, parse the dates (unparseable values become NaT),
# then discard duplicate rows and every row that has a missing value.
df = (
    pd.read_csv("lahar_dataset.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
    .drop_duplicates()
    .dropna()
)

print(df.head())


        Date   Latitude  Longitude  Rainfall_mm  Volcanic_Ash_Deposit_cm  \
0 2024-04-07  28.247875  83.942157       172.82                    44.92   
1 2024-02-16  28.270636  84.014329        66.86                    73.83   
2 2024-02-08  28.303364  83.933112       128.17                    92.26   
3 2024-03-20  28.311892  84.011370        83.05                    43.06   
4 2024-02-09  28.305273  83.930432       196.73                    28.35   

   Slope_Angle_deg  Lahar_Triggered  
0            41.65                0  
1            40.05                0  
2            37.55                1  
3            38.92                1  
4            30.77                0  


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Function to scale numeric columns using MinMaxScaler
def scale_numeric_features(df, exclude=()):
    """Return a copy of ``df`` with its float64/int64 columns min-max scaled.

    Every selected column is rescaled independently by a ``MinMaxScaler``
    created with its default settings. Columns of any other dtype, and
    columns named in ``exclude``, are passed through unchanged.

    Args:
        df: DataFrame to scale. It is not modified.
        exclude: Iterable of column names to leave unscaled (for example a
            label column). Defaults to excluding nothing.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    skipped = set(exclude)
    numeric_cols = [
        col
        for col in df.select_dtypes(include=['float64', 'int64']).columns
        if col not in skipped
    ]
    # Work on a copy so the caller's frame keeps its original values.
    scaled = df.copy()
    # MinMaxScaler cannot be fitted on zero columns or zero rows; in that
    # case there is nothing to scale, so return the copy as-is.
    if not numeric_cols or scaled.empty:
        return scaled
    scaler = MinMaxScaler()
    scaled[numeric_cols] = scaler.fit_transform(scaled[numeric_cols])
    return scaled

# Function to add feature engineering (simple example per hazard)
def feature_engineering(df, hazard):
    """Attach the derived column for ``hazard`` to ``df`` and return ``df``.

    The frame is modified in place. A derived column is only added when the
    input columns it is computed from are present; when they are missing,
    or when ``hazard`` is not one of the known names, ``df`` is returned
    without any change.
    """
    def has_columns(*names):
        # True when every requested column exists in the frame
        return all(name in df.columns for name in names)

    if hazard == "landslide":
        if has_columns("Rainfall_mm", "Slope_deg"):
            df["Risk_Score"] = 0.6 * df["Rainfall_mm"] + 0.4 * df["Slope_deg"]
        return df

    if hazard == "earthquake":
        if has_columns("Magnitude", "Depth_km"):
            # The +1 keeps the divisor non-zero for a depth of 0
            inverse_depth = 1 / (df["Depth_km"] + 1)
            df["Energy_Index"] = df["Magnitude"] * inverse_depth
        return df

    if hazard == "glof":
        if has_columns("Lake_Level_m", "Temperature_C"):
            df["Melt_Risk"] = 0.5 * df["Lake_Level_m"] + 0.5 * df["Temperature_C"]
        return df

    if hazard == "sinkhole":
        if has_columns("Subsurface_Void_m3", "Groundwater_Level_m"):
            shifted_level = df["Groundwater_Level_m"] + 1
            df["Collapse_Index"] = df["Subsurface_Void_m3"] / shifted_level
        return df

    if hazard == "liquefaction":
        if has_columns("Peak_Ground_Acceleration"):
            # Falls back to a factor of 1 when the water-table column is absent
            water_depth = df.get("Water_Table_Depth_m", 1)
            df["Liquefaction_Severity"] = df["Peak_Ground_Acceleration"] * water_depth
        return df

    if hazard == "tsunami":
        if has_columns("Wave_Height_m", "Distance_from_Coast_km"):
            shifted_distance = df["Distance_from_Coast_km"] + 1
            df["Impact_Score"] = df["Wave_Height_m"] / shifted_distance
        return df

    if hazard == "lahar":
        if has_columns("Rainfall_mm", "Volcanic_Ash_Deposit_cm"):
            df["Flow_Potential"] = df["Rainfall_mm"] * df["Volcanic_Ash_Deposit_cm"]
        return df

    return df

# Apply advanced cleaning and feature engineering.
# Every hazard is processed on a new frame derived from the raw data, so
# `datasets_raw` keeps its original contents and re-running this cell gives
# the same result each time (the in-place version re-scaled and
# re-engineered frames that had already been processed).
advanced_cleaned_datasets = {}
for hazard, raw_df in datasets_raw.items():
    # Parse dates (unparseable values become NaT), then drop duplicate rows
    # and every row with a missing value.
    df = (
        raw_df
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
        .drop_duplicates()
        .dropna()
    )

    # Encode categorical if necessary
    if hazard == "liquefaction":
        if "Soil_Type" in df.columns:
            df = pd.get_dummies(df, columns=["Soil_Type"], drop_first=True)
        if "Liquefaction_Risk" in df.columns:
            # Labels outside this mapping become NaN
            risk_map = {"Low": 0, "Medium": 1, "High": 2}
            df["Liquefaction_Risk"] = df["Liquefaction_Risk"].map(risk_map)

    # Feature engineering
    df = feature_engineering(df, hazard)

    # Feature scaling
    # NOTE(review): numeric label columns (e.g. Landslide_Occurred) and the
    # coordinates are rescaled here as well — confirm this is intended.
    df = scale_numeric_features(df)

    advanced_cleaned_datasets[hazard] = df

# Display the first few rows of each cleaned dataset
sample_advanced_cleaned = {hazard: df.head(3) for hazard, df in advanced_cleaned_datasets.items()}
sample_advanced_cleaned


{'landslide':         Date  Latitude  Longitude  Rainfall_mm  Slope_deg  Soil_Moisture_%  \
 0 2024-02-22  0.729087   0.286249     0.748475   0.702628         0.066064   
 1 2024-02-20  0.760593   0.354387     0.546692   0.966397         0.992464   
 2 2024-02-01  0.700868   0.797128     0.608098   0.067206         0.182869   
 
    Landslide_Occurred  Risk_Score  
 0                 0.0    0.743960  
 1                 0.0    0.610419  
 2                 0.0    0.526354  ,
 'glof':         Date  Latitude  Longitude  Glacier_Volume_million_m3  Lake_Level_m  \
 0 2024-01-07  0.429345   0.215513                   0.384270      0.105926   
 1 2024-02-02  0.128226   0.735562                   0.258427      0.185496   
 2 2024-02-11  0.264364   0.151522                   0.224719      0.784120   
 
    Temperature_C  GLOF_Occurred  Melt_Risk  
 0       0.172740            0.0   0.087366  
 1       0.990553            1.0   0.324564  
 2       0.038462            0.0   0.627743  ,
 'earthqu

In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Function to scale numeric features
def scale_numeric_features(df, exclude=()):
    """Return a copy of ``df`` with its float64/int64 columns min-max scaled.

    Every selected column is rescaled independently by a ``MinMaxScaler``
    created with its default settings. Columns of any other dtype, and
    columns named in ``exclude``, are passed through unchanged.

    Args:
        df: DataFrame to scale. It is not modified.
        exclude: Iterable of column names to leave unscaled (for example a
            label column). Defaults to excluding nothing.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    skipped = set(exclude)
    numeric_cols = [
        col
        for col in df.select_dtypes(include=['float64', 'int64']).columns
        if col not in skipped
    ]
    # Work on a copy so the caller's frame keeps its original values.
    scaled = df.copy()
    # MinMaxScaler cannot be fitted on zero columns or zero rows; in that
    # case there is nothing to scale, so return the copy as-is.
    if not numeric_cols or scaled.empty:
        return scaled
    scaler = MinMaxScaler()
    scaled[numeric_cols] = scaler.fit_transform(scaled[numeric_cols])
    return scaled

# Feature engineering logic
def feature_engineering(df, hazard):
    """Add the derived column for ``hazard`` to ``df`` in place and return it.

    Each known hazard has exactly one derived column, computed from columns
    that must already exist in ``df`` (a missing one raises ``KeyError``).
    For any other ``hazard`` value the frame is returned unchanged.
    """
    # (hazard name, derived column, formula) — checked in this order
    recipes = (
        ("landslide", "Risk_Score",
         lambda d: 0.6 * d["Rainfall_mm"] + 0.4 * d["Slope_deg"]),
        ("earthquake", "Energy_Index",
         lambda d: d["Magnitude"] * (1 / (d["Depth_km"] + 1))),
        ("glof", "Melt_Risk",
         lambda d: 0.5 * d["Lake_Level_m"] + 0.5 * d["Temperature_C"]),
        ("sinkhole", "Collapse_Index",
         lambda d: d["Subsurface_Void_m3"] / (d["Groundwater_Level_m"] + 1)),
        ("liquefaction", "Liquefaction_Severity",
         lambda d: d["Peak_Ground_Acceleration"] * d["Water_Table_Depth_m"]),
        ("tsunami", "Impact_Score",
         lambda d: d["Wave_Height_m"] / (d["Distance_from_Coast_km"] + 1)),
        ("lahar", "Flow_Potential",
         lambda d: d["Rainfall_mm"] * d["Volcanic_Ash_Deposit_cm"]),
    )
    for known_hazard, target_column, formula in recipes:
        if hazard == known_hazard:
            df[target_column] = formula(df)
            break
    return df

# Hazard names; each one is read from "<hazard>_dataset.csv" and written
# back as "cleaned_<hazard>_dataset.csv" in the working directory.
datasets = [
    "landslide",
    "glof",
    "earthquake",
    "liquefaction",
    "sinkhole",
    "tsunami",
    "lahar",
]

for hazard in datasets:
    # Load, parse dates (unparseable values become NaT), then drop
    # duplicate rows and rows with a missing value.
    df = (
        pd.read_csv(f"{hazard}_dataset.csv")
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors='coerce'))
        .drop_duplicates()
        .dropna()
    )

    # Liquefaction is the only dataset with categorical columns:
    # one-hot encode the soil type and ordinal-encode the risk label.
    if hazard == "liquefaction":
        df = pd.get_dummies(df, columns=["Soil_Type"], drop_first=True)
        df["Liquefaction_Risk"] = df["Liquefaction_Risk"].map({"Low": 0, "Medium": 1, "High": 2})

    # Derived feature first, then scaling of the numeric columns
    df = scale_numeric_features(feature_engineering(df, hazard))

    # Save cleaned version
    df.to_csv(f"cleaned_{hazard}_dataset.csv", index=False)
    print(f"{hazard.title()} cleaned and saved!")



Landslide cleaned and saved!
Glof cleaned and saved!
Earthquake cleaned and saved!
Liquefaction cleaned and saved!
Sinkhole cleaned and saved!
Tsunami cleaned and saved!
Lahar cleaned and saved!
