In [3]:
import pandas as pd
from sklearn.impute import KNNImputer

# Step 1: Load the data
df = pd.read_csv('/content/glofdatabase_V3-1.csv', encoding='latin-1')
df = df.drop(index=0)  # Drop first metadata row
df = df.reset_index(drop=True)

# Step 2: Select important columns for glacier outburst analysis
new_df = df[['Glacier', 'RGI_Glacier_Id', 'RGI_Glacier_Area', 'Lake_type',
             'Lake_area_before', 'Mean_Lake_Volume_VL', 'Mean_Flood_Volume_V0',
             'Peak_discharge_Qp', 'Mechanism', 'River', 'Impact_and_destruction']]

numerical_cols = ['RGI_Glacier_Area', 'Lake_area_before',
                  'Mean_Lake_Volume_VL', 'Mean_Flood_Volume_V0',
                  'Peak_discharge_Qp', 'Impact_and_destruction']

In [None]:
def clean_numeric_column(series):
    return pd.to_numeric(series.astype(str).str.extract(r'([\d\.\-eE]+)')[0], errors='coerce')

for col in numerical_cols:
    new_df[col] = clean_numeric_column(new_df[col])

In [None]:
knn_imputer = KNNImputer(n_neighbors=5)
new_df[numerical_cols] = knn_imputer.fit_transform(new_df[numerical_cols])

In [None]:
# Step 6: Fill missing categorical data with 'Unknown'
categorical_cols = ['Glacier', 'RGI_Glacier_Id', 'Lake_type', 'Mechanism', 'River']
new_df[categorical_cols] = new_df[categorical_cols].fillna('Unknown')

In [None]:

# Step 7: Check final cleaned dataset
print(new_df.head())
print("\nMissing values:\n", new_df.isnull().sum())

In [None]:
new_df.head(5)

In [9]:
threshold_discharge = 5000  # in m³/s
threshold_flood_volume = 100000000  # in m³

# Filter risky rivers
risky_rivers = new_df[
    (new_df['Peak_discharge_Qp'] > threshold_discharge) |
    (new_df['Mean_Flood_Volume_V0'] > threshold_flood_volume)
]

In [None]:
# Show result
print("Filtered Risky Rivers (Potential Overflow):")
print(risky_rivers[['River', 'Peak_discharge_Qp', 'Mean_Flood_Volume_V0']])

# Optional: Save to CSV
risky_rivers.to_csv("risky_rivers.csv", index=False)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    new_df[col] = le.fit_transform(new_df[col])
    label_encoders[col] = le


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = le.fit_transform(new_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = le.fit_transform(new_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = le.fit_transform(new_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [12]:
# Define the target: 1 if high risk, else 0
new_df['High_Risk'] = ((new_df['Peak_discharge_Qp'] > threshold_discharge) |
                       (new_df['Mean_Flood_Volume_V0'] > threshold_flood_volume)).astype(int)

# Features and target
X = new_df.drop(columns=['High_Risk'])
y = new_df['High_Risk']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['High_Risk'] = ((new_df['Peak_discharge_Qp'] > threshold_discharge) |


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(),
    "SVM": SVC()
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")



RandomForest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      1.00      1.00        47

    accuracy                           1.00        68
   macro avg       1.00      1.00      1.00        68
weighted avg       1.00      1.00      1.00        68

Accuracy: 1.00

LogisticRegression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      1.00      1.00        47

    accuracy                           1.00        68
   macro avg       1.00      1.00      1.00        68
weighted avg       1.00      1.00      1.00        68

Accuracy: 1.00

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.98      0.99        47

    accuracy                           0.99 

In [15]:
import joblib

best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train_scaled, y_train)

# Save model and scaler
joblib.dump(best_model, 'glacier_risk_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']