In [1]:
#run this in colab to downgrade sklearn
!pip install scikit-learn==1.3.1




In [2]:
#mount drive and change to the appropriate directory
from google.colab import drive
drive.mount('/content/drive/')

%cd drive/My Drive/mml_flood/

Mounted at /content/drive/
/content/drive/My Drive/mml_flood


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
#split training and testing
from sklearn.model_selection import train_test_split
import utils


In [9]:
def attach_target(x_df, y_master, disaster, next_n):
    """
    Return a copy of x_df with a binary 'target_<disaster>_<next_n>' column attached.

    For every (grid_id, year) row of x_df the target is
      * 1.0 if y_master[disaster + '_bin'] equals 1 for the same grid_id in any
        of the years year+1 .. year+next_n,
      * 0.0 otherwise.
    Rows for which y_master has no record at (grid_id, year + next_n), i.e. the
    last year of the look-ahead window is not covered by the labels, are
    dropped. A missing record for an intermediate year inside the window is
    treated as "no disaster".

    Parameters
    ----------
    x_df : pandas.DataFrame
        Feature table with at least 'grid_id' and 'year' columns. It is NOT
        modified; the result is a new frame.
    y_master : pandas.DataFrame
        Label table with 'grid_id', 'year' and disaster + '_bin' columns.
    disaster : str
        Disaster name used to pick the label column and to name the target.
    next_n : int
        Number of future years to look ahead.

    Returns
    -------
    pandas.DataFrame
        Copy of x_df restricted to rows with a defined target, with the extra
        float target column appended. The original index labels are kept.
    """
    label_col = disaster + '_bin'
    target_col = 'target_' + disaster + '_' + str(next_n)

    # {(grid_id, year): label} lookup, built column-wise instead of with
    # iterrows(); for duplicated keys the last row wins, as in a dict literal.
    disaster_lookup = dict(
        zip(zip(y_master['grid_id'], y_master['year']), y_master[label_col])
    )

    def _target_for(grid_id, year):
        # The window must end inside the labelled period, otherwise the target
        # is undefined for this row.
        if (grid_id, year + next_n) not in disaster_lookup:
            return np.nan
        # Any positive label in year+1 .. year+next_n makes the target positive.
        hit = any(
            disaster_lookup.get((grid_id, year + offset), 0) == 1
            for offset in range(1, next_n + 1)
        )
        return 1.0 if hit else 0.0

    targets = [
        _target_for(grid_id, year)
        for grid_id, year in zip(x_df['grid_id'], x_df['year'])
    ]

    # Work on a copy so the caller's frame is left untouched; assigning an
    # array is positional, so it is also safe with a non-unique index.
    out = x_df.copy()
    out[target_col] = np.array(targets, dtype=float)

    # Drop rows whose target could not be determined
    return out.dropna(subset=[target_col])

# #attach target for a particular disaster for the next n years, using y_master
# #next_n is how we choose the next n-periods for the prediction target
# def attach_target_old(x_df, y_master, disaster, next_n):
#     y = y_master.copy()
#     #shift years
#     y['year'] = y['year'] - next_n
#     #keep for particular disaster
#     y = y[['grid_id','year',disaster+'_bin']]
#     # Rename into target
#     y = y.rename(columns={disaster +'_bin': 'target_' + disaster + '_'+ str(next_n)})
#     xy_df = pd.merge(x_df, y, on = ['grid_id','year'], how='inner')
#     return xy_df


In [10]:
# Read the feature table (x_stat.csv) and the master label table (y_master.csv),
# then print their shapes as a quick sanity check.
x_df = pd.read_csv('data/xy_df/x_stat.csv')
y_master = pd.read_csv('data/xy_df/y_master.csv')
print(x_df.shape, y_master.shape)

(48970, 28) (166793, 10)


In [None]:
#Riley: you can attach other features here using some of the old code, but be careful with merging, and name the features with the 'nlp_' and 'era_' prefixes
# check that there are no redundant features

# #attach nlp to xy_df
# def attach_nlp(xy_df, df_nlp):
#     #correct formatting for df_nlp
#     df_nlp = string_to_tuple(df_nlp, 'grid_id')
#     #drop text, location and label columns
#     df_nlp = df_nlp.drop(['location','txt','label','flood_ct_x'], axis=1)
#     #add prefix
#     df_nlp = df_nlp.rename(columns={c: 'nlp_' + str(c) for c in df_nlp.columns if c not in ['grid_id']})
#     #merge
#     xy_df_out = pd.merge(xy_df, df_nlp , on='grid_id', how='left')
#     print('shape of xy_df with nlp features', xy_df_out.shape)
#     return xy_df_out

# #format function for above
# def string_to_tuple(df, col):
#     try:
#         df[col] = df.apply(lambda row: eval(row[col]), axis=1)
#     except:
#         'error converting to tuple'
#     return df

# #attach era features
# def attach_era(xy_df, df_era):
#     #add prefix
#     df_era = df_era.rename(columns={c: 'era_' + c for c in df_era.columns if c not in ['grid_id','year']})
#     xy_df_out = pd.merge(xy_df, df_era, on=['grid_id','year'], how='left')
#     print('shape of xy_df with era features', xy_df_out.shape)
#     return xy_df_out



In [None]:
# Read the CSV file named 'x'
# df_nlp = pd.read_csv('data/nlp/df_cls_transfer128.csv')

# Read era features

In [6]:
# Build xy_df: the features plus the flood target for the chosen look-ahead horizon
n_pred = 5  # number of future years scanned by attach_target (its next_n argument)

#Riley: attach NLP and ERA features here

# x_df = x_df.loc[x_df['year']>=1979] #crop to after 1979
xy_df = attach_target(x_df, y_master, disaster='flood', next_n=n_pred)

# Row count, and the sum of each target column divided by the row count
print('length of xy_df', len(xy_df))
print('imbalance', xy_df.filter(regex='target').sum()/len(xy_df))

# Show the resulting column names as the cell output
xy_df.columns

length of xy_df 44820
imbalance target_flood_5    0.243351
dtype: float64


Index(['grid_id', 'year', 'stat_flood_amt', 'stat_storm_amt',
       'stat_earthquake_amt', 'stat_extreme temperature _amt',
       'stat_landslide_amt', 'stat_volcanic activity_amt', 'stat_drought_amt',
       'stat_mass movement (dry)_amt', 'stat_flood_ct', 'stat_storm_ct',
       'stat_earthquake_ct', 'stat_extreme temperature _ct',
       'stat_landslide_ct', 'stat_volcanic activity_ct', 'stat_drought_ct',
       'stat_mass movement (dry)_ct', 'stat_flood_bin', 'stat_storm_bin',
       'stat_earthquake_bin', 'stat_extreme temperature _bin',
       'stat_landslide_bin', 'stat_volcanic activity_bin', 'stat_drought_bin',
       'stat_mass movement (dry)_bin', 'stat_lat', 'stat_lon',
       'target_flood_5'],
      dtype='object')

In [8]:
# Random-split baseline
results = {}

# Target columns are those whose name matches 'target';
# features are the remaining columns, restricted to numeric dtypes.
target_cols = xy_df.filter(regex='target').columns
y = xy_df[target_cols]
x = xy_df.drop(columns=target_cols).select_dtypes(['number'])

# Random 70/30 split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print("data imbalance train", y_train.sum()/len(y_train))
print("data imbalance test", y_test.sum()/len(y_test))

# Fit and score through the project helpers in utils (their internals are not shown here)
y_pred, y_pred_prob = utils.run_xgb(x_train, y_train, x_test)
results['stats only random split'] = utils.get_scores_clf(y_test, y_pred_prob)


data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8563974915055976
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.6193620013919782 0.7
auc, f1, accu, accu_bl, precision, recall=  0.8162112347894114 0.6193620013919782 0.6326788636025584 0.73467759079584 0.3781665059592997 0.9304347826086956
[[5511 4715]
 [ 224 2996]]


In [None]:
#Riley:
# - fix the linking problem of NLP features (pls use the old .pkl let's make sure we are using as raw as possible)
# - attach era features
# - compute results for random split, and non-random split: using the NEW attach_target function here
# - for each, record the data imbalance issues.


# - run results training using ALL data, and compute the locations with the highest risks of flooding with n_pred = 3, 5, that is 2021 and 2023 -> this is the closest to a "live" prediction we can do using our data.
# - let's think about how to visualize the live.
# - we should discuss the paper -> given current results, the paper is not very strong... option 1: we don't submit, keep working on it. option 2: we write something about the observations and try submit.
