In [None]:
%cd ..
%pip install scikit-learn==1.3.1 
%pip install -r requirements.txt

In [3]:
import pandas as pd
import numpy as np
# import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
#split training and testing
from sklearn.model_selection import train_test_split
import utils



In [4]:
def attach_target(x_df, y_master, disaster, next_n):
    """
    Attach a 'target' column to x_df based on disaster data in y_master.
    If no data exists in y_master for a given grid_id and year, set the target to NaN.
    Look up from next 1 to next_n years, if there is a major flood occuring.
    """
    
    x_df = x_df.copy()
    # Create a dictionary for fast lookup: {(grid_id, year): disaster_value}
    disaster_lookup = {
        (row['grid_id'], row['year']): row[disaster+'_bin']
        for _, row in y_master.iterrows()
    }

    # Initialize a 'target' column in x_df
    target_col = 'target_' + disaster + '_' + str(next_n)
    x_df[target_col] = np.nan  # Default to NaN

    # Iterate over x_df rows
    for idx, row in x_df.iterrows():
        grid_id = row['grid_id']
        year = row['year']

        # Check if (grid_id, year + next_n + 1) exists in y_master
        if (grid_id, year + next_n ) not in disaster_lookup:
            # No data found for (grid_id, year + next_n + 1), skip this row
            continue

        # Check years from year+1 to year+next_n
        target_found = 0
        for i in range(1, next_n + 1):
            future_year = year + i
            if disaster_lookup.get((grid_id, future_year), 0) == 1:
                target_found = 1
                break

        # Update the 'target' column
        x_df.at[idx, target_col] = target_found
    # Drop rows where 'target' is NaN
    x_df = x_df.dropna(subset=[target_col])

    return x_df

# #attach target for a particular disease for next n years, using y_master
# #next_n is how we choose the next n-periods for the prediction target
# def attach_target_old(x_df, y_master, disaster, next_n):
#     y = y_master.copy()
#     #shift years
#     y['year'] = y['year'] - next_n
#     #keep for particular disaster
#     y = y[['grid_id','year',disaster+'_bin']]
#     # Rename into target
#     y = y.rename(columns={disaster +'_bin': 'target_' + disaster + '_'+ str(next_n)})
#     xy_df = pd.merge(x_df, y, on = ['grid_id','year'], how='inner')
#     return xy_df


In [5]:
#Read data
x_df = pd.read_csv('data/testing/x_stat.csv')  # Set index=False to avoid saving the index as a column
y_master = pd.read_csv('data/testing/y_master.csv')
print(x_df.shape, y_master.shape)

(48970, 29) (166793, 11)


In [6]:
x_stat = pd.read_csv('data/testing/x_stat.csv')
x_nlp = pd.read_csv('data/testing/x_nlp.csv')
x_era = pd.read_csv('data/testing/x_era.csv')
x_full = pd.read_csv('data/testing/x_full.csv')

In [56]:
from sklearn import metrics

def get_scores_clf(y_true, y_pred_prob):
    # Get F1 scores at different thresholds
    f1_scores = []
    thres_list = [0.4, 0.5, 0.6, 0.7]
    
    for thres in thres_list:
        y_pred = (y_pred_prob >= thres).astype(int)
        f1 = metrics.f1_score(y_true, y_pred, average='macro')
        f1_scores.append(f1)
    
    # Find best threshold
    max_f1 = max(f1_scores)
    max_thres = thres_list[f1_scores.index(max_f1)]
    # print('Max F1 Score:', max_f1, 'at threshold:', max_thres)

    # Get y_pred using the best threshold
    y_pred = (y_pred_prob >= max_thres).astype(int)

    # Compute metrics
    accu = metrics.accuracy_score(y_true, y_pred)
    accu_bl = metrics.balanced_accuracy_score(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_pred_prob)  # No need for `multi_class='ovo'` unless multi-class
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)

    # print(f'AUC: {auc:.4f}, F1: {max_f1:.4f}, Accuracy: {accu:.4f}, Balanced Acc: {accu_bl:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    # print('Confusion Matrix:\n', metrics.confusion_matrix(y_true, y_pred))
    
    return auc, max_f1, accu, accu_bl, precision, recall


### New Chronological Split n=1

In [37]:
n_pred = 1

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
x = x.filter(regex='stat|year')
y = xy_df.filter(regex='target')  # Keep only target columns

In [38]:
x.head(5)

Unnamed: 0,year,stat_flood_amt,stat_storm_amt,stat_earthquake_amt,stat_extreme temperature _amt,stat_landslide_amt,stat_volcanic activity_amt,stat_drought_amt,stat_mass movement (dry)_amt,stat_flood_ct,...,stat_flood_bin,stat_storm_bin,stat_earthquake_bin,stat_extreme temperature _bin,stat_landslide_bin,stat_volcanic activity_bin,stat_drought_bin,stat_mass movement (dry)_bin,stat_lat,stat_lon
0,1960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46,168
1,1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46,168
2,1962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46,168
3,1963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46,168
4,1964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46,168


In [39]:
years = sorted(x["year"].unique())
X = x


In [54]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,roc_auc_score  
from sklearn import metrics

verbose = False
results = []
aucs = []
acc_bls = []
for i in range(45,len(years) - 1):
    train_years = years[: i + 1]
    test_year = years[i + 1]
    
    X_train = X[x["year"].isin(train_years)]
    y_train = y[x["year"].isin(train_years)]
    
    X_test = X[x["year"] == test_year]
    y_test = y[x["year"] == test_year]
    
    model = XGBClassifier(
		n_estimators=100,  
		max_depth=4,       
		random_state=42,
	)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append(acc)
    
    y_prob = model.predict_proba(X_test)[:, 1]  

    auc = roc_auc_score(y_test, y_prob)
    aucs.append(auc)
    
    
    auc, max_f1, accu, accu_bl, precision, recall = get_scores_clf(y_test, y_prob)  
    acc_bls.append(accu_bl)
    if verbose:
    	print(f"Train up to {train_years[-1]}, test on {test_year}: AUC = {auc:.4f}, Balanced Acc = {accu_bl:.4f}\n")

	

# Compute average performance
avg_acc = np.mean(results)
print(f"\nAverage Accuracy over {len(results)} iterations: {avg_acc:.4f}")

avg_auc = np.mean(auc)
print(f"\nAverage AUC over {len(results)} iterations: {avg_auc:.4f}")

avg_bl = np.mean(acc_bls)
print(f"\nAverage accu_bl over {len(results)} iterations: {avg_bl:.4f}")



Average Accuracy over 12 iterations: 0.8755

Average AUC over 12 iterations: 0.6027

Average accu_bl over 12 iterations: 0.5143


### New Chronological Split n=2

In [62]:
n_pred = 2

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
x = x.filter(regex='stat|year')
y = xy_df.filter(regex='target')  # Keep only target columns

In [63]:
years = sorted(x["year"].unique())
X = x

In [58]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,roc_auc_score  
from sklearn import metrics

verbose = False
results = []
aucs = []
acc_bls = []
for i in range(45,len(years) - 1):
    train_years = years[: i + 1]
    test_year = years[i + 1]
    
    X_train = X[x["year"].isin(train_years)]
    y_train = y[x["year"].isin(train_years)]
    
    X_test = X[x["year"] == test_year]
    y_test = y[x["year"] == test_year]
    
    model = XGBClassifier(
		n_estimators=100,  
		max_depth=4,       
		random_state=42,
	)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append(acc)
    
    y_prob = model.predict_proba(X_test)[:, 1]  

    auc = roc_auc_score(y_test, y_prob)
    aucs.append(auc)
    
    
    auc, max_f1, accu, accu_bl, precision, recall = get_scores_clf(y_test, y_prob)  
    acc_bls.append(accu_bl)
    if verbose:
    	print(f"Train up to {train_years[-1]}, test on {test_year}: AUC = {auc:.4f}, Balanced Acc = {accu_bl:.4f}\n")

	

# Compute average performance
avg_acc = np.mean(results)
print(f"\nAverage Accuracy over {len(results)} iterations: {avg_acc:.4f}")

avg_auc = np.mean(auc)
print(f"\nAverage AUC over {len(results)} iterations: {avg_auc:.4f}")

avg_bl = np.mean(acc_bls)
print(f"\nAverage accu_bl over {len(results)} iterations: {avg_bl:.4f}")



Average Accuracy over 11 iterations: 0.7802

Average AUC over 11 iterations: 0.6530

Average accu_bl over 11 iterations: 0.5435


### New Chrono Split n=5

In [59]:
n_pred = 5

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
x = x.filter(regex='stat|year')
y = xy_df.filter(regex='target')  # Keep only target columns

In [60]:
years = sorted(x["year"].unique())
X = x

In [61]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,roc_auc_score  
from sklearn import metrics

verbose = False
results = []
aucs = []
acc_bls = []
for i in range(45,len(years) - 1):
    train_years = years[: i + 1]
    test_year = years[i + 1]
    
    X_train = X[x["year"].isin(train_years)]
    y_train = y[x["year"].isin(train_years)]
    
    X_test = X[x["year"] == test_year]
    y_test = y[x["year"] == test_year]
    
    model = XGBClassifier(
		n_estimators=100,  
		max_depth=4,       
		random_state=42,
	)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append(acc)
    
    y_prob = model.predict_proba(X_test)[:, 1]  

    auc = roc_auc_score(y_test, y_prob)
    aucs.append(auc)
    
    
    auc, max_f1, accu, accu_bl, precision, recall = get_scores_clf(y_test, y_prob)  
    acc_bls.append(accu_bl)
    if verbose:
    	print(f"Train up to {train_years[-1]}, test on {test_year}: AUC = {auc:.4f}, Balanced Acc = {accu_bl:.4f}\n")

	

# Compute average performance
avg_acc = np.mean(results)
print(f"\nAverage Accuracy over {len(results)} iterations: {avg_acc:.4f}")

avg_auc = np.mean(auc)
print(f"\nAverage AUC over {len(results)} iterations: {avg_auc:.4f}")

avg_bl = np.mean(acc_bls)
print(f"\nAverage accu_bl over {len(results)} iterations: {avg_bl:.4f}")



Average Accuracy over 8 iterations: 0.5994

Average AUC over 8 iterations: 0.6298

Average accu_bl over 8 iterations: 0.5841


### n = 1, random split

In [64]:
n_pred = 1

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns


In [74]:
### STAT ONLY 
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train1 = x_train1.filter(regex='stat|year')
x_test1 = x_test1.filter(regex='stat|year')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 stats only random split'] = utils.get_scores_clf(y_test1, y_pred_prob)

data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7911262494579957
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5698252150104206 0.6
auc, f1, accu, accu_bl, precision, recall=  0.7525487631871196 0.5426665221788759 0.8769561002631214 0.5907154032189088 0.08909886453115956 0.26521239954075776
[[12434  1137]
 [  640   231]]


In [46]:
### NLP ONLY 
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first
x_train1 = x_train1.filter(regex='nlp')
x_test1 = x_test1.filter(regex='nlp')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 nlp only random split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6713414097619406
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.5466504015418363 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5369651349313865 0.5466504015418363 0.8944744495222269 0.5479274244287876 0.07341540970993071 0.15384615384615385
[[12784   787]
 [  737   134]]


In [49]:
### WEATHER ONLY
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first
x_train1 = x_train1.filter(regex='era_')
x_test1 = x_test1.filter(regex='era_')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 era only random split'] = utils.get_scores_clf(y_test1, y_pred_prob)

data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9834871637822662
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5169620951127145 0.5
auc, f1, accu, accu_bl, precision, recall=  0.572730854380597 0.48661111102779125 0.9133084060379448 0.5149720299947353 0.0634174842318748 0.06199770378874857
[[13136   435]
 [  817    54]]


In [None]:
### WEATHER + NLP
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first
x_train1 = x_train1.filter(regex='nlp|era_')
x_test1 = x_test1.filter(regex='nlp|era_')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
# y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
# results['n=1 weather + nlp random split'] = utils.get_scores_clf(y_test1, y_pred_prob)
x_train1


data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9828030575691103
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5268744042133072 0.5
auc, f1, accu, accu_bl, precision, recall=  0.5816528474093937 0.4877150937297996 0.91088491898629 0.5233522873832489 0.06595292505727865 0.08266360505166476
[[13083   488]
 [  799    72]]


In [69]:
### STATS + NLP
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first
x_train1 = x_train1.filter(regex='stat|nlp')
x_test1 = x_test1.filter(regex='stat|nlp')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 stat + nlp random split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7000428026211769
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5574530315529511 0.6
auc, f1, accu, accu_bl, precision, recall=  0.548130464256488 0.530418864868458 0.9121312837557125 0.5508759434266745 0.07848007482625628 0.14006888633754305
[[13051   520]
 [  749   122]]


In [72]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first
x_train1 = x_train1.filter(regex='stat|era')
x_test1 = x_test1.filter(regex='stat|era')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 stat + era random split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8702576843580211
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5435539601803967 0.5
auc, f1, accu, accu_bl, precision, recall=  0.6018614014604147 0.5061882051401289 0.8930895997784241 0.5450417208775957 0.07210502285208106 0.14925373134328357
[[12768   803]
 [  741   130]]


In [73]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 all vars random split'] = utils.get_scores_clf(y_test1, y_pred_prob)

data imbalance train target_flood_1    0.063387
dtype: float64
data imbalance test target_flood_1    0.06031
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8817620087043468
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5676250250846085 0.6
auc, f1, accu, accu_bl, precision, recall=  0.7444174833873236 0.533746139219117 0.8885888381110649 0.5775655287778922 0.08551068412237006 0.22388059701492538
[[12638   933]
 [  676   195]]


### n = 1, chrono split

In [160]:
n_pred = 1

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns

split_year = 2015
years = x['year']

train_mask = years <= split_year
test_mask = years > split_year

x_train1, x_test1 = x[train_mask], x[test_mask]
y_train1, y_test1 = y[train_mask], y[test_mask]


In [162]:
y_test1

Unnamed: 0,target_flood_1
56,0.0
57,0.0
115,0.0
116,0.0
174,0.0
...,...
48850,0.0
48908,0.0
48909,0.0
48967,0.0


In [None]:
### STAT ONLY 
x_train1_stat = x_train1.filter(regex='stat|year')
x_test1_stat = x_test1.filter(regex='stat|year')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1_stat, y_train1, x_test1_stat)
results['n=1 stats only chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)

data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7910331133816664
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5364854802680565 0.6
auc, f1, accu, accu_bl, precision, recall=  0.5970479423318966 0.5327891922319167 0.772289156626506 0.5662305961325756 0.11300065986847377 0.31210191082802546
[[1233  270]
 [ 108   49]]


In [114]:
### NLP ONLY 
x_train1_nlp = x_train1.filter(regex='nlp')
x_test1_nlp = x_test1.filter(regex='nlp')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1_nlp, y_train1, x_test1_nlp)
results['n=1 nlp only chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6546993497955295
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.09336053851689241 0.7
auc, f1, accu, accu_bl, precision, recall=  0.4877273054739778 0.09336053851689241 0.09698795180722891 0.43288158290637413 0.08452548667910544 0.8471337579617835
[[  28 1475]
 [  24  133]]


In [116]:
### WEATHER ONLY
x_train1_era = x_train1.filter(regex='era_')
x_test1_era = x_test1.filter(regex='era_')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1_era, y_train1, x_test1_era)
results['n=1 era only chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)

data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9518395157042275
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.4148354090481343 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5388077348487739 0.4148354090481343 0.5150602409638554 0.5240029495149827 0.0990527114076388 0.535031847133758
[[771 732]
 [ 73  84]]


In [119]:
### WEATHER + NLP
x_train1_wn = x_train1.filter(regex='nlp|era_')
x_test1_wn = x_test1.filter(regex='nlp|era_')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1_wn, y_train1, x_test1_wn)
results['n=1 weather + nlp chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9527875516202837
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.41345612222426215 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5225175975013879 0.41345612222426215 0.5234939759036145 0.5029918930716062 0.09509660245035778 0.47770700636942676
[[794 709]
 [ 82  75]]


In [120]:
### STATS + NLP
x_train1_sn = x_train1.filter(regex='stat|nlp')
x_test1_sn = x_test1.filter(regex='stat|nlp')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1_sn, y_train1, x_test1_sn)
results['n=1 stat + nlp chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6780925727107133
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5445568760611205 0.5
auc, f1, accu, accu_bl, precision, recall=  0.5087553131528875 0.4936779059449867 0.8174698795180723 0.5541041060130271 0.11075751612773774 0.22929936305732485
[[1321  182]
 [ 121   36]]


In [121]:
x_train1_se = x_train1.filter(regex='stat|era')
x_test1_se = x_test1.filter(regex='stat|era')
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1_se, y_train1, x_test1_se)
results['n=1 stat + era chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)


data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8435580459284815
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5400376437226346 0.6
auc, f1, accu, accu_bl, precision, recall=  0.5673027617800492 0.4900098311357853 0.8662650602409638 0.5354174877421378 0.1067946033451367 0.12738853503184713
[[1418   85]
 [ 137   20]]


In [122]:
print("data imbalance train", y_train1.sum()/len(y_train1))
print("data imbalance test", y_test1.sum()/len(y_test1))
y_pred, y_pred_prob = utils.run_xgb(x_train1, y_train1, x_test1)
results['n=1 all vars chrono split'] = utils.get_scores_clf(y_test1, y_pred_prob)

data imbalance train target_flood_1    0.061317
dtype: float64
data imbalance test target_flood_1    0.094578
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8570720894604147
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.516484403839055 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5655186442401821 0.516484403839055 0.8783132530120482 0.5164024392827933 0.09945480181014962 0.07006369426751592
[[1447   56]
 [ 146   11]]


### n = 2, random split

In [142]:
n_pred = 2

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first


In [144]:
y_test2

Unnamed: 0,target_flood_2
5531,0.0
13931,0.0
28402,0.0
37898,0.0
31050,0.0
...,...
15118,0.0
13222,0.0
4018,0.0
22448,0.0


In [77]:
### STAT ONLY 
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train2 = x_train2.filter(regex='stat|year')
x_test2 = x_test2.filter(regex='stat|year')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 stats only random split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8327027473086133
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.6021038607830143 0.7
auc, f1, accu, accu_bl, precision, recall=  0.7696441336855753 0.6021038607830143 0.7648840977946875 0.6639879341981094 0.19242711190723472 0.5320167564332735
[[9967 2555]
 [ 782  889]]


In [78]:
### NLP ONLY 
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train2 = x_train2.filter(regex='nlp')
x_test2 = x_test2.filter(regex='nlp')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 nlp only random split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6707777295889287
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5631960378939944 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5569996447186525 0.5631960378939944 0.8269569506094554 0.5604446407715599 0.14302304134590854 0.2118491921005386
[[11383  1139]
 [ 1317   354]]


In [79]:
### WEATHER ONLY
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train2 = x_train2.filter(regex='era_')
x_test2 = x_test2.filter(regex='era_')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 era only random split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9528539329527689
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.541832744494412 0.6
auc, f1, accu, accu_bl, precision, recall=  0.5805819818161329 0.5042268743606272 0.8073698301979849 0.5423432855122919 0.13196365530396267 0.19569120287253142
[[11132  1390]
 [ 1344   327]]


In [80]:
### WEATHER + NLP
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train2 = x_train2.filter(regex='nlp|era_')
x_test2 = x_test2.filter(regex='nlp|era_')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 weather + nlp random split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9458940702768001
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.553399578196922 0.6
auc, f1, accu, accu_bl, precision, recall=  0.5896793874976332 0.5217325321913383 0.8002536461636017 0.5580165981481211 0.13867621082462123 0.24117295032914424
[[10955  1567]
 [ 1268   403]]


In [81]:
### STATS + NLP
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train2 = x_train2.filter(regex='stat|nlp')
x_test2 = x_test2.filter(regex='stat|nlp')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 stat + nlp random split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6842873782519863
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5672255778630952 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5711314692962648 0.5672255778630952 0.8505601352779539 0.5582635602632007 0.1468530964170436 0.17594254937163376
[[11778   744]
 [ 1377   294]]


In [82]:
### STATS + WEATHER
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train2 = x_train2.filter(regex='stat|era_')
x_test2 = x_test2.filter(regex='stat|era_')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 stat + weather random split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8127488503690614
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5592267085548714 0.6
auc, f1, accu, accu_bl, precision, recall=  0.5963065746357028 0.5559794276712192 0.8062425139153103 0.562966378455785 0.14168898448558792 0.24476361460203472
[[11034  1488]
 [ 1262   409]]


In [83]:
### ALL VARS
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 all vars random split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.114684
dtype: float64
data imbalance test target_flood_2    0.117734
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9455553518799874
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.597255060651974 0.7
auc, f1, accu, accu_bl, precision, recall=  0.7624891621028259 0.597255060651974 0.8218135700697526 0.6039430924732255 0.1685438687532862 0.3189706762417714
[[11131  1391]
 [ 1138   533]]


### n = 2, chrono split

In [152]:
n_pred = 2

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns

split_year = 2013
years = x['year']

train_mask = years <= split_year
test_mask = years > split_year

x_train2, x_test2 = x[train_mask], x[test_mask]
y_train2, y_test2 = y[train_mask], y[test_mask]


In [153]:
### STAT ONLY 
x_train2_stat = x_train2.filter(regex='stat|year')
x_test2_stat = x_test2.filter(regex='stat|year')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2_stat, y_train2, x_test2_stat)
results['n=2 stats only chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7963883426026674
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.19859023806357232 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5559340680356192 0.19859023806357232 0.22248995983935743 0.5085560372377099 0.20318224720742303 0.9859719438877755
[[  62 1929]
 [   7  492]]


In [154]:
### NLP ONLY 
x_train2_nlp = x_train2.filter(regex='nlp')
x_test2_nlp = x_test2.filter(regex='nlp')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2_nlp, y_train2, x_test2_nlp)
results['n=2 nlp only chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6608825348578294
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.4997429009139511 0.7
auc, f1, accu, accu_bl, precision, recall=  0.4915224723681416 0.4997429009139511 0.6044176706827309 0.5153607063448846 0.20567784701140499 0.3667334669338677
[[1322  669]
 [ 316  183]]


In [155]:
### WEATHER ONLY
x_train2_era = x_train2.filter(regex='era_')
x_test2_era = x_test2.filter(regex='era_')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2_era, y_train2, x_test2_era)
results['n=2 era only chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7779844335776264
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.27815555231285566 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5384792689346548 0.27815555231285566 0.2819277108433735 0.5074287198203539 0.2028148138542508 0.8837675350701403
[[ 261 1730]
 [  58  441]]


In [156]:
### WEATHER + NLP
x_train2_wn = x_train2.filter(regex='nlp|era_')
x_test2_wn = x_test2.filter(regex='nlp|era_')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2_wn, y_train2, x_test2_wn)
results['n=2 weather + nlp chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8945316667886869
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.3412901487843572 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5241145273973362 0.3412901487843572 0.3417670682730924 0.5080542803336456 0.20302588327553922 0.7855711422845691
[[ 459 1532]
 [ 107  392]]


In [157]:
### STATS + NLP
x_train2_sn = x_train2.filter(regex='stat|nlp')
x_test2_sn = x_test2.filter(regex='stat|nlp')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2_sn, y_train2, x_test2_sn)
results['n=2 stat + nlp chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6762226129676429
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5456656603523206 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5156948754364581 0.5456656603523206 0.7614457831325301 0.5437212949253605 0.2232841733119397 0.18036072144288579
[[1806  185]
 [ 409   90]]


In [158]:
x_train2_se = x_train2.filter(regex='stat|era')
x_test2_se = x_test2.filter(regex='stat|era')
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2_se, y_train2, x_test2_se)
results['n=2 stat + era chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)


data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7936291197624896
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.48962573963773043 0.7
auc, f1, accu, accu_bl, precision, recall=  0.523044079117552 0.48962573963773043 0.5795180722891566 0.5118046238131713 0.20437275376040548 0.39879759519038077
[[1244  747]
 [ 300  199]]


In [159]:
print("data imbalance train", y_train2.sum()/len(y_train2))
print("data imbalance test", y_test2.sum()/len(y_test2))
y_pred, y_pred_prob = utils.run_xgb(x_train2, y_train2, x_test2)
results['n=2 all vars chrono split'] = utils.get_scores_clf(y_test2, y_pred_prob)

data imbalance train target_flood_2    0.110888
dtype: float64
data imbalance test target_flood_2    0.200402
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8415880098659712
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.3923894372700348 0.7
auc, f1, accu, accu_bl, precision, recall=  0.523195059128805 0.3923894372700348 0.40602409638554215 0.4934212976430007 0.19832738883876738 0.6392785571142284
[[ 692 1299]
 [ 180  319]]


### n = 5, random split

In [87]:
n_pred = 5

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first


In [88]:
### STAT ONLY 
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train5 = x_train5.filter(regex='stat|year')
x_test5 = x_test5.filter(regex='stat|year')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 stats only random split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8563974997436457
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.6193620013919782 0.7
auc, f1, accu, accu_bl, precision, recall=  0.8162112347894114 0.6193620013919782 0.6326788636025584 0.73467759079584 0.3781665059592997 0.9304347826086956
[[5511 4715]
 [ 224 2996]]


In [None]:
### NLP ONLY 
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train5 = x_train5.filter(regex='nlp')
x_test5 = x_test5.filter(regex='nlp')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 nlp only random split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6613184142357336
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.4256515345607714 0.7
auc, f1, accu, accu_bl, precision, recall=  0.6045099387385461 0.4256515345607714 0.42867767365759335 0.5365151914557097 0.2538511203363307 0.7434782608695653
[[3370 6856]
 [ 826 2394]]


In [None]:
### WEATHER ONLY
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train5 = x_train5.filter(regex='era_')
x_test5 = x_test5.filter(regex='era_')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 era only random split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9101930433320391
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5124947246751776 0.7
auc, f1, accu, accu_bl, precision, recall=  0.6287087596711828 0.5124947246751776 0.5314591700133868 0.588449519128564 0.2793895669234604 0.6978260869565217
[[4899 5327]
 [ 973 2247]]


In [91]:
### WEATHER + NLP
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train5 = x_train5.filter(regex='nlp|era_')
x_test5 = x_test5.filter(regex='nlp|era_')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 weather + nlp random split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9111032323030295
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.534974225447115 0.7
auc, f1, accu, accu_bl, precision, recall=  0.6414696644650768 0.534974225447115 0.5635133125092965 0.5963315710896473 0.2845920560684741 0.6593167701863354
[[5454 4772]
 [1097 2123]]


In [92]:
### STATS + NLP
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train5 = x_train5.filter(regex='stat|nlp')
x_test5 = x_test5.filter(regex='stat|nlp')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 stat + nlp random split'] = utils.get_scores_clf(y_test5, y_pred_prob)


data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6754578785338157
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.38673124527018665 0.7
auc, f1, accu, accu_bl, precision, recall=  0.611692625544678 0.38673124527018665 0.38673211363974414 0.5317047156620622 0.251756169412337 0.8099378881987578
[[2592 7634]
 [ 612 2608]]


In [93]:
### STATS + WEATHER
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

x_train5 = x_train5.filter(regex='stat|era_')
x_test5 = x_test5.filter(regex='stat|era_')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 stat + weather random split'] = utils.get_scores_clf(y_test5, y_pred_prob)


data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8971086285869415
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.5146023777716942 0.7
auc, f1, accu, accu_bl, precision, recall=  0.6341901747220883 0.5146023777716942 0.5358470920719917 0.5859087115658175 0.27818598275371614 0.6819875776397516
[[5009 5217]
 [1024 2196]]


In [94]:
x_train5, x_test5, y_train5, y_test5 = train_test_split(x, y, test_size=0.3, random_state=42) # random split first

print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 all vars random split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.9382932996512136
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 10}
maximum f1 score, thres 0.6923274473171055 0.7
auc, f1, accu, accu_bl, precision, recall=  0.8259883314119532 0.6923274473171055 0.7293618920124945 0.7530276314302964 0.41741077939616417 0.7984472049689441
[[7236 2990]
 [ 649 2571]]


### n=5, chrono split

In [164]:
n_pred = 5

xy_df = attach_target(x_full, y_master, 'flood', n_pred)

results={}

x = xy_df.drop(xy_df.filter(regex='target|Unnamed').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns

split_year = 2010
years = x['year']

train_mask = years <= split_year
test_mask = years > split_year

x_train5, x_test5 = x[train_mask], x[test_mask]
y_train5, y_test5 = y[train_mask], y[test_mask]


In [165]:
### STAT ONLY 
x_train5_stat = x_train5.filter(regex='stat|year')
x_test5_stat = x_test5.filter(regex='stat|year')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5_stat, y_train5, x_test5_stat)
results['n=5 stats only chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8150050567559528
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.3028737604537326 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5338108746452408 0.3028737604537326 0.42730923694779116 0.5006856095857388 0.4264399996735692 0.9971724787935909
[[   6 1423]
 [   3 1058]]


In [171]:
### NLP ONLY 
x_train5_nlp = x_train5.filter(regex='nlp')
x_test5_nlp = x_test5.filter(regex='nlp')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5_nlp, y_train5, x_test5_nlp)
results['n=5 nlp only chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)


data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.6552670827830225
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.29878907350042244 0.4
auc, f1, accu, accu_bl, precision, recall=  0.47055341455998645 0.29257448574854716 0.42610441767068274 0.5 0.42610441767068274 1.0
[[   0 1429]
 [   0 1061]]


In [172]:
### WEATHER ONLY
x_train5_era = x_train5.filter(regex='era_')
x_test5_era = x_test5.filter(regex='era_')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5_era, y_train5, x_test5_era)
results['n=5 era only chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7423806283681639
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.34012273181260844 0.7
auc, f1, accu, accu_bl, precision, recall=  0.5068095970831747 0.34012273181260844 0.42329317269076305 0.48638575251175825 0.41955798564384095 0.9132893496701225
[[  85 1344]
 [  92  969]]


In [174]:
### WEATHER + NLP
x_train5_wn = x_train5.filter(regex='nlp|era_')
x_test5_wn = x_test5.filter(regex='nlp|era_')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5_wn, y_train5, x_test5_wn)
results['n=5 weather + nlp chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)


data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7624065625407784
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.32759965682455966 0.7
auc, f1, accu, accu_bl, precision, recall=  0.47760704776314516 0.32759965682455966 0.41485943775100403 0.47855252283881283 0.41589133859837646 0.9095193213949104
[[  68 1361]
 [  96  965]]


In [173]:
### STATS + NLP
x_train5_sn = x_train5.filter(regex='stat|nlp')
x_test5_sn = x_test5.filter(regex='stat|nlp')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5_sn, y_train5, x_test5_sn)
results['n=5 stat + nlp chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)


data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.671346399870154
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.3005391113753974 0.7
auc, f1, accu, accu_bl, precision, recall=  0.4741780104988296 0.3005391113753974 0.42610441767068274 0.49963592449126715 0.42592643061511937 0.9971724787935909
[[   3 1426]
 [   3 1058]]


In [175]:
x_train5_se = x_train5.filter(regex='stat|era')
x_test5_se = x_test5.filter(regex='stat|era')
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5_se, y_train5, x_test5_se)
results['n=5 stat + era chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)


data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.7225396021774361
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.31509453884386557 0.7
auc, f1, accu, accu_bl, precision, recall=  0.511794859280199 0.31509453884386557 0.42248995983935744 0.4917538875943249 0.422110775249857 0.9604147031102733
[[  33 1396]
 [  42 1019]]


In [176]:
print("data imbalance train", y_train5.sum()/len(y_train5))
print("data imbalance test", y_test5.sum()/len(y_test5))
y_pred, y_pred_prob = utils.run_xgb(x_train5, y_train5, x_test5)
results['n=5 all vars chrono split'] = utils.get_scores_clf(y_test5, y_pred_prob)

data imbalance train target_flood_5    0.232601
dtype: float64
data imbalance test target_flood_5    0.426104
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8558100654524949
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 65}
maximum f1 score, thres 0.3063862978984995 0.7
auc, f1, accu, accu_bl, precision, recall=  0.4823100195294852 0.3063862978984995 0.42530120481927713 0.49735847389044363 0.42481646777450255 0.9849198868991518
[[  14 1415]
 [  16 1045]]


### misc

In [6]:
#construct xy_df for appropriate prediction year, depending on the n_pred target period
n_pred = 5

#Riley: attach NLP and ERA features here

# x_df = x_df.loc[x_df['year']>=1979] #crop to after 1979
xy_df = attach_target(x_df, y_master, 'flood', n_pred)
print('length of xy_df', len(xy_df))
print('imbalance', xy_df.filter(regex='target').sum()/len(xy_df))

xy_df.columns

length of xy_df 44820
imbalance target_flood_5    0.243351
dtype: float64


Index(['grid_id', 'year', 'stat_flood_amt', 'stat_storm_amt',
       'stat_earthquake_amt', 'stat_extreme temperature _amt',
       'stat_landslide_amt', 'stat_volcanic activity_amt', 'stat_drought_amt',
       'stat_mass movement (dry)_amt', 'stat_flood_ct', 'stat_storm_ct',
       'stat_earthquake_ct', 'stat_extreme temperature _ct',
       'stat_landslide_ct', 'stat_volcanic activity_ct', 'stat_drought_ct',
       'stat_mass movement (dry)_ct', 'stat_flood_bin', 'stat_storm_bin',
       'stat_earthquake_bin', 'stat_extreme temperature _bin',
       'stat_landslide_bin', 'stat_volcanic activity_bin', 'stat_drought_bin',
       'stat_mass movement (dry)_bin', 'stat_lat', 'stat_lon',
       'target_flood_5'],
      dtype='object')

In [8]:
# Random splitting
results={}
# Separate features (X) and targets (y)
x = xy_df.drop(xy_df.filter(regex='target').columns, axis=1)  # Drop target columns
x = x.select_dtypes(['number'])  # Keep only numerical columns
y = xy_df.filter(regex='target')  # Keep only target columns


#train_test_split randomly
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print("data imbalance train", y_train.sum()/len(y_train))
print("data imbalance test", y_test.sum()/len(y_test))

# Evaluate the model
y_pred, y_pred_prob = utils.run_xgb(x_train, y_train, x_test)
results['stats only random split'] = utils.get_scores_clf(y_test, y_pred_prob)


data imbalance train target_flood_5    0.245012
dtype: float64
data imbalance test target_flood_5    0.239476
dtype: float64
running xgb...
[10, 20, 65]
Train AUC:  0.8563974915055976
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'scale_pos_weight': 20}
maximum f1 score, thres 0.6193620013919782 0.7
auc, f1, accu, accu_bl, precision, recall=  0.8162112347894114 0.6193620013919782 0.6326788636025584 0.73467759079584 0.3781665059592997 0.9304347826086956
[[5511 4715]
 [ 224 2996]]


In [None]:
#Riley:
# - fix the linking problem of NLP features (pls use the old .pkl let's make sure we are using as raw as possible)
# - attach era features
# - compute results for random split, and non-random split: using the NEW attach_target function here
# - for each, record the data imbalance issues.


# - run results training using ALL data, and compute the locations with the highest risks of flooding with n_pred = 3, 5, that is 2021 and 2023 -> this is the closest to a "live" prediction we can do using our data.
# - let's think about how to visualize the live.
# - we should discuss the paper -> given current results, the paper is not very strong... option 1: we don't submit, keep working on it. option 2: we write something about the observations and try submit.
