In [None]:
# Load IPython's autoreload extension and set it to mode 2, which re-imports
# modules before each cell is executed so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Cross-validation results, one row per evaluated model run
results_path = 'data/13_estimation_results/kalimantan_multiple_models_cross_validation_results_sorted.csv'
results_df = pd.read_csv(results_path)

# Drop the linear-model rows (we already know that gwl is not linear)
is_linear_model = results_df['estimator_name'] == 'LinearRegression'
results_df = results_df[~is_linear_model]

# Number of result rows that remain after the filter
len(results_df)

In [None]:
# Render floats with five fixed decimals instead of scientific (e) notation
pd.options.display.float_format = lambda value: '%.5f' % value

# Descriptive statistics of the three evaluation metrics over all remaining rows
summary_stats = results_df.loc[:, ['r2_score', 'rmse', 'pearson_r']].describe()
summary_stats



In [None]:
# Average each evaluation metric over all rows that share an estimator
model_performance = (
    results_df
    .groupby('estimator_name')[['r2_score', 'rmse', 'pearson_r']]
    .agg('mean')
)
model_performance

In [None]:
# Average each evaluation metric per value of the `validation` column (the split type)
rows_by_validation = results_df.groupby('validation')
split_performance = rows_by_validation[['r2_score', 'rmse', 'pearson_r']].mean()
split_performance


In [None]:
# Mean evaluation metrics for every (PHU, validation type) pair, best R2 first.
phu_split_performance = (
    results_df
    .groupby(['phu_id', 'validation'])
    .agg({
        'pearson_r': 'mean',
        'r2_score': 'mean',
        'rmse': 'mean',
        'no_obs': 'mean',  # average (not total) of the no_obs column within each group
    })
    .reset_index()
    # Returns a new, sorted frame instead of sorting in place, so re-running
    # the cell never depends on previously mutated state.
    .sort_values(by='r2_score', ascending=False)
)
# Show four fixed decimals rather than exponential notation
pd.options.display.float_format = '{:.4f}'.format
phu_split_performance


In [None]:
# Median evaluation metrics per PHU, leaving out rows from the "Sequential" estimator.
# This rebinds `phu_split_performance`, replacing the per-(PHU, validation) table.
# NOTE(review): the reason for excluding "Sequential" is not stated here — confirm.
df_filter = ~results_df.estimator_name.isin(["Sequential"])
phu_split_performance = (
    results_df[df_filter]
    .groupby(['phu_id'])
    .agg({
        'pearson_r': 'median',
        'r2_score': 'median',
        'rmse': 'median',
        'no_obs': 'median',  # median (not total) of the no_obs column within each PHU
    })
    .reset_index()
    # Best median R2 first; returns a new frame rather than sorting in place
    .sort_values(by='r2_score', ascending=False)
)
# Show four fixed decimals rather than exponential notation
pd.options.display.float_format = '{:.4f}'.format

# The full ranking is the cell output (the earlier bare `.head()` call was never
# displayed because it was not the last expression, so it has been dropped).
phu_split_performance

In [None]:
# PHU ids of the first five rows of the ranking table (it is sorted by r2_score, descending)
best_phus = phu_split_performance["phu_id"].iloc[:5].values
best_phus

## Train the model

In [None]:
from gee_scripts.parameters import explain_vars
from gee_scripts.models import get_random_forest
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_squared_error
from gee_scripts.plots import plot_observed_vs_predicted

import seaborn as sns


In [None]:
# USER PARAMETERS

# Region where to work
region = "kalimantan"

# Station ids to exclude from the training data.
# These stations were selected based on the results of the previous analysis.
bad_stations = [
    'batok1',
    'batok2',
    'brg11',
    'brg13',
    'brg16',
    'BRG_620309_01',
    'BRG_620309_02',
    'BRG_630805_01',
    'BRG_630708_01',
]


In [None]:
# Load the cleaned training table; the "date" column is parsed as datetime
training_csv = "data/9_clean_training_data/all_training_data_with_extra_and_locations_and_precipSum.csv"
df = pd.read_csv(training_csv, parse_dates=["date"])
# Sanity check that the input file still has the expected number of rows
assert len(df) == 32783, "The length of the dataframe is not correct"

# Keep rows from the best-ranked PHUs that do not belong to an excluded station
from_bad_station = df["id"].isin(bad_stations)
in_best_phu = df["phu_id"].isin(best_phus)
data = df.loc[~from_bad_station & in_best_phu].copy()
print(len(data))

In [None]:
# Number of distinct station ids left in the filtered training data
data["id"].nunique()

In [None]:
# NOTE(review): unexplained scratch arithmetic (evaluates to 3680); the meaning of the
# factors is not stated in the surrounding cells — document it or remove this cell.
23*4*4*10

In [None]:
# Temporal hold-out: rows observed in `test_months` are held out for testing,
# rows from every other calendar month are used for training.
test_months = [1,2,3]
train_months = sorted(set(range(1, 13)) - set(test_months))

observation_month = data["date"].dt.month
train_data = data.loc[observation_month.isin(train_months)]
# NOTE(review): the gwl_cm > -150 cut-off is applied to the test rows only, not to the
# training rows — confirm this asymmetry is intended.
test_data = data.loc[observation_month.isin(test_months) & (data["gwl_cm"] > -150)]

X_train, y_train = train_data[explain_vars], train_data["gwl_cm"]
X_test, y_test = test_data[explain_vars], test_data["gwl_cm"]

print("lenght of train and test", len(X_train), len(X_test))

####################### TRAIN

# Fit the model returned by get_random_forest() and predict the held-out months
regr = get_random_forest()
regr.fit(X_train, y_train)
y_pred_test = regr.predict(X_test)

# Agreement between observed and predicted gwl_cm on the test rows
r2_score_val = r2_score(y_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r, p = pearsonr(y_test, y_pred_test)

# Print all the metrics, then the month split, one item per line
print(
    f"r2_score: {r2_score_val}",
    f"rmse: {rmse}",
    f"pearson r: {r}",
    f"p-value: {p}",
    f"Train months: {train_months}",
    f"Test months: {test_months}",
    sep="\n",
)
plot_observed_vs_predicted(y_test, y_pred_test, color = 1)

In [None]:
# Spatial hold-out: train on some PHUs and evaluate on a different PHU.
# Rows come from the full `df`, not from the `best_phus`-filtered `data`.
#
# Alternatives tried earlier and no longer active: a random 80/20 split of the `data`
# rows with gwl_cm > -150, and testing on the single station "BRG_630801_01".
# An earlier note listed [357, 297, 350, 351, 352] as the best Kalimantan PHUs.
# NOTE(review): PHU 379 is used for training but is not in that list — confirm.

train_phus = [350, 351, 379]
test_phus = [357]

# Rows eligible for either split: gwl_cm strictly between -150 and 5, and not
# recorded at one of the excluded stations.
usable_rows = (
    (df.gwl_cm > -150)
    & (df.gwl_cm < 5)
    & (~df.id.isin(bad_stations))
)

train_data = df[usable_rows & df.phu_id.isin(train_phus)]
test_data = df[usable_rows & df.phu_id.isin(test_phus)]

X_train, X_test = train_data[explain_vars], test_data[explain_vars]
y_train, y_test = train_data["gwl_cm"], test_data["gwl_cm"]

print("lenght of train and test", len(X_train), len(X_test))

####################### TRAIN

# Fit the model returned by get_random_forest() and predict the held-out PHU
regr = get_random_forest()

regr.fit(X_train, y_train)
y_pred_test = regr.predict(X_test)

# Agreement between observed and predicted gwl_cm on the test rows
r, p = pearsonr(y_test, y_pred_test)
r2_score_val = r2_score(y_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# print all the metrics
print(f"r2_score: {r2_score_val}")
print(f"rmse: {rmse}")
print(f"pearson r: {r}")
print(f"p-value: {p}")


plot_observed_vs_predicted(y_test, y_pred_test, color=0)

In [None]:
from gee_scripts.models import bootstrap

In [None]:
# Run the project's bootstrap helper on the rows of `data` with gwl_cm above -150.
# NOTE(review): the positional arguments 25 and 0.8 are presumably the number of
# iterations and the train fraction — confirm against gee_scripts.models.bootstrap.
bootstrap_input = data.loc[data["gwl_cm"] > -150]
bootrap_result = bootstrap(bootstrap_input, "gwl_cm", 25, 0.8, explain_vars)
bootrap_result