In [None]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell executes; edits to local code (e.g. gee_scripts)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from gee_scripts.parameters import explain_vars
from gee_scripts.models import get_random_forest
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_squared_error
from gee_scripts.plots import plot_observed_vs_predicted


# 2. Train the model

In [None]:
# USER PARAMETERS

# Region where the analysis is carried out
region = "kalimantan"

# Station ids to exclude from the data.
# These stations were selected based on the results of the previous analysis.
bad_stations = [
    "batok1",
    "batok2",
    "brg11",
    "brg13",
    "brg16",
    "BRG_620309_01",
    "BRG_620309_02",
    "BRG_630805_01",
    "BRG_630708_01",
]

# PHU ids used to subset the training table
best_phus = [350, 351, 357, 379]

In [None]:
# Read the cleaned training table, parsing the "date" column as datetimes.
training_csv = "data/9_clean_training_data/all_training_data_with_extra_and_locations_and_precipSum.csv"
df = pd.read_csv(training_csv, parse_dates=["date"])

# Guard against silently working with a different version of the file.
assert len(df) == 32783, "The length of the dataframe is not correct"

# Keep rows from the selected PHUs whose station is not in the exclusion list.
station_is_ok = ~df.id.isin(bad_stations)
phu_is_selected = df.phu_id.isin(best_phus)
data = df[station_is_ok & phu_is_selected].copy()

print(len(data))

In [None]:
# Load the BOSF table and keep only the records whose source is "bosf_NASA".
bosf_data = pd.read_csv("data/7_training_data/bosf/explanatory_with_response_var_and_source_extra_sum_prec_bosf.csv")
bosf_data = bosf_data[bosf_data.source == "bosf_NASA"]

# Summary statistics of the response before the time filter. Printed explicitly:
# a bare expression in the middle of a cell is evaluated but never displayed.
print(bosf_data["gwl_cm"].describe())

# Keep only rows with time_difference below 1.
# NOTE(review): the unit/meaning of `time_difference` is not visible here — confirm.
bosf_data = bosf_data[bosf_data.time_difference < 1]

# This subset is what the first model is trained on.
train_data = bosf_data
# `test_data` stays bound to the same frame, as it was before this cell was
# cleaned up, so any cell reading it right after this one sees the same object.
test_data = bosf_data

# Distribution of the response variable in the training subset.
ax = train_data.gwl_cm.hist()
ax.set(
    title="Distribution of gwl_cm (BOSF NASA training subset)",
    xlabel="gwl_cm",
    ylabel="count",
);

### 2.3. Model testing

In [None]:
# Evaluation subset: rows of `df` from PHU 379 with gwl_cm above -50, excluding
# the stations listed in `bad_stations`.
# (An upper bound on gwl_cm, `df.gwl_cm < 5`, is currently disabled.)
gwl_above_floor = df.gwl_cm > -50
in_test_phu = df.phu_id.isin([379])
station_is_ok = ~df.id.isin(bad_stations)

test_data = df[gwl_above_floor & in_test_phu & station_is_ok]

In [None]:
# Predictors (columns listed in `explain_vars`) and response ("gwl_cm")
# for the training and the evaluation sets.
X_train = train_data[explain_vars]
X_test = test_data[explain_vars]
y_train = train_data["gwl_cm"]
y_test = test_data["gwl_cm"]

print("lenght of train and test", len(X_train), len(X_test))

# ---- Fit the regressor on the training set and predict the evaluation set ----
regr = get_random_forest()
regr.fit(X_train, y_train)
y_pred_test = regr.predict(X_test)

# ---- Agreement between observed and predicted values ----
r, p = pearsonr(y_test, y_pred_test)
r2_score_val = r2_score(y_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Report every metric, one per line.
print(
    f"r2_score: {r2_score_val}",
    f"rmse: {rmse}",
    f"pearson r: {r}",
    f"p-value: {p}",
    sep="\n",
)

plot_observed_vs_predicted(y_test, y_pred_test, color=0)

In [None]:
# Second experiment: train on the selected PHUs (`best_phus`) and evaluate on
# the rows already held in `test_data`.

# Rows of `df` from the selected PHUs, without the excluded stations.
# No gwl_cm range filter is applied to the training rows.
candidate_rows = df[df.phu_id.isin(best_phus) & ~df.id.isin(bad_stations)]

# `test_data` is a slice of the same `df` and its PHU (379) is also one of the
# selected PHUs, so the selection above contains every evaluation row. Remove
# them from the training set; otherwise the model is scored on observations it
# was fitted on and the metrics are in-sample rather than a real test.
# NOTE(review): assumes `test_data` was built by filtering `df`, so both share
# the same index labels. Labels missing from the selection are ignored.
train_data = candidate_rows.drop(index=test_data.index, errors="ignore")
assert train_data.index.intersection(test_data.index).empty, "train and test sets overlap"

# Predictors (columns listed in `explain_vars`) and response ("gwl_cm").
X_train, X_test = train_data[explain_vars], test_data[explain_vars]
y_train, y_test = train_data["gwl_cm"], test_data["gwl_cm"]

print("lenght of train and test", len(X_train), len(X_test))

####################### TRAIN

regr = get_random_forest()

regr.fit(X_train, y_train)
y_pred_test = regr.predict(X_test)

# Agreement between observed and predicted values on the held-out rows.
r, p = pearsonr(y_test, y_pred_test)
r2_score_val = r2_score(y_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# print all the metrics
print(f"r2_score: {r2_score_val}")
print(f"rmse: {rmse}")
print(f"pearson r: {r}")
print(f"p-value: {p}")


plot_observed_vs_predicted(y_test, y_pred_test, color=0)