In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%pylab inline

In [None]:
import geopandas as gpd
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
from gee_scripts.randomforest import get_regressor

# 1. Read training data

In [None]:
# Load the PHU regions layer and reproject it to EPSG:4326 in a single expression
phu_regions = gpd.read_file(
    "data/0_shp/AOI__Province__865_PHUs__INDONESIA.gpkg"
).to_crs("EPSG:4326")

In [None]:
# Load the cleaned training observations
observations = pd.read_csv("data/9_clean_training_data/clean_training_data.csv")

# Build one point geometry per row from the lon/lat columns (EPSG:4326)
station_points = gpd.points_from_xy(observations.lon, observations.lat)
observations_gdf = gpd.GeoDataFrame(observations, geometry=station_points, crs="EPSG:4326")

# Left spatial join: every observation is kept and receives the attributes of
# the PHU region it lies within (if any)
df = gpd.sjoin(observations_gdf, phu_regions, how="left", predicate="within")

In [None]:
# Count how many observation rows belong to each PHU id
phu_cases = (
    df.groupby("phu_id")
    .size()
    .rename("observations")
    .reset_index()
)

In [None]:
# Boxplot of the response variable (gwl_cm) per island on a compact figure.

# Apply the style and the figure size in ONE call: sns.set() resets the whole
# theme to its defaults, so calling it after sns.set_style("whitegrid") would
# silently discard the whitegrid style.
sns.set(style="whitegrid", rc={'figure.figsize': (8, 5)})
sns.boxplot(x="island", y="gwl_cm", data=df, width=0.5)

# Rotate the tick labels and label the axes after the variables actually plotted
plt.xticks(rotation=90)
plt.xlabel("Island")
plt.ylabel("Groundwater Level (cm)")
plt.title("Groundwater Level Distribution by Island")
plt.show()

In [None]:
# Boxplot of the response variable (gwl_cm) per data source on a compact figure.

# Apply the style and the figure size in ONE call: sns.set() resets the whole
# theme to its defaults, so calling it after sns.set_style("whitegrid") would
# silently discard the whitegrid style.
sns.set(style="whitegrid", rc={'figure.figsize': (8, 5)})
sns.boxplot(x="source", y="gwl_cm", data=df, width=0.5)

# Rotate the tick labels and label the axes after the variables actually plotted
plt.xticks(rotation=90)
plt.xlabel("source")
plt.ylabel("Groundwater Level (cm)")
plt.title("Groundwater Level Distribution by source")
plt.show()

In [None]:
# Boxplot of the response variable (gwl_cm) per province on a compact figure.

# Apply the style and the figure size in ONE call: sns.set() resets the whole
# theme to its defaults, so calling it after sns.set_style("whitegrid") would
# silently discard the whitegrid style.
sns.set(style="whitegrid", rc={'figure.figsize': (8, 5)})
sns.boxplot(x="province", y="gwl_cm", data=df, width=0.5)

# Rotate the tick labels and label the axes after the variables actually plotted
plt.xticks(rotation=90)
plt.xlabel("Province")
plt.ylabel("Groundwater Level (cm)")
plt.title("Groundwater Level Distribution by Province")
# Render explicitly, like the sibling boxplot cells, so the cell output is the
# figure rather than the repr of the title object
plt.show()

## Create a violin plot showing the number of dates for each point

In [None]:
# Count the non-null dates recorded under each PHU name
df_grouped = (
    df.groupby("phu_name")["date"]
    .count()
    .reset_index()
    .rename(columns={"phu_name": "name", "date": "date_count"})
)
# Display the counts, largest first (the sorted copy is shown, not stored)
df_grouped.sort_values(by="date_count", ascending=False)

In [None]:
# Violin plot of the per-group date counts, drawn on an explicit 8x6 figure
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))

# Distribution of the `date_count` column computed in df_grouped
sns.violinplot(x=df_grouped["date_count"], ax=ax)

# Title and x-axis label
ax.set_title("Frequency dates per point")
ax.set_xlabel("Number of dates per plot")

plt.show()

In [None]:
# Drop all stations (grouped by `id`) with fewer than `min_obs` observations.
# Only the filtered frame is produced here: a per-station count table computed
# mid-cell would be neither stored nor displayed, so none is built.
min_obs = 9
df = df.groupby('id').filter(lambda group: len(group) >= min_obs)

In [None]:
from gee_scripts.parameters import explain_vars, response_var

# Report the modelled response variable and how many explanatory variables are configured
n_explanatory = len(explain_vars)
print("dependent var", response_var)
print("explanatory lenght", n_explanatory)

# 2. Define a model

## All but one test over stations

In [None]:
from gee_scripts.randomforest import run_randomforest
from gee_scripts.randomforest import get_heatmap

In [None]:
# Name of the response column used by the modelling cells
variable = 'gwl_cm'

# Station ids from the high-correlation CSV; its single column is renamed to "id"
high_corr_ids = pd.read_csv("data/high_corr_0.2_temporal_variables_station_ids.csv")
high_corr_ids = high_corr_ids.set_axis(["id"], axis=1)

# Training subset: rows on Kalimantan whose station id is in the high-correlation list
in_kalimantan = df.island == "Kalimantan"
in_high_corr = df.id.isin(high_corr_ids.id.unique())
training_df = df[in_kalimantan & in_high_corr]

# Alternative kept for reference: train on manually selected PHUs instead, i.e.
#   training_df = df[df.phu_id.isin([136, 137, 138, 143])]

# Run the project's random-forest evaluation in "allbutone" mode.
# NOTE(review): presumably returns one row of statistics per station — confirm in gee_scripts.randomforest
stats_df = run_randomforest(training_df, type_="allbutone")

In [None]:
# Visualise the "r_local" column of stats_df with the project's heatmap helper.
# NOTE(review): the exact layout is defined in gee_scripts.randomforest.get_heatmap — confirm there.
get_heatmap(stats_df, "r_local")

In [None]:
# Visualise the "rmse_local" column of stats_df with the project's heatmap helper.
# NOTE(review): the exact layout is defined in gee_scripts.randomforest.get_heatmap — confirm there.
get_heatmap(stats_df, "rmse_local")

In [None]:
# Scratch cell for trying other training subsets: edit the masks below and re-run.
# As written it selects Kalimantan rows whose station id is in high_corr_ids.
kalimantan_rows = df.island == "Kalimantan"
high_corr_rows = df.id.isin(high_corr_ids.id.unique())

training_df = df[kalimantan_rows & high_corr_rows]
stats_df = run_randomforest(training_df, type_="allbutone")

In [None]:
# Number of non-null dates per station id, largest first
(
    training_df.groupby('id')
    .agg(date=('date', 'count'))
    .sort_values(by='date', ascending=False)
    .reset_index()
)

## Select best stations

In [None]:
# Index labels of the stats rows whose r_local exceeds the threshold, best first.
# NOTE(review): assumes stats_df is indexed by station id — confirm in run_randomforest.
min_r_local = 0.5
best_stations = (
    stats_df[stats_df.r_local > min_r_local]
    .sort_values(by="r_local", ascending=False)
    .index
)
# Only the last expression of a cell is displayed, so show just the count
len(best_stations)

## Model with best stations over all stations

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr
import numpy as np
from gee_scripts.parameters import explain_vars, temporal_expl

In [None]:
# Split the training data by station: the "best" stations are used to fit the
# model, every remaining station is held out and scored individually.
gdf_high = training_df[training_df.id.isin(best_stations)].copy()
gdf_low = training_df[~training_df.id.isin(best_stations)].copy()

variable = 'gwl_cm'

# Create the regressor and fit it on the best stations only
regr = get_regressor()
regr.fit(gdf_high[explain_vars], gdf_high[variable])

# Layout of one statistics row: Pearson r, RMSE, then the correlation of the
# response with each temporal explanatory variable (in temporal_expl order).
stats_columns = ["r_local", "rmse_local"] + list(temporal_expl)

row = {}
for station in gdf_low.id.unique():
    # Apply the fitted model to this held-out station only
    gdf_test = gdf_low[gdf_low.id == station]
    y_true = gdf_test[variable]
    y_pred_test = regr.predict(gdf_test[explain_vars])

    # Pearson's r between observed and predicted values (p-value unused)
    r, p = pearsonr(y_true.values, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_true.values, y_pred_test))

    # Correlation of the response with each temporal explanatory variable
    expl_corrs = [y_true.corr(gdf_test[expl]) for expl in temporal_expl]

    row[station] = [r, rmse, *expl_corrs]

# Name the columns explicitly: without `columns=` the frame gets integer column
# labels (0, 1, 2, ...) and the later lookups of "r_local" / "rmse_local" fail.
stats_df = pd.DataFrame.from_dict(row, orient='index', columns=stats_columns)

In [None]:
# Visualise the "r_local" column of the current stats_df with the project's heatmap helper.
# NOTE(review): requires stats_df to expose a column named "r_local" — confirm it does at this point.
get_heatmap(stats_df, "r_local")

In [None]:
# Visualise the "rmse_local" column of the current stats_df with the project's heatmap helper.
# NOTE(review): requires stats_df to expose a column named "rmse_local" — confirm it does at this point.
get_heatmap(stats_df, "rmse_local")


In [None]:
# Export the training rows of the best stations (gdf_high) to a GeoPackage,
# then display how many rows that frame holds.
gdf_high.to_file("data/0_shp/kalimantan_best_stations.gpkg", driver="GPKG")
len(gdf_high)

In [None]:
# Index labels of the stats rows whose rmse_local is below the limit
rmse_limit = 10
below_limit = stats_df["rmse_local"] < rmse_limit
best_worse_stations = stats_df.loc[below_limit].index
best_worse_stations

## Final model bootstrapping

In [None]:
# Stations used by the repeated train/test resampling: currently only the best
# stations (appending best_worse_stations is left disabled in the trailing comment).
bootstrap_stations = list(best_stations) #+ list(best_worse_stations)

In [None]:
# Repeated random hold-out over stations: each round fits the regressor on a
# random `train_size` share of the selected stations and scores it on the rest.
# Stations are drawn WITHOUT replacement, so a station is never in both sets.
train_size = 0.8
n_rounds = 100
seed = 42  # fixed seed so the station splits are identical on every run

size = int(train_size * len(bootstrap_stations))
print(size)

# With fewer than two stations the training set would be empty; fail early
# with a clear message instead of an obscure error inside fit().
if size < 1:
    raise ValueError("Need at least two stations to build a train/test split")

# NOTE(review): this only seeds the station split; any randomness inside the
# regressor itself is governed by get_regressor — confirm its random_state.
rng = np.random.default_rng(seed)

r_list, r2_list, rmse_list = [], [], []
samples_train, samples_test = [], []

for round_idx in range(n_rounds):
    # Stations used for fitting in this round
    train_list = rng.choice(bootstrap_stations, size=size, replace=False)

    gdf_train = training_df[training_df.id.isin(train_list)].copy()

    # Test rows: selected stations that were not drawn for training
    gdf_test = training_df[
        (training_df.id.isin(bootstrap_stations)) & (~training_df.id.isin(train_list))
    ].copy()

    X_train, X_test = gdf_train[explain_vars], gdf_test[explain_vars]
    y_train, y_test = gdf_train[variable], gdf_test[variable]

    # Fresh regressor per round so no state leaks between rounds
    regr = get_regressor()
    regr.fit(X_train, y_train)
    y_pred_test = regr.predict(X_test)

    # Record the sample sizes and the scores of this round
    samples_train.append(len(gdf_train))
    samples_test.append(len(gdf_test))
    r, p = pearsonr(y_test, y_pred_test)
    r_list.append(r)
    r2_list.append(r2_score(y_test, y_pred_test))
    rmse_list.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))

In [None]:
# Summarise every per-round metric with the same four statistics.
# All rows are computed uniformly, so the sample-count rows also get a median
# instead of a silently missing value.
round_metrics = {
    "r": r_list,
    "r2": r2_list,
    "rmse": rmse_list,
    "samples_train": samples_train,
    "samples_test": samples_test,
}
# NumPy reducers are used so NaN values propagate instead of being skipped
summary_funcs = {"mean": np.mean, "min": np.min, "max": np.max, "median": np.median}

# One row per metric, one column per statistic; last expression so the table is displayed
pd.DataFrame(
    {
        stat_name: [stat_func(np.array(values)) for values in round_metrics.values()]
        for stat_name, stat_func in summary_funcs.items()
    },
    index=list(round_metrics),
)


In [None]:
# Persist the fitted regressor with joblib.
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code just like pickle; only load model files from trusted sources.
import joblib
model_name = "All_but_one_PHU_Kalimantan_high_corr_0_2_temporal_variables"
# Create the output folder if it is missing so the dump does not fail on a fresh checkout
Path("data/10_models").mkdir(parents=True, exist_ok=True)
joblib.dump(regr, f"data/10_models/{model_name}.joblib")
# NOTE(review): no figure is drawn in this cell, so with the inline backend this
# presumably writes an empty image — confirm which plot was meant to be saved.
plt.savefig(f"{model_name}.png")

In [None]:
# Load the layer of unique station points
unique_points = gpd.read_file("data/0_shp/unique_stations_no_repeated.shp")

# Keep only the points whose id is among the best stations and export them as a GeoPackage
is_best_station = unique_points.id.isin(best_stations)
unique_points[is_best_station].to_file(
    "data/0_shp/best_kalimantan_corr02_rmse10.gpkg", driver='GPKG'
)

# # merge the stats_df with the unique_points
# unique_points = unique_points.merge(stats_df, left_on='id', right_index=True)

# # save the unique_points as a geopackage file
# unique_points.to_file("data/0_shp/kalimantan_r_local.gpkg", driver='GPKG')

In [None]:
# Show the ten rows of stats_df with the highest r_local
stats_df.sort_values(by="r_local", ascending=False).head(10)