In [None]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# NOTE(review): %pylab star-imports numpy/matplotlib names into the namespace and is
# discouraged by the IPython docs; confirm no cell relies on it before removing it.
%pylab inline

In [None]:
import geopandas as gpd
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
from gee_scripts.models import get_random_forest

# 1. Read training data

In [None]:
# read phu regions shapefile
#phu_regions = gpd.read_file("data/0_shp/AOI__Province__865_PHUs__INDONESIA.gpkg")
#phu_regions = phu_regions.to_crs("EPSG:4326")

In [None]:
# Load the cleaned training observations and preview the first rows.
df = pd.read_csv(
    "data/9_clean_training_data/clean_training_data.csv"
)
df.head(n=5)
# Disabled alternative: build point geometries from lon/lat and attach the PHU regions.
#df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="EPSG:4326")
#df = gpd.sjoin(df, phu_regions, how="left", predicate="within")

In [None]:
# Count how many rows (observations) each PHU contributes.
phu_cases = (
    df.groupby("phu_id")
    .size()
    .rename("observations")
    .reset_index()
)

In [None]:
# Boxplot of the response variable (gwl_cm) per island, on a small figure.

# Set style and figure size in a single call: sns.set(rc=...) on its own re-applies
# seaborn's default style and would silently undo a previous set_style("whitegrid").
sns.set(style="whitegrid", rc={'figure.figsize': (8, 5)})
sns.boxplot(x="island", y="gwl_cm", data=df, width=0.5)

# Rotate the category labels and name the axes after what is actually plotted.
plt.xticks(rotation=90)
plt.xlabel("Island")
plt.ylabel("Groundwater Level (cm)")
plt.title("Groundwater Level Distribution by Island")
plt.show()

In [None]:
# Boxplot of the response variable (gwl_cm) per data source, on a small figure.

# Set style and figure size in a single call: sns.set(rc=...) on its own re-applies
# seaborn's default style and would silently undo a previous set_style("whitegrid").
sns.set(style="whitegrid", rc={'figure.figsize': (8, 5)})
sns.boxplot(x="source", y="gwl_cm", data=df, width=0.5)

# Rotate the category labels and name the axes.
plt.xticks(rotation=90)
plt.xlabel("source")
plt.ylabel("Groundwater Level (cm)")
plt.title("Groundwater Level Distribution by source")
plt.show()

In [None]:
# Boxplot of the response variable (gwl_cm) per province, on a small figure.

# Set style and figure size in a single call: sns.set(rc=...) on its own re-applies
# seaborn's default style and would silently undo a previous set_style("whitegrid").
sns.set(style="whitegrid", rc={'figure.figsize': (8, 5)})
sns.boxplot(x="province", y="gwl_cm", data=df, width=0.5)

# Rotate the category labels and name the axes after what is actually plotted.
plt.xticks(rotation=90)
plt.xlabel("Province")
plt.ylabel("Groundwater Level (cm)")
plt.title("Groundwater Level Distribution by Province")
# Explicit show(), consistent with the other boxplot cells (avoids the text repr output).
plt.show()

## Create a violin plot showing the number of dates per point

In [None]:
# List the available columns of the training table.
df.columns

In [None]:
# Count the non-null dates recorded for every station id.
group_by = "id"
df_grouped = (
    df.groupby(group_by)
    .count()
    .reset_index()
    .loc[:, [group_by, "date"]]
    .rename(columns={group_by: "name", "date": "date_count"})
)
# Display the stations ordered from most to fewest dates.
df_grouped.sort_values("date_count", ascending=False)

In [None]:
# Violin plot of the number of dates available per station.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))

# Draw on the explicit axes rather than relying on pyplot's implicit current figure.
sns.violinplot(x=df_grouped["date_count"], ax=ax)

# Plain string literal: the original f-string had no placeholders.
ax.set_title("Frequency dates per point")
ax.set_xlabel("Number of dates per station")

plt.show()

In [None]:
# Drop all stations with fewer than `min_obs` observations.
# (The per-station count table that used to be computed here was neither stored nor
# displayed, so it has been removed.)
min_obs = 9
df = df.groupby('id').filter(lambda group: len(group) >= min_obs)

In [None]:
from gee_scripts.parameters import explain_vars, response_var
# Report the response variable and how many explanatory variables are configured.
n_explanatory = len(explain_vars)
print("dependent var", response_var)
print("explanatory lenght", n_explanatory)

# 2. Define a model

## All but one test over stations

In [None]:
from gee_scripts.randomforest import run_randomforest
from gee_scripts.randomforest import get_heatmap

In [None]:
# Preview the station ids selected by the correlation screening (0.2 file).
high_corr_ids = pd.read_csv(
    "data/high_corr_0.2_temporal_variables_station_ids.csv"
)
high_corr_ids.head(n=5)

In [None]:
variable = 'gwl_cm'

# Station ids from the correlation screening; the file's single column is renamed to "id".
high_corr_ids = pd.read_csv("data/high_corr_0.2_temporal_variables_station_ids.csv")
high_corr_ids.columns = ["id"]

# No island filter is applied: the selected stations come from the previous notebook.
selected_ids = high_corr_ids.id.unique()
training_df = df[df.id.isin(selected_ids)]

# Disabled alternative: manually selected PHUs for training.
# high_corr_phu_ids = [136, 137, 138, 143]
# training_df = df[df.phu_id.isin(high_corr_phu_ids)]

# Run the project's random-forest evaluation in "allbutone" mode over the selected stations.
stats_df = run_randomforest(training_df, type_="allbutone")

In [None]:
# Preview the first rows of the statistics returned by run_randomforest.
stats_df.head(n=5)

In [None]:
# Preview the first rows of the (filtered) training table.
df.head(n=5)

In [None]:
# Project helper: heatmap of the all-but-one statistics for the "r_local" column.
get_heatmap(stats_df, "r_local")

In [None]:
# Project helper: heatmap of the all-but-one statistics for the "rmse_local" column.
get_heatmap(stats_df, "rmse_local")

## Select best stations

In [None]:
# List the statistic columns available for station selection.
stats_df.columns

In [None]:
# Keep the stations whose local RMSE is below the threshold, ordered by descending local r.
# presumably stats_df is indexed by station id (later cells pass this index to `id.isin`) - verify.
rmse_threshold = 15
best_stations = (
    stats_df[stats_df.rmse_local < rmse_threshold]
    .sort_values(by="r_local", ascending=False)
    .index
)
# Number of stations retained.
len(best_stations)

## Model with best stations over all stations

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr
import numpy as np
from gee_scripts.parameters import explain_vars, temporal_expl

In [None]:
# Train on the "best" stations and evaluate the model on each remaining station.

# Best stations are used for training ...
gdf_high = training_df[training_df.id.isin(best_stations)].copy()
# ... and every other station is used for testing.
gdf_low = training_df[~training_df.id.isin(best_stations)].copy()

variable = 'gwl_cm'

# Create and train the regressor.
# NOTE(review): the original called get_regressor(), which is never imported or defined in
# this notebook (NameError). get_random_forest is the factory imported from
# gee_scripts.models - confirm it needs no arguments.
regr = get_random_forest()
regr.fit(gdf_high[explain_vars], gdf_high[variable])

row = {}
for station in gdf_low.id.unique():
    # Apply the model to this station only.
    gdf_test = gdf_low[gdf_low.id == station]
    y_true = gdf_test[variable].values
    y_pred_test = regr.predict(gdf_test[explain_vars])

    # Pearson's r and RMSE between observed and predicted values.
    r, _ = pearsonr(y_true, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred_test))

    # Correlation of the response with each temporal explanatory variable.
    expl_corrs = [gdf_test[variable].corr(gdf_test[expl]) for expl in temporal_expl]

    row[station] = [r, rmse] + expl_corrs

# Name the columns so that the later `r_local` / `rmse_local` lookups work; without names
# the frame only has integer column labels.
stats_columns = ["r_local", "rmse_local"] + list(temporal_expl)
stats_df = pd.DataFrame.from_dict(row, orient='index', columns=stats_columns)

In [None]:
# Project helper: heatmap of the held-out stations' statistics for the "r_local" column.
get_heatmap(stats_df, "r_local")

In [None]:
# Project helper: heatmap of the held-out stations' statistics for the "rmse_local" column.
get_heatmap(stats_df, "rmse_local")

In [None]:
# Held-out stations that still reach a local RMSE below 15 with the best-station model.
best_worse_stations = stats_df.loc[stats_df["rmse_local"] < 15].index
best_worse_stations

In [None]:
# Export the observations of the best stations as a GeoPackage.
# gdf_high derives from a CSV read with pandas, so it is a plain DataFrame without
# `to_file`; build point geometries before writing.
# assumes `lon`/`lat` columns in EPSG:4326, as in the disabled GeoDataFrame conversion of
# the data-loading cell - TODO confirm
best_stations_gdf = gpd.GeoDataFrame(
    gdf_high,
    geometry=gpd.points_from_xy(gdf_high.lon, gdf_high.lat),
    crs="EPSG:4326",
)
best_stations_gdf.to_file("data/0_shp/kalimantan_best_stations.gpkg", driver="GPKG")
len(gdf_high)

# 3. Final model bootstrapping (test different combinations)

The following cells test different combinations of stations, provinces or PHUs.
Each bootstrapping run returns a result containing the average, min, max and median of the different statistical parameters over all the iterations.

This result will help to select what is the best combination of stations to produce the final data.

In [None]:
from gee_scripts.randomforest import bootstrap

In [None]:
# Station ids to bootstrap with; the best_worse_stations could optionally be appended.
bootstrap_stations = [*best_stations]
len(bootstrap_stations)

### Bootstrap with only best stations

In [None]:
# Bootstrap using only the rows of the best stations.
selected_df = training_df.loc[training_df["id"].isin(best_stations)]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=5,
    train_size=0.8,
)
bootstrap_result

### Bootstrap with best + best worse

In [None]:
# Bootstrap using the best stations together with the acceptable held-out stations.
candidate_ids = [*best_stations, *best_worse_stations]
selected_df = training_df.loc[training_df["id"].isin(candidate_ids)]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=5,
    train_size=0.8,
)
bootstrap_result

### Bootstrap by PHU

In [None]:
# Bootstrap over a single PHU: 801 is the code for 'KHG Sungai Siak - Sungai Kampar'.
selected_df = df.loc[df["phu_id"].eq(801)]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=10,
    train_size=0.8,
)
bootstrap_result

### Bootstrap with BRG

In [None]:
# Bootstrap over BRG-sourced rows (current and old) located in Kalimantan.
selected_df = df.loc[
    df["source"].isin(["brg", "old_brg"]) & df["island"].eq("Kalimantan")
]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=10,
    train_size=0.8,
)
bootstrap_result

In [None]:
# Bootstrap over BRG-sourced rows (current and old) located in Sumatera.
selected_df = df.loc[
    df["source"].isin(["brg", "old_brg"]) & df["island"].eq("Sumatera")
]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=10,
    train_size=0.8,
)
bootstrap_result

### Bootstrap with PKEG

In [None]:
# Bootstrap over PKEG-sourced rows located in Sumatera.
selected_df = df.loc[
    df["source"].isin(["pkeg"]) & df["island"].eq("Sumatera")
]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=10,
    train_size=0.8,
)
bootstrap_result

In [None]:
# Bootstrap over PKEG-sourced rows located in Kalimantan.
selected_df = df.loc[
    df["source"].isin(["pkeg"]) & df["island"].eq("Kalimantan")
]
bootstrap_result = bootstrap(
    df=selected_df,
    variable="gwl_cm",
    iterations=10,
    train_size=0.8,
)
bootstrap_result

### Bootstrap by regions

In [None]:
# Distinct province names available for the region-based bootstraps.
df["province"].unique()

In [None]:
# Bootstrap over BRG-sourced rows (current and old) in the Central Kalimantan province.
# The old-BRG label is spelled "old_brg", as in the other BRG cells; the previous
# "brg_old" spelling matched none of the labels used elsewhere in this notebook.
# NOTE(review): confirm against df.source.unique().
selected_df = df[
    (df.source.isin(["brg", "old_brg"])) &
    (df.province == "Central Kalimantan")
]
bootstrap_result = bootstrap(df = selected_df, variable="gwl_cm", iterations=10, train_size=0.8)
bootstrap_result

# 4. Final model selection

After selecting the combination of stations that gives the best model statistics (r, rmse), the following cells can be used to train and store the final model. Replace "final_df" with the filters that worked well in the bootstrapping models.

In [None]:
# Reload the cleaned dataset and apply the filter chosen for the final model.
# NOTE(review): re-reading the CSV replaces `df`, so the minimum-observation filter
# applied earlier in the notebook no longer holds from here on - confirm this is intended.
df = pd.read_csv(
    "data/9_clean_training_data/clean_training_data.csv"
)
final_df = df.loc[df["phu_id"].eq(801)]
final_df

In [None]:
# Select only the stations listed in the high-correlation (0.2) file.
high_corr_ids = pd.read_csv("data/high_corr_0.2_temporal_variables_station_ids.csv")
# Name the single column "id", as the other cells that read these files do; without it
# the `.id` access below depends on whatever header the CSV happens to carry.
high_corr_ids.columns = ["id"]

# No island filter is applied: the selected stations come from the previous notebook.
training_df = df[df.id.isin(high_corr_ids.id.unique())]

In [None]:
from gee_scripts.directories import model_path
# Save model to file with not pickle (pickle is not safe) 
import joblib
from sklearn.model_selection import train_test_split
from gee_scripts.parameters import explain_vars, temporal_expl
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [None]:
# Choose the rows used to fit the final model: either the high-correlation stations
# (active) or every station of a selected PHU (disabled alternative below).
#final_df = df[(df.phu_id == 801)]

# Station ids from the 0.3 correlation file; its single column is renamed to 'id'.
high_corr_ids = pd.read_csv(
    "data/high_corr_0.3_temporal_variables_station_ids.csv"
)
high_corr_ids.columns = ['id']
final_df = df.loc[df["id"].isin(high_corr_ids["id"].unique())]
len(final_df)

In [None]:
# Separate the target column (gwl_cm) from the remaining columns.
y = final_df['gwl_cm']
X = final_df.drop('gwl_cm', axis=1)

# Hold out 30% of the rows for testing (70% for training); fixed seed for a repeatable split.
# NOTE(review): rows are split at random, so one station can appear in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Number of training rows.
len(X_train)

In [None]:
variable = 'gwl_cm'

# The model is fitted on X_train / y_train from the split cell. final_df is deliberately
# not re-assigned here: overriding it after the split has no effect on the model and
# leaves final_df inconsistent with the data that was actually used.

# NOTE(review): the original called get_regressor(), which is never imported or defined in
# this notebook (NameError). get_random_forest is the factory imported from
# gee_scripts.models - confirm it needs no arguments.
regr = get_random_forest()
regr.fit(X_train[explain_vars], y_train)

# Name of the stored model; an existing file with the same name is overwritten.
model_name = "model_sungai_siak_sungai_kampar_phu_0.3_corr_0.3.joblib"
joblib.dump(regr, model_path/model_name)

In [None]:
# Predict the held-out rows using only the explanatory columns.
X_test_features = X_test[explain_vars]
predictions = regr.predict(X_test_features)

In [None]:
# Compare the held-out observations with the model predictions.

# R-squared (coefficient of determination) of the predictions.
r2 = r2_score(y_test, predictions)

# Scatter of actual vs predicted values with a dashed 1:1 reference line.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions, color='blue', label='Predicted')
# Line style and colour are passed as keywords only: combining the 'k--' format string
# with color='red' defines the colour twice and makes matplotlib emit a warning.
value_range = [y_test.min(), y_test.max()]
plt.plot(value_range, value_range, linestyle='--', lw=2, color='red', label='Actual')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values (R-squared: {:.2f})'.format(r2))
plt.legend()
plt.grid(True)
plt.show()