In [6]:
import pandas as pd
from pathlib import Path

In [7]:
# Explore data sources from .xslsx files, merge them and save them as .csv
# ach means Achmad
# wal means Waluyo

ach_path = Path("data/2_Achmad/achmed_raw.csv")
wal_path = Path("data/3_Waluyo/waluyo_raw.csv")
old_brg_path = Path("data/4_brg_old/brg_old.csv")

DATA_COLS = ["source", "id", "lon", "lat", "date", "gwl_cm"]

In [8]:
ach_df = pd.read_csv(ach_path)

### Achmed data

In [9]:
ach_cols_rename = {
    "gwl_rata" : "gwl_cm",
}

# rename columns
ach_df.rename(columns=ach_cols_rename, inplace=True)

# Convert date column to datetime
ach_df["date"] = pd.to_datetime(ach_df["date"])

# multiply gwl_cm by 100 to convert it to cm
ach_df["gwl_cm"] = ach_df["gwl_cm"] * 100

# set a new column for source
ach_df["source"] = "ach"

ach_df = ach_df[DATA_COLS]
ach_df.head()

Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,ach,BRG_140301_01,102.099167,1.519444,2018-10-15,-14.4
1,ach,BRG_140301_01,102.099167,1.519444,2018-10-16,-17.9
2,ach,BRG_140301_01,102.099167,1.519444,2018-10-17,-20.6
3,ach,BRG_140301_01,102.099167,1.519444,2018-10-18,-18.1
4,ach,BRG_140301_01,102.099167,1.519444,2018-10-19,-23.1


In [10]:
ach_df.id.unique()

array(['BRG_140301_01', 'BRG_140301_02', 'BRG_140302_01', 'BRG_140302_02',
       'BRG_140802_02', 'brg8', 'BRG_140802_01', 'BRG_140802_03',
       'BRG_140806_01', 'BRG_150710_02', 'BRG_150706_01', 'BRG_150709_01',
       'BRG_150709_02', 'BRG_150710_01', 'BRG_150710_03', 'BRG_621107_04',
       'BRG_621107_05', 'BRG_621107_06', 'BRG_621107_07', 'BRG_621107_08',
       'BRG_627104_06', 'BRG_621103_05', 'BRG_621105_02', 'BRG_621105_03',
       'BRG_621107_02', 'BRG_621107_03', 'BRG_621101_02'], dtype=object)

### Waluyo data

In [11]:
wal_cols_rename = {
    "date(dd/mm/yyyy)" : "date",
    "gwl(cm)" : "gwl_cm",
    "coor_y(dd)" : "lat",
    "coor_x(dd)" : "lon",
}

# Read Waluyo's data
wal_df = pd.read_csv(wal_path, sep=";")

# Rename columns
wal_df.rename(columns=wal_cols_rename, inplace=True)

# combine "kode_perusahaan"	and "kode_titik" to create a unique id
wal_df.loc[:, "id"] = wal_df["kode_perusahaan"] + "_" + wal_df["kode_titik"]

# Convert date column to datetime
wal_df.loc[:, "date"] = pd.to_datetime(wal_df["date"], dayfirst=True)

# set a new column for source
wal_df["source"] = "wal"

# Only select columns that are needed
wal_df = wal_df[DATA_COLS]
wal_df.head()

  wal_df.loc[:, "date"] = pd.to_datetime(wal_df["date"], dayfirst=True)


Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,wal,121_APC_A21,96.451939,3.809687,2020-11-14,-10.0
1,wal,121_APC_A21,96.451939,3.809687,2020-11-26,-12.0
2,wal,121_APC_A21,96.451939,3.809687,2020-12-02,-30.0
3,wal,121_APC_A21,96.451939,3.809687,2020-12-16,-60.0
4,wal,121_APC_A21,96.451939,3.809687,2021-07-12,-25.0


## Previous BRG data

In [12]:
old_brg_df = pd.read_csv(old_brg_path)
old_brg_df.loc[:, "source"] = "old_brg"
old_brg_df.loc[:, "date"] = pd.to_datetime(old_brg_df["date"], dayfirst=True)
old_brg_df = old_brg_df[DATA_COLS]
old_brg_df.head()

  old_brg_df.loc[:, "date"] = pd.to_datetime(old_brg_df["date"], dayfirst=True)


Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,old_brg,BRG_150503_01,103.928286,-1.545325,2019-07-06,-3.04
1,old_brg,BRG_150503_01,103.928286,-1.545325,2019-07-07,-3.04
2,old_brg,BRG_150503_01,103.928286,-1.545325,2019-07-08,-3.04
3,old_brg,BRG_150503_01,103.928286,-1.545325,2019-07-09,-3.04
4,old_brg,BRG_150503_01,103.928286,-1.545325,2019-07-10,-3.04


### Concatenate both dataframes

In [21]:
# Merge the two dataframes
df = pd.concat([ach_df, wal_df, old_brg_df], ignore_index=True)


# save the dataframe as csv
df.to_csv("data/field_data_all_with_old.csv", index=False)

In [22]:
# Remove duplicate dates for each id using the mean value
# Group by 'id' and 'date' and calculate the mean value for 'gwl_cm' while keeping other columns

print("Before removing duplicates", len(df))

agg_dict = {'source':'first','lon':'first','lat':'first','gwl_cm':'mean'}
df = df.groupby(['id','date']).agg(agg_dict).reset_index()

print("After removing duplicates", len(df))

Before removing duplicates 306132
After removing duplicates 298827


In [23]:
# Below I will remove the duplicated coordinates IDS and keep the first one
# get unique lon-lat pairs
unique = df[["id", "lon", "lat"]].drop_duplicates()

# Get duplicated lon-lat pairs
duplicated = unique[unique.duplicated(subset=["lon", "lat"], keep=False)]

duplicated = duplicated.drop_duplicates(subset=["lon", "lat"], keep="first")

# Get the duplicated ids
duplicated_ids = duplicated["id"].unique()

# # get dataframe without duplicated ids

df = df[~df["id"].isin(duplicated_ids)]

df.to_csv("data/field_data_unique_coords_2.csv", index=False)
df

Unnamed: 0,id,date,source,lon,lat,gwl_cm
0,02_AHL_SBG-B076,2020-11-05,wal,117.007750,3.937760,-37.000
1,02_AHL_SBG-B076,2020-11-17,wal,117.007750,3.937760,-39.000
2,02_AHL_SBG-B076,2020-12-05,wal,117.007750,3.937760,-39.000
3,02_AHL_SBG-B076,2020-12-16,wal,117.007750,3.937760,-35.000
4,02_AHL_SBG-B076,2021-01-02,wal,117.007750,3.937760,-34.000
...,...,...,...,...,...,...
298822,kecil1,2019-10-26,old_brg,113.805611,-2.856089,-3.021
298823,kecil1,2019-10-27,old_brg,113.805611,-2.856089,-3.023
298824,kecil1,2019-10-31,old_brg,113.805611,-2.856089,-3.023
298825,kecil1,2019-11-02,old_brg,113.805611,-2.856089,-3.023


In [24]:
# get unique coordinates for each station
stations = df[["id", "source", "lon", "lat"]].drop_duplicates()


# Convert to GeoDataFrame
from geopandas import GeoDataFrame
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(stations.lon, stations.lat)]
stations_gdf = GeoDataFrame(stations, geometry=geometry)
stations_gdf.crs = "EPSG:4326"
stations_gdf.to_file("data/0_shp/unique_stations_no_repeated.shp")

## Get unique points (and check they have the same coordinates)

In [25]:
# get unique ids
ids = df["id"].unique()
print("ids len", len(ids))

# Check that we have the same number of unique coordinates

# get unique coordinates
coords = df[["lon", "lat"]].drop_duplicates()
print("coords len", len(coords))

# Check the stations that have more than one coordinate

# get the number of coordinates per id
grouped = df.groupby(['lon', 'lat']).agg({'id': pd.Series.nunique}).reset_index()

# Filter groups with more than one unique 'id'
shared_coords = grouped[grouped['id'] > 1].reset_index()

# For each shared coordinate, list the unique station IDs
shared_ids = []
for _, row in shared_coords.iterrows():
    lon, lat = row['lon'], row['lat']
    stations_at_coord = df[(df['lon'] == lon) & (df['lat'] == lat)]['id'].unique()
    shared_ids.append(stations_at_coord.tolist())

# flatten the list
shared_ids = set([item for sublist in shared_ids for item in sublist])

# Print the results
# we'd say that "half" of the following have shared coordinates
len(pd.DataFrame(shared_ids).iloc[:,0].unique())

ids len 2073
coords len 2075


IndexError: single positional indexer is out-of-bounds

## Convert unique ids to shapefile 

Note that unique ids contains duplicated coordinates.

In [109]:
# Export unique point ID's as shapefile

df_only_locations = df.drop_duplicates(subset=["id"])

# Transfor date to string to avoid errors when exporting to shapefile
df_only_locations["date"] = df_only_locations["date"].astype(str)

df_only_locations = gpd.GeoDataFrame(df_only_locations, geometry=gpd.points_from_xy(df_only_locations["lon"], df_only_locations["lat"]))
df_only_locations.crs = "EPSG:4326"
df_only_locations.to_file("data/merged_df.shp")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_locations["date"] = df_only_locations["date"].astype(str)


In [115]:

### (test()) Create a random sample of 3 points to extract their SM from the images

# Create a random sample of 3 points to extract their SM from the images
test_sample = gpd.GeoDataFrame(df_only_locations.sample(3, random_state=42))

# Export the sample as shapefile
test_sample.to_file("data/0_shp/test_sample.shp")

In [158]:
from pathlib import PosixPath
import datetime

smm_df = pd.read_csv("data/6_extracted_sm_data/all_extracted_data.csv")

# convert date to datetime
smm_df["date"] = pd.to_datetime(smm_df["date"])
smm_df

Unnamed: 0,image,smm_value,coordinate,date,point_id
0,/home/sepal-user/soil_moisture/kalimantan_isla...,24.0,"(114.542846, -2.520543)",2023-04-16,199_GAL_G80
1,/home/sepal-user/soil_moisture/kalimantan_isla...,22.0,"(114.657132, -2.52298)",2023-04-16,199_GAL_T16
2,/home/sepal-user/soil_moisture/kalimantan_isla...,19.0,"(114.678142, -2.514387)",2023-04-16,199_GAL_U23
3,/home/sepal-user/soil_moisture/kalimantan_isla...,20.0,"(114.686607, -2.50288)",2023-04-16,199_GAL_U29
4,/home/sepal-user/soil_moisture/kalimantan_isla...,22.0,"(114.697252, -2.484188)",2023-04-16,199_GAL_U38
...,...,...,...,...,...
427752,/home/sepal-user/soil_moisture/papua_dan/close...,29.0,"(133.13999, -2.925532)",2020-12-06,271_RSP_J73
427753,/home/sepal-user/soil_moisture/papua_dan/close...,27.0,"(133.145243, -2.935706)",2020-12-06,271_RSP_K75
427754,/home/sepal-user/soil_moisture/papua_dan/close...,23.0,"(133.13741, -2.945648)",2020-12-06,271_RSP_L72
427755,/home/sepal-user/soil_moisture/papua_dan/close...,22.0,"(133.137354, -2.953089)",2020-12-06,271_RSP_M72


## Join smm_df with data df

In [159]:
# join the two dataframes based on their point_id and date
# If the point_id and date are the same, the SM value will be added to the df
# If the point_id and date are not the same, the SM value will be NaN

df_with_sm_data = df.merge(smm_df, how="left", left_on=["id", "date"], right_on=["point_id", "date"])
df_with_sm_data.head(5)

Unnamed: 0,source,id,lon,lat,date,gwl_cm,image,smm_value,coordinate,point_id
0,ach,BRG_140301_01,102.099167,1.519444,2018-10-15,-14.4,,,,
1,ach,BRG_140301_01,102.099167,1.519444,2018-10-16,-17.9,,,,
2,ach,BRG_140301_01,102.099167,1.519444,2018-10-17,-20.6,,,,
3,ach,BRG_140301_01,102.099167,1.519444,2018-10-18,-18.1,,,,
4,ach,BRG_140301_01,102.099167,1.519444,2018-10-19,-23.1,,,,


In [160]:
# get the columns that are needed

cols_to_export = [
    "source",
    "id",
    "lon",
    "lat",
    "date",
    "gwl_cm",
    "smm_value"
]

In [161]:
df_with_sm_data[cols_to_export].to_csv("data/field_data_with_sm.csv", index=False)

## Find stations with the same coordinate pair

In [None]:
# Find stations with the same coordinate pair

