In [4]:
import pandas as pd
from pathlib import Path

In [62]:
# Explore data sources from .xslsx files, merge them and save them as .csv
# ach means Achmad
# wal means Waluyo

ach_path = Path("data/2_Achmad/achmed_raw.csv")
wal_path = Path("data/3_Waluyo/waluyo_raw.csv")

DATA_COLS = ["source", "id", "lon", "lat", "date", "gwl_cm"]

In [76]:
ach_df = pd.read_csv(ach_path)

In [77]:
wal_df.head()

Unnamed: 0,kode_perusahaan,kode_titik,date(dd/mm/yyyy),gwl(cm),provinsi,perusahaan,perizinan,unique_marking,coorvlookup_dummy,coor_y(dd),coor_x(dd)
0,121_APC,A21,14/11/2020,-10.0,ACEH,PT. Surya Panen Subur,PERKEBUNAN,121_APC_A21__44149,PT. Surya Panen Subur__A21,3.809687,96.451939
1,121_APC,A21,26/11/2020,-12.0,ACEH,PT. Surya Panen Subur,PERKEBUNAN,121_APC_A21__44161,PT. Surya Panen Subur__A21,3.809687,96.451939
2,121_APC,A21,02/12/2020,-30.0,ACEH,PT. Surya Panen Subur,PERKEBUNAN,121_APC_A21__44167,PT. Surya Panen Subur__A21,3.809687,96.451939
3,121_APC,A21,16/12/2020,-60.0,ACEH,PT. Surya Panen Subur,PERKEBUNAN,121_APC_A21__44181,PT. Surya Panen Subur__A21,3.809687,96.451939
4,121_APC,A21,12/07/2021,-25.0,ACEH,PT. Surya Panen Subur,PERKEBUNAN,121_APC_A21__44389,PT. Surya Panen Subur__A21,3.809687,96.451939


In [78]:
ach_cols_rename = {
    "gwl_rata" : "gwl_cm",
}

# rename columns
ach_df.rename(columns=ach_cols_rename, inplace=True)

# Convert date column to datetime
ach_df["date"] = pd.to_datetime(ach_df["date"])

# multiply gwl_cm by 100 to convert it to cm
ach_df["gwl_cm"] = ach_df["gwl_cm"] * 100

# set a new column for source
ach_df["source"] = "ach"

ach_df = ach_df[DATA_COLS]
ach_df.head()

Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,ach,BRG_140301_01,102.099167,1.519444,2018-10-15,-14.4
1,ach,BRG_140301_01,102.099167,1.519444,2018-10-16,-17.9
2,ach,BRG_140301_01,102.099167,1.519444,2018-10-17,-20.6
3,ach,BRG_140301_01,102.099167,1.519444,2018-10-18,-18.1
4,ach,BRG_140301_01,102.099167,1.519444,2018-10-19,-23.1


In [83]:
wal_cols_rename = {
    "date(dd/mm/yyyy)" : "date",
    "gwl(cm)" : "gwl_cm",
    "coor_y(dd)" : "lat",
    "coor_x(dd)" : "lon",
}

# Read Waluyo's data
wal_df = pd.read_csv(wal_path, sep=";")

# Rename columns
wal_df.rename(columns=wal_cols_rename, inplace=True)

# combine "kode_perusahaan"	and "kode_titik" to create a unique id
wal_df.loc[:, "id"] = wal_df["kode_perusahaan"] + "_" + wal_df["kode_titik"]

# Convert date column to datetime
wal_df.loc[:, "date"] = pd.to_datetime(wal_df["date"], dayfirst=True)

# set a new column for source
wal_df["source"] = "wal"

# Only select columns that are needed
wal_df = wal_df[DATA_COLS]
wal_df.head()

  wal_df.loc[:, "date"] = pd.to_datetime(wal_df["date"], dayfirst=True)


Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,wal,121_APC_A21,96.451939,3.809687,2020-11-14,-10.0
1,wal,121_APC_A21,96.451939,3.809687,2020-11-26,-12.0
2,wal,121_APC_A21,96.451939,3.809687,2020-12-02,-30.0
3,wal,121_APC_A21,96.451939,3.809687,2020-12-16,-60.0
4,wal,121_APC_A21,96.451939,3.809687,2021-07-12,-25.0


### Concatenate both dataframes

In [134]:
# Merge the two dataframes
df = pd.concat([ach_df, wal_df], ignore_index=True)
df

Unnamed: 0,source,id,lon,lat,date,gwl_cm
0,ach,BRG_140301_01,102.099167,1.519444,2018-10-15,-14.4
1,ach,BRG_140301_01,102.099167,1.519444,2018-10-16,-17.9
2,ach,BRG_140301_01,102.099167,1.519444,2018-10-17,-20.6
3,ach,BRG_140301_01,102.099167,1.519444,2018-10-18,-18.1
4,ach,BRG_140301_01,102.099167,1.519444,2018-10-19,-23.1
...,...,...,...,...,...,...
275811,wal,138_NBR_M13,135.275777,-3.407239,2023-05-26,-45.0
275812,wal,138_NBR_M13,135.275777,-3.407239,2023-06-13,-14.0
275813,wal,138_NBR_M13,135.275777,-3.407239,2023-06-26,-30.0
275814,wal,138_NBR_M13,135.275777,-3.407239,2023-07-15,-27.0


## Get unique points (and check they have the same coordinates)

In [None]:
import geopandas as gpd
from shapely.geometry import Point


In [86]:
# get unique ids
ids = df["id"].unique()
print("ids len", len(ids))

# Check that we have the same number of unique coordinates

# get unique coordinates
coords = df[["lon", "lat"]].drop_duplicates()
print("coords len", len(coords))

ids len 2411
coords len 2002


## Convert unique ids to shapefile 

Note that unique ids contains duplicated coordinates.

In [109]:
# Export unique point ID's as shapefile

df_only_locations = df.drop_duplicates(subset=["id"])

# Transfor date to string to avoid errors when exporting to shapefile
df_only_locations["date"] = df_only_locations["date"].astype(str)

df_only_locations = gpd.GeoDataFrame(df_only_locations, geometry=gpd.points_from_xy(df_only_locations["lon"], df_only_locations["lat"]))
df_only_locations.crs = "EPSG:4326"
df_only_locations.to_file("data/merged_df.shp")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_locations["date"] = df_only_locations["date"].astype(str)


In [115]:

### (test()) Create a random sample of 3 points to extract their SM from the images

# Create a random sample of 3 points to extract their SM from the images
test_sample = gpd.GeoDataFrame(df_only_locations.sample(3, random_state=42))

# Export the sample as shapefile
test_sample.to_file("data/0_shp/test_sample.shp")

In [158]:
from pathlib import PosixPath
import datetime

smm_df = pd.read_csv("data/6_extracted_sm_data/all_extracted_data.csv")

# convert date to datetime
smm_df["date"] = pd.to_datetime(smm_df["date"])
smm_df

Unnamed: 0,image,smm_value,coordinate,date,point_id
0,/home/sepal-user/soil_moisture/kalimantan_isla...,24.0,"(114.542846, -2.520543)",2023-04-16,199_GAL_G80
1,/home/sepal-user/soil_moisture/kalimantan_isla...,22.0,"(114.657132, -2.52298)",2023-04-16,199_GAL_T16
2,/home/sepal-user/soil_moisture/kalimantan_isla...,19.0,"(114.678142, -2.514387)",2023-04-16,199_GAL_U23
3,/home/sepal-user/soil_moisture/kalimantan_isla...,20.0,"(114.686607, -2.50288)",2023-04-16,199_GAL_U29
4,/home/sepal-user/soil_moisture/kalimantan_isla...,22.0,"(114.697252, -2.484188)",2023-04-16,199_GAL_U38
...,...,...,...,...,...
427752,/home/sepal-user/soil_moisture/papua_dan/close...,29.0,"(133.13999, -2.925532)",2020-12-06,271_RSP_J73
427753,/home/sepal-user/soil_moisture/papua_dan/close...,27.0,"(133.145243, -2.935706)",2020-12-06,271_RSP_K75
427754,/home/sepal-user/soil_moisture/papua_dan/close...,23.0,"(133.13741, -2.945648)",2020-12-06,271_RSP_L72
427755,/home/sepal-user/soil_moisture/papua_dan/close...,22.0,"(133.137354, -2.953089)",2020-12-06,271_RSP_M72


## Join smm_df with data df

In [159]:
# join the two dataframes based on their point_id and date
# If the point_id and date are the same, the SM value will be added to the df
# If the point_id and date are not the same, the SM value will be NaN

df_with_sm_data = df.merge(smm_df, how="left", left_on=["id", "date"], right_on=["point_id", "date"])
df_with_sm_data.head(5)

Unnamed: 0,source,id,lon,lat,date,gwl_cm,image,smm_value,coordinate,point_id
0,ach,BRG_140301_01,102.099167,1.519444,2018-10-15,-14.4,,,,
1,ach,BRG_140301_01,102.099167,1.519444,2018-10-16,-17.9,,,,
2,ach,BRG_140301_01,102.099167,1.519444,2018-10-17,-20.6,,,,
3,ach,BRG_140301_01,102.099167,1.519444,2018-10-18,-18.1,,,,
4,ach,BRG_140301_01,102.099167,1.519444,2018-10-19,-23.1,,,,


In [160]:
# get the columns that are needed

cols_to_export = [
    "source",
    "id",
    "lon",
    "lat",
    "date",
    "gwl_cm",
    "smm_value"
]

In [161]:
df_with_sm_data[cols_to_export].to_csv("data/field_data_with_sm.csv", index=False)