In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', 1000)

In [3]:
# Defining directories
dir_path = Path.cwd().parent
prep_folder = dir_path.joinpath("data/prep")
s2_path = prep_folder.joinpath("sent2.parquet")

processed_folder = dir_path.joinpath("data/processed")
if not processed_folder.exists():
    processed_folder.mkdir(parents=True)

In [4]:
df = pd.read_parquet(s2_path)
df

Unnamed: 0,date,long,lat,species_names,B11,B02,B12,B08,B04,B03
0,2019-08-23,76.98762370000003,31.571770300000026,Pinus roxburghii,2234.0,376.0,1084.0,3340.0,468.0,739.0
1,2019-08-23,76.98770870000004,31.571733300000062,Pinus roxburghii,2234.0,240.0,1084.0,3374.0,248.0,496.0
2,2019-08-23,76.98786920000003,31.571645600000068,Pinus roxburghii,1826.0,269.0,823.0,3458.0,216.0,525.0
3,2019-08-23,76.98792120000007,31.572662200000025,Pinus roxburghii,2037.0,345.0,971.0,3754.0,359.0,699.0
4,2019-08-23,76.98797090000005,31.57163410000004,Pinus roxburghii,1826.0,247.0,823.0,3248.0,245.0,529.0
...,...,...,...,...,...,...,...,...,...,...
13193487,2021-11-25,78.12497830000007,31.25865170000003,Rhododendron arboreum,1781.0,1115.0,1436.0,1827.0,1196.0,1221.0
13193488,2021-11-25,78.12522000000007,31.258880000000033,Rhododendron arboreum,1657.0,1175.0,1431.0,1704.0,1216.0,1230.0
13193489,2021-11-25,78.12614830000007,31.258660000000077,Rhododendron arboreum,1678.0,1113.0,1389.0,1954.0,1222.0,1175.0
13193490,2021-11-25,78.12618260000005,31.25882800000005,Rhododendron arboreum,1966.0,1104.0,1481.0,2143.0,1240.0,1219.0


In [5]:
for band in ["B11", "B02", "B12", "B08", "B04", "B03"]:
    print(df[band].isna().value_counts())

B11
False    13193492
Name: count, dtype: int64
B02
False    13193388
True          104
Name: count, dtype: int64
B12
False    13193492
Name: count, dtype: int64
B08
False    13193492
Name: count, dtype: int64
B04
False    13193463
True           29
Name: count, dtype: int64
B03
False    13193459
True           33
Name: count, dtype: int64


In [6]:
# Extracting month and year
df["year"] = pd.to_datetime(df["date"]).dt.year
df["month"] = pd.to_datetime(df["date"]).dt.month
df.drop('date', axis=1, inplace=True)

# scaling the band values by a factor 0.0001
for band in ["B11", "B02", "B12", "B08", "B04", "B03"]:
    df[band] = df[band] * 0.0001

In [7]:
for band in ["B11", "B02", "B12", "B08", "B04", "B03"]:
    print(df[band].isna().value_counts())

B11
False    13193492
Name: count, dtype: int64
B02
False    13193388
True          104
Name: count, dtype: int64
B12
False    13193492
Name: count, dtype: int64
B08
False    13193492
Name: count, dtype: int64
B04
False    13193463
True           29
Name: count, dtype: int64
B03
False    13193459
True           33
Name: count, dtype: int64


In [8]:
def make_s2_indices(df):

    #### SENTINEL 2

    ## NDVI
    df["NDVI"] = (df["B08"] - df["B04"]) / (df["B08"] + df["B04"])
    temp_df = pd.DataFrame(df["NDVI"])

    ## Atmospherically Resistant Vegetation Index 2
    df["ARVI2"] = -0.18 + 1.17 * df["NDVI"]

    ## Blue-wide dynamic range vegetation index
    df["BWDRVI"] = (0.1 * (df["B08"] - df["B02"])) / (0.1 * (df["B08"] + df["B02"]))

    ## Chlorophyll vegetation index
    df["CVI"] = df["B08"] * (df["B04"] / (df["B03"].pow(2)))

    ## Corrected Transformed Vegetation Index
    temp_df["ndvi+0.5"] = temp_df["NDVI"] + 0.5
    temp_df["ctvi1"] = (temp_df["ndvi+0.5"]) / (temp_df["ndvi+0.5"].abs())
    temp_df["ctvi2"] = np.sqrt(temp_df["ndvi+0.5"].abs())
    df["CTVI"] = temp_df["ctvi1"] * temp_df["ctvi2"]

    ## Enhanced Vegetation Index 2 -2
    df["EVI2"] = 2.5 * ((df["B08"] - df["B04"]) / (df["B08"] + (2.4 * df["B04"]) + 1))

    ## Global Vegetation Moisture Index
    temp_df['gvmi_num'] = (df['B08'] + 0.1) - (df['B12'] + 0.02)
    temp_df['gvmi_den'] = (df['B08'] + 0.1) + (df['B12'] + 0.02)
    df['GVMI'] = temp_df['gvmi_num']/temp_df['gvmi_den']

    ## Modified Soil Adjusted Vegetation Index hyper
    temp_df['hyper_first_term'] = 2 * df['B08'] + 1
    temp_df['hyper_second'] = np.sqrt(temp_df['hyper_first_term'].pow(2) - (8 * (df['B08'] - df['B04'])))
    df['MSVAIhyper'] = 0.5 * (temp_df['hyper_first_term'] - temp_df['hyper_second'])

    ## Modified Triangular Vegetation Index 2
    temp_df['mtvi2_num'] = 1.5 * (1.2 * (df['B08'] - df['B03']) - 2.5 *(df['B04'] - df['B03']))
    temp_df['mtvi2_den'] = np.sqrt((2 * df['B08'] + 1).pow(2) - (6 * df['B08'] - 5*(np.sqrt(df['B04']))) - 0.5)
    df['MTVI2'] = temp_df['mtvi2_num']/temp_df['mtvi2_den']
    # MTVI2 = 1.5*(1.2 * (NIR - Green) - 2.5 * (Red - Green))√((2 * NIR + 1)²-(6 * NIR - 5√(Red)) - 0.5)


    ## Normalized Difference NIR/MIR Modified Normalized Difference Vegetation Index
    df['MNDVI'] = (df['B08'] - df['B12'])/(df['B08'] + df['B12'])

    ## Optimized Soil Adjusted Vegetation Index
    temp_df['Y'] = 0.16
    temp_df['osavi_num'] = df['B08'] - df['B04']
    temp_df['osavi_den'] = df['B08'] + df['B04'] + temp_df['Y']
    df['OSAVI'] = (temp_df['Y'] + 1) * (temp_df['osavi_num']/temp_df['osavi_den'])

    ## Perpendicular Vegetation Index
    pvi_a = 0.149
    pvi_ar = 0.374
    pvi_b = 0.735

    temp_df['pvi_first'] = 1/(np.sqrt(1 + (pvi_a*pvi_a)))
    temp_df['pvi_second'] = df['B08'] - pvi_ar - pvi_b
    df['PVI'] = temp_df['pvi_first'] * temp_df['pvi_second']

    ## Soil and Atmospherically Resistant Vegetation Index
    y = 0.735
    Rr = 0.740
    L = 0.487
    RB = 0.560

    temp_df['sarvi_num'] = df['B08'] - (Rr - y * (RB-Rr))
    temp_df['sarvi_den'] = (df['B08'] + -(Rr - y * (RB-Rr))) + L
    df['SARVI'] = (1+L) * (temp_df['sarvi_num'] / temp_df['sarvi_den'])

    ## Specific Leaf Area Vegetation Index
    df['SLAVI'] = df['B08'] / (df['B04'] + df['B12'])

    ## Transformed Soil Adjusted Vegetation Index 2
    tsavi2_a = 0.419
    tsavi2_b = 0.787
    temp_df['tsavi2_num'] = (tsavi2_a * df['B08']) - (tsavi2_a * df['B04']) - tsavi2_b
    temp_df['tsavi2_den'] = df['B04'] + (tsavi2_a * df['B08']) - (tsavi2_a * tsavi2_b)
    df['TSAVI2'] = temp_df['tsavi2_num'] / temp_df['tsavi2_den']

    ## Weighted Difference Vegetation Index
    wdvi_a = 0.752
    df['WDVI'] = df['B08'] - (wdvi_a * df['B04'])

    ## Wide Dynamic Range Vegetation Index
    df['WDRVI'] = (0.1*(df['B08'] - df['B04']))/(0.1*(df['B08'] + df['B04']))

    return df

df_indices = make_s2_indices(df)
df_indices

Unnamed: 0,long,lat,species_names,B11,B02,B12,B08,B04,B03,year,month,NDVI,ARVI2,BWDRVI,CVI,CTVI,EVI2,GVMI,MSVAIhyper,MTVI2,MNDVI,OSAVI,PVI,SARVI,SLAVI,TSAVI2,WDVI,WDRVI
0,76.98762370000003,31.571770300000026,Pinus roxburghii,0.2234,0.0376,0.1084,0.3340,0.0468,0.0739,2019,8,0.754202,0.702416,0.797632,2.862223,1.119911,0.496432,0.543385,0.485925,0.488624,0.509946,0.616036,-0.766538,15.603355,2.152062,4.661752,0.298806,0.754202
1,76.98770870000004,31.571733300000062,Pinus roxburghii,0.2234,0.0240,0.1084,0.3374,0.0248,0.0496,2019,8,0.863059,0.829779,0.867183,3.401210,1.167501,0.559445,0.546129,0.561649,0.591280,0.513683,0.694401,-0.763175,16.605351,2.533033,4.010337,0.318750,0.863059
2,76.98786920000003,31.571645600000068,Pinus roxburghii,0.1826,0.0269,0.0823,0.3458,0.0216,0.0525,2019,8,0.882417,0.852428,0.855648,2.709943,1.175762,0.579906,0.626710,0.587000,0.636986,0.615510,0.713068,-0.754867,19.820392,3.328200,3.988417,0.329557,0.882417
3,76.98792120000007,31.572662200000025,Pinus roxburghii,0.2037,0.0345,0.0971,0.3754,0.0359,0.0699,2019,8,0.825432,0.785755,0.831666,2.758255,1.151274,0.580715,0.604726,0.579892,0.603412,0.588995,0.689340,-0.725590,74.635384,2.822556,4.721350,0.348403,0.825432
4,76.98797090000005,31.57163410000004,Pinus roxburghii,0.1826,0.0247,0.0823,0.3248,0.0245,0.0529,2019,8,0.859719,0.825872,0.858655,2.843615,1.166070,0.542606,0.611838,0.542497,0.580178,0.595677,0.683974,-0.775637,13.456736,3.041199,3.908532,0.306376,0.859719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13193487,78.12497830000007,31.25865170000003,Rhododendron arboreum,0.1781,0.1115,0.1436,0.1827,0.1196,0.1221,2021,11,0.208733,0.064218,0.242012,1.465677,0.841863,0.107332,0.266861,0.099708,0.083817,0.119828,0.158330,-0.916186,5.061378,0.694149,5.692750,0.092761,0.208733
13193488,78.12522000000007,31.258880000000033,Rhododendron arboreum,0.1657,0.1175,0.1431,0.1704,0.1216,0.1230,2021,11,0.167123,0.015534,0.183744,1.369597,0.816776,0.083434,0.247520,0.077242,0.063742,0.087081,0.125239,-0.928351,4.856795,0.643748,5.605284,0.078957,0.167123
13193489,78.12614830000007,31.258660000000077,Rhododendron arboreum,0.1678,0.1113,0.1389,0.1954,0.1222,0.1175,2021,11,0.230479,0.089660,0.274209,1.729498,0.854680,0.122928,0.300462,0.114727,0.086477,0.169010,0.177789,-0.903624,5.300423,0.748372,6.017877,0.103506,0.230479
13193490,78.12618260000005,31.25882800000005,Rhododendron arboreum,0.1966,0.1104,0.1481,0.2143,0.1240,0.1219,2021,11,0.266923,0.132300,0.319988,1.788284,0.875741,0.149315,0.303068,0.140171,0.111598,0.182671,0.210211,-0.884931,5.721906,0.787578,6.460468,0.121052,0.266923


In [9]:
for col in df_indices.columns:
    print(df_indices[col].isna().value_counts())


long
False    13193492
Name: count, dtype: int64
lat
False    13193492
Name: count, dtype: int64
species_names
False    13193492
Name: count, dtype: int64
B11
False    13193492
Name: count, dtype: int64
B02
False    13193388
True          104
Name: count, dtype: int64
B12
False    13193492
Name: count, dtype: int64
B08
False    13193492
Name: count, dtype: int64
B04
False    13193463
True           29
Name: count, dtype: int64
B03
False    13193459
True           33
Name: count, dtype: int64
year
False    13193492
Name: count, dtype: int64
month
False    13193492
Name: count, dtype: int64
NDVI
False    13193463
True           29
Name: count, dtype: int64
ARVI2
False    13193463
True           29
Name: count, dtype: int64
BWDRVI
False    13193388
True          104
Name: count, dtype: int64
CVI
False    13193430
True           62
Name: count, dtype: int64
CTVI
False    13193463
True           29
Name: count, dtype: int64
EVI2
False    13193463
True           29
Name: count, dtype: int64


### Making Monthly Medians

Monthly medians, as expected, have misisng values afte rmedian computations

In [10]:
index_cols = ["long", "lat", "year", "month", "species_names"]
#making monthlu medians
df_month_medians = (
    df_indices.groupby(index_cols)
    .median()
    .reset_index()
)

df_month_medians

Unnamed: 0,long,lat,year,month,species_names,B11,B02,B12,B08,B04,B03,NDVI,ARVI2,BWDRVI,CVI,CTVI,EVI2,GVMI,MSVAIhyper,MTVI2,MNDVI,OSAVI,PVI,SARVI,SLAVI,TSAVI2,WDVI,WDRVI
0,75.66669580000007,32.25107380000003,2018,2,Bambusoideae spp.,0.26240,0.14080,0.20730,0.28920,0.16830,0.16120,0.264262,0.129187,0.345116,1.873063,0.874221,0.178517,0.262612,0.171918,0.134151,0.164955,0.227116,-0.810849,9.022578,0.769968,18.281425,0.162638,0.264262
1,75.66669580000007,32.25107380000003,2018,3,Bambusoideae spp.,0.28310,0.13900,0.22460,0.31860,0.17640,0.17100,0.287273,0.156109,0.392483,1.921994,0.887284,0.204080,0.262364,0.197548,0.159463,0.173049,0.251835,-0.781770,12.344106,0.794514,36.628039,0.185947,0.287273
2,75.66669580000007,32.25107380000003,2018,4,Bambusoideae spp.,0.28970,0.15440,0.23560,0.34180,0.17550,0.17720,0.321477,0.196128,0.377670,1.910389,0.906354,0.235820,0.266992,0.228589,0.196250,0.183928,0.284819,-0.758823,18.134563,0.831428,64.981728,0.209824,0.321477
3,75.66669580000007,32.25107380000003,2018,5,Bambusoideae spp.,0.32590,0.21600,0.26770,0.32420,0.23600,0.23600,0.157444,0.004209,0.200296,1.373729,0.810829,0.116630,0.191740,0.115042,0.096600,0.095455,0.142061,-0.776231,13.339193,0.643637,-17.821364,0.146728,0.157444
4,75.66669580000007,32.25107380000003,2018,6,Bambusoideae spp.,0.32320,0.16480,0.26840,0.34100,0.19560,0.19280,0.270965,0.137029,0.348359,1.794358,0.878046,0.200780,0.209213,0.195647,0.162253,0.119134,0.242125,-0.759614,17.833930,0.734914,-83.208503,0.193909,0.270965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9478188,78.12624330000006,31.258713300000068,2024,8,Rhododendron arboreum,0.21590,0.11680,0.16170,0.33150,0.11900,0.12700,0.471698,0.371887,0.478920,2.445812,0.985747,0.328520,0.407371,0.315368,0.281402,0.344282,0.403767,-0.769010,14.947390,1.180976,9.713553,0.242012,0.471698
9478189,78.12624330000006,31.258713300000068,2024,9,Rhododendron arboreum,0.19565,0.11470,0.14720,0.32305,0.11240,0.14260,0.482899,0.384992,0.475040,1.793129,0.991329,0.330430,0.433184,0.316796,0.313829,0.373414,0.409749,-0.777368,13.982224,1.244150,8.548427,0.238525,0.482899
9478190,78.12624330000006,31.258713300000068,2024,10,Rhododendron arboreum,0.18175,0.09995,0.12895,0.22385,0.10470,0.10710,0.361852,0.243366,0.382070,2.044298,0.928305,0.201748,0.369414,0.189244,0.160179,0.268025,0.282372,-0.875485,6.000114,0.959065,5.628229,0.145116,0.361852
9478191,78.12624330000006,31.258713300000068,2024,11,Rhododendron arboreum,0.18195,0.12695,0.14900,0.21795,0.13615,0.13635,0.230987,0.090255,0.266684,1.596637,0.854978,0.132369,0.305819,0.124763,0.101885,0.187783,0.184541,-0.881321,5.817910,0.764326,7.375042,0.115565,0.230987


### Making Season Medians 

No missing values in seasonal medians

In [11]:
# creating seasons
season_index_cols = ["long", "lat", "year", "season", "species_names"]


conds = [df_month_medians["month"].isin([3, 4, 5]), df_month_medians["month"].isin([10, 11, 12])]
opts = ["summer", "winter"]


df_month_medians["season"] = np.select(conds, opts, default="NA")
df_season_medians = df_month_medians[~(df_month_medians["season"] == "NA")]


df_season_medians = df_season_medians.groupby(season_index_cols).median().reset_index()
df_season_medians

Unnamed: 0,long,lat,year,season,species_names,month,B11,B02,B12,B08,B04,B03,NDVI,ARVI2,BWDRVI,CVI,CTVI,EVI2,GVMI,MSVAIhyper,MTVI2,MNDVI,OSAVI,PVI,SARVI,SLAVI,TSAVI2,WDVI,WDRVI
0,75.66669580000007,32.25107380000003,2018,summer,Bambusoideae spp.,4.0,0.28970,0.15440,0.23560,0.32420,0.17640,0.17720,0.287273,0.156109,0.377670,1.910389,0.887284,0.204080,0.262364,0.197548,0.159463,0.173049,0.251835,-0.776231,13.339193,0.794514,36.628039,0.185947,0.287273
1,75.66669580000007,32.25107380000003,2018,winter,Bambusoideae spp.,11.0,0.24600,0.13630,0.18410,0.34990,0.14930,0.15820,0.411741,0.301737,0.439325,2.151097,0.954851,0.300179,0.387179,0.289427,0.258520,0.324125,0.361074,-0.750811,21.943751,1.078607,18.783879,0.240258,0.411741
2,75.66669580000007,32.25107380000003,2019,summer,Bambusoideae spp.,4.5,0.25885,0.11280,0.19540,0.29230,0.13280,0.12965,0.400403,0.288472,0.477402,2.683532,0.947785,0.249000,0.295781,0.237530,0.196948,0.200573,0.325497,-0.807782,18.051811,0.916986,-36.203756,0.192434,0.400403
3,75.66669580000007,32.25107380000003,2019,winter,Bambusoideae spp.,10.5,0.25620,0.14105,0.18470,0.38535,0.15455,0.16150,0.427497,0.320172,0.465031,2.293912,0.963061,0.328547,0.406719,0.317695,0.283551,0.352000,0.382530,-0.715748,34.403773,1.136096,50.859088,0.269128,0.427497
4,75.66669580000007,32.25107380000003,2020,summer,Bambusoideae spp.,4.0,0.27645,0.14465,0.21670,0.34525,0.16110,0.16560,0.360856,0.242202,0.407303,2.025796,0.927653,0.264676,0.305009,0.255691,0.223837,0.226889,0.318257,-0.755411,170.718932,0.911091,140.143094,0.224103,0.360856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084693,78.12624330000006,31.258713300000068,2022,winter,Rhododendron arboreum,11.0,0.16190,0.13255,0.12955,0.17620,0.11455,0.12110,0.212149,0.068215,0.168492,1.384757,0.843875,0.106228,0.297474,0.098332,0.088290,0.152568,0.158703,-0.922615,4.950380,0.721901,5.386990,0.090058,0.212149
2084694,78.12624330000006,31.258713300000068,2023,summer,Rhododendron arboreum,4.0,0.23905,0.13985,0.17975,0.33210,0.14780,0.16015,0.384380,0.269725,0.311191,1.728078,0.940341,0.273302,0.367826,0.263232,0.251601,0.297785,0.334304,-0.768417,5.257732,1.015502,17.123828,0.220954,0.384380
2084695,78.12624330000006,31.258713300000068,2023,winter,Rhododendron arboreum,11.0,0.16420,0.09895,0.13355,0.17600,0.10340,0.10235,0.259935,0.124124,0.280555,1.770120,0.871708,0.127468,0.279856,0.117653,0.093834,0.130750,0.191703,-0.922813,4.946996,0.737550,4.958911,0.098243,0.259935
2084696,78.12624330000006,31.258713300000068,2024,summer,Rhododendron arboreum,4.0,0.22115,0.18330,0.17425,0.30525,0.15660,0.16970,0.272490,0.138814,0.234178,1.477287,0.876428,0.194787,0.355068,0.188081,0.183950,0.275292,0.238455,-0.794974,6.944189,0.878359,26.076134,0.176884,0.272490


### Splitting Testing and Training
All data from 2024 will be assigned for testing and data from 2018 - 2023 (6 years) will be used for training.

In [12]:
sent2_train_season_medians = df_season_medians[df_season_medians['year']!=2024]
sent2_test_season_medians = df_season_medians[df_season_medians['year']==2024]


sent2_train_season_medians = sent2_train_season_medians.drop(["month"], axis=1)
sent2_test_season_medians = sent2_test_season_medians.drop(["month"], axis=1)

sent2_test_season_medians

Unnamed: 0,long,lat,year,season,species_names,B11,B02,B12,B08,B04,B03,NDVI,ARVI2,BWDRVI,CVI,CTVI,EVI2,GVMI,MSVAIhyper,MTVI2,MNDVI,OSAVI,PVI,SARVI,SLAVI,TSAVI2,WDVI,WDRVI
12,75.66669580000007,32.25107380000003,2024,summer,Bambusoideae spp.,0.29080,0.14230,0.22880,0.33480,0.16720,0.16640,0.285961,0.154575,0.405837,2.065170,0.886545,0.209016,0.254991,0.203063,0.154652,0.155412,0.253745,-0.765746,15.826980,0.775898,37.068751,0.195003,0.285961
13,75.66669580000007,32.25107380000003,2024,winter,Bambusoideae spp.,0.25830,0.15780,0.19190,0.34770,0.15900,0.17030,0.372410,0.255719,0.375668,1.906218,0.934029,0.272798,0.354750,0.263581,0.239437,0.285022,0.328322,-0.752987,20.746814,1.000841,28.242038,0.228759,0.372410
26,75.66676390000003,32.25073910000003,2024,summer,Bambusoideae spp.,0.31710,0.13660,0.24580,0.34540,0.15810,0.16480,0.345016,0.223668,0.421399,2.035980,0.919247,0.253276,0.221278,0.245149,0.204307,0.133574,0.305154,-0.755262,19.636599,0.799167,29.353192,0.218914,0.345016
27,75.66676390000003,32.25073910000003,2024,winter,Bambusoideae spp.,0.27550,0.15180,0.19890,0.33450,0.15340,0.16380,0.372159,0.255426,0.375694,1.912466,0.933895,0.283844,0.337335,0.273260,0.232773,0.262741,0.336195,-0.766043,15.742295,0.971819,15.439635,0.226362,0.372159
40,75.66686810000004,32.25026770000005,2024,summer,Bambusoideae spp.,0.32890,0.14420,0.25470,0.33860,0.16220,0.17200,0.317657,0.191659,0.388559,1.992846,0.904244,0.219832,0.221538,0.213420,0.181254,0.120755,0.271958,-0.761988,16.993831,0.767839,57.208149,0.201435,0.317657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084669,78.12614830000007,31.258660000000077,2024,winter,Rhododendron arboreum,0.18220,0.13045,0.14535,0.22275,0.13635,0.13045,0.244736,0.106341,0.265971,1.809724,0.862974,0.142331,0.318520,0.134378,0.101822,0.206402,0.196709,-0.876573,5.946358,0.790678,7.559093,0.122165,0.244736
2084682,78.12618260000005,31.25882800000005,2024,summer,Rhododendron arboreum,0.22280,0.17600,0.17290,0.27855,0.15610,0.17300,0.273971,0.140546,0.223813,1.467303,0.878410,0.181560,0.326215,0.174169,0.169701,0.235362,0.232748,-0.821382,8.518848,0.841399,13.304565,0.159283,0.273971
2084683,78.12618260000005,31.25882800000005,2024,winter,Rhododendron arboreum,0.20700,0.12365,0.15390,0.24995,0.14150,0.13975,0.277155,0.144271,0.338736,1.827620,0.881544,0.170599,0.303257,0.162164,0.131382,0.198699,0.228187,-0.849670,6.837432,0.809950,8.896813,0.143542,0.277155
2084696,78.12624330000006,31.258713300000068,2024,summer,Rhododendron arboreum,0.22115,0.18330,0.17425,0.30525,0.15660,0.16970,0.272490,0.138814,0.234178,1.477287,0.876428,0.194787,0.355068,0.188081,0.183950,0.275292,0.238455,-0.794974,6.944189,0.878359,26.076134,0.176884,0.272490


In [13]:
sent2_train_season_medians

Unnamed: 0,long,lat,year,season,species_names,B11,B02,B12,B08,B04,B03,NDVI,ARVI2,BWDRVI,CVI,CTVI,EVI2,GVMI,MSVAIhyper,MTVI2,MNDVI,OSAVI,PVI,SARVI,SLAVI,TSAVI2,WDVI,WDRVI
0,75.66669580000007,32.25107380000003,2018,summer,Bambusoideae spp.,0.289700,0.154400,0.23560,0.324200,0.17640,0.17720,0.287273,0.156109,0.377670,1.910389,0.887284,0.204080,0.262364,0.197548,0.159463,0.173049,0.251835,-0.776231,13.339193,0.794514,36.628039,0.185947,0.287273
1,75.66669580000007,32.25107380000003,2018,winter,Bambusoideae spp.,0.246000,0.136300,0.18410,0.349900,0.14930,0.15820,0.411741,0.301737,0.439325,2.151097,0.954851,0.300179,0.387179,0.289427,0.258520,0.324125,0.361074,-0.750811,21.943751,1.078607,18.783879,0.240258,0.411741
2,75.66669580000007,32.25107380000003,2019,summer,Bambusoideae spp.,0.258850,0.112800,0.19540,0.292300,0.13280,0.12965,0.400403,0.288472,0.477402,2.683532,0.947785,0.249000,0.295781,0.237530,0.196948,0.200573,0.325497,-0.807782,18.051811,0.916986,-36.203756,0.192434,0.400403
3,75.66669580000007,32.25107380000003,2019,winter,Bambusoideae spp.,0.256200,0.141050,0.18470,0.385350,0.15455,0.16150,0.427497,0.320172,0.465031,2.293912,0.963061,0.328547,0.406719,0.317695,0.283551,0.352000,0.382530,-0.715748,34.403773,1.136096,50.859088,0.269128,0.427497
4,75.66669580000007,32.25107380000003,2020,summer,Bambusoideae spp.,0.276450,0.144650,0.21670,0.345250,0.16110,0.16560,0.360856,0.242202,0.407303,2.025796,0.927653,0.264676,0.305009,0.255691,0.223837,0.226889,0.318257,-0.755411,170.718932,0.911091,140.143094,0.224103,0.360856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084691,78.12624330000006,31.258713300000068,2021,winter,Rhododendron arboreum,0.166850,0.158050,0.13680,0.197950,0.11840,0.13760,0.157806,0.004633,0.112399,1.365192,0.809671,0.095915,0.238653,0.092677,0.069931,0.115905,0.128573,-0.901102,4.632840,0.663562,5.083158,0.108913,0.157806
2084692,78.12624330000006,31.258713300000068,2022,summer,Rhododendron arboreum,0.259825,0.137625,0.18465,0.336225,0.13430,0.14315,0.423974,0.316050,0.409618,2.188334,0.961068,0.301349,0.358958,0.289373,0.259761,0.285641,0.366384,-0.764337,-142.994178,1.046054,25.943279,0.235231,0.423974
2084693,78.12624330000006,31.258713300000068,2022,winter,Rhododendron arboreum,0.161900,0.132550,0.12955,0.176200,0.11455,0.12110,0.212149,0.068215,0.168492,1.384757,0.843875,0.106228,0.297474,0.098332,0.088290,0.152568,0.158703,-0.922615,4.950380,0.721901,5.386990,0.090058,0.212149
2084694,78.12624330000006,31.258713300000068,2023,summer,Rhododendron arboreum,0.239050,0.139850,0.17975,0.332100,0.14780,0.16015,0.384380,0.269725,0.311191,1.728078,0.940341,0.273302,0.367826,0.263232,0.251601,0.297785,0.334304,-0.768417,5.257732,1.015502,17.123828,0.220954,0.384380


### Widening the frame

#### Widening Train

In [14]:
sent2_train_season_medians_wide = sent2_train_season_medians.pivot(index=['long', 'lat', 'year','species_names'], columns=['season']).reset_index()
new_cols = ["".join(x) for x in sent2_train_season_medians_wide.columns]
sent2_train_season_medians_wide.columns = new_cols
sent2_train_season_medians_wide

Unnamed: 0,long,lat,year,species_names,B11summer,B11winter,B02summer,B02winter,B12summer,B12winter,B08summer,B08winter,B04summer,B04winter,B03summer,B03winter,NDVIsummer,NDVIwinter,ARVI2summer,ARVI2winter,BWDRVIsummer,BWDRVIwinter,CVIsummer,CVIwinter,CTVIsummer,CTVIwinter,EVI2summer,EVI2winter,GVMIsummer,GVMIwinter,MSVAIhypersummer,MSVAIhyperwinter,MTVI2summer,MTVI2winter,MNDVIsummer,MNDVIwinter,OSAVIsummer,OSAVIwinter,PVIsummer,PVIwinter,SARVIsummer,SARVIwinter,SLAVIsummer,SLAVIwinter,TSAVI2summer,TSAVI2winter,WDVIsummer,WDVIwinter,WDRVIsummer,WDRVIwinter
0,75.66669580000007,32.25107380000003,2018,Bambusoideae spp.,0.289700,0.24600,0.154400,0.13630,0.235600,0.18410,0.324200,0.34990,0.17640,0.14930,0.17720,0.15820,0.287273,0.411741,0.156109,0.301737,0.377670,0.439325,1.910389,2.151097,0.887284,0.954851,0.204080,0.300179,0.262364,0.387179,0.197548,0.289427,0.159463,0.258520,0.173049,0.324125,0.251835,0.361074,-0.776231,-0.750811,13.339193,21.943751,0.794514,1.078607,36.628039,18.783879,0.185947,0.240258,0.287273,0.411741
1,75.66669580000007,32.25107380000003,2019,Bambusoideae spp.,0.258850,0.25620,0.112800,0.14105,0.195400,0.18470,0.292300,0.38535,0.13280,0.15455,0.12965,0.16150,0.400403,0.427497,0.288472,0.320172,0.477402,0.465031,2.683532,2.293912,0.947785,0.963061,0.249000,0.328547,0.295781,0.406719,0.237530,0.317695,0.196948,0.283551,0.200573,0.352000,0.325497,0.382530,-0.807782,-0.715748,18.051811,34.403773,0.916986,1.136096,-36.203756,50.859088,0.192434,0.269128,0.400403,0.427497
2,75.66669580000007,32.25107380000003,2020,Bambusoideae spp.,0.276450,0.25100,0.144650,0.13035,0.216700,0.18535,0.345250,0.35955,0.16110,0.15440,0.16560,0.15675,0.360856,0.397863,0.242202,0.285500,0.407303,0.466377,2.025796,2.256043,0.927653,0.947438,0.264676,0.296029,0.305009,0.381638,0.255691,0.285919,0.223837,0.248274,0.226889,0.318541,0.318257,0.352093,-0.755411,-0.741267,170.718932,189.844088,0.911091,1.057878,140.143094,33.429727,0.224103,0.243441,0.360856,0.397863
3,75.66669580000007,32.25107380000003,2021,Bambusoideae spp.,0.292600,0.17570,0.150000,0.07620,0.239950,0.10050,0.319600,0.28050,0.18160,0.08550,0.17520,0.09650,0.275399,0.530091,0.142217,0.440206,0.361201,0.570220,1.895898,2.556117,0.880560,1.014934,0.196552,0.346411,0.234981,0.516937,0.190535,0.334204,0.152337,0.284346,0.142406,0.469517,0.242141,0.427112,-0.780781,-0.819454,12.514366,8.397010,0.758327,1.496774,55.341681,5.534139,0.183037,0.223724,0.275399,0.530091
4,75.66669580000007,32.25107380000003,2022,Bambusoideae spp.,0.307800,0.24770,0.153600,0.14870,0.234000,0.18220,0.325900,0.36890,0.19020,0.15840,0.17780,0.16700,0.270070,0.417861,0.135982,0.308897,0.365854,0.434734,1.957255,2.142705,0.877536,0.958051,0.195165,0.321921,0.268881,0.409546,0.189574,0.311482,0.145198,0.280120,0.182296,0.355349,0.238824,0.374587,-0.774549,-0.732019,13.678397,16.993831,0.789678,1.128400,123.528038,27.768672,0.185050,0.259860,0.270070,0.417861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893437,78.12624330000006,31.258713300000068,2019,Rhododendron arboreum,0.243600,0.16170,0.140525,0.11845,0.178200,0.13325,0.346600,0.20275,0.13695,0.12440,0.14730,0.12090,0.416348,0.239591,0.307127,0.100321,0.403482,0.263483,2.149144,1.731773,0.955703,0.859991,0.307733,0.130475,0.378621,0.312071,0.295786,0.122100,0.268972,0.094354,0.308246,0.184026,0.364786,0.186600,-0.754075,-0.896355,-4.395381,5.455463,1.089953,0.767643,25.292495,6.274160,0.243614,0.109201,0.416348,0.239591
893438,78.12624330000006,31.258713300000068,2020,Rhododendron arboreum,0.221750,0.15530,0.145950,0.15385,0.184700,0.13620,0.316400,0.19710,0.14870,0.13250,0.18420,0.14450,0.369943,0.220117,0.252834,0.077537,0.353816,0.152081,1.620617,1.446515,0.932707,0.848594,0.251145,0.118107,0.344915,0.314275,0.240886,0.110411,0.232998,0.097110,0.264865,0.186134,0.315786,0.170207,-0.783946,-0.901943,10.940422,5.308156,0.960566,0.749256,-94.687463,6.494663,0.200094,0.102273,0.369943,0.220117
893439,78.12624330000006,31.258713300000068,2021,Rhododendron arboreum,0.220675,0.16685,0.170025,0.15805,0.172975,0.13680,0.337800,0.19795,0.15355,0.11840,0.16145,0.13760,0.376100,0.157806,0.260037,0.004633,0.333711,0.112399,1.984258,1.365192,0.935259,0.809671,0.270909,0.095915,0.387983,0.238653,0.261325,0.092677,0.233481,0.069931,0.322371,0.115905,0.328745,0.128573,-0.762779,-0.901102,19.763274,4.632840,1.047446,0.663562,96.109345,5.083158,0.222330,0.108913,0.376100,0.157806
893440,78.12624330000006,31.258713300000068,2022,Rhododendron arboreum,0.259825,0.16190,0.137625,0.13255,0.184650,0.12955,0.336225,0.17620,0.13430,0.11455,0.14315,0.12110,0.423974,0.212149,0.316050,0.068215,0.409618,0.168492,2.188334,1.384757,0.961068,0.843875,0.301349,0.106228,0.358958,0.297474,0.289373,0.098332,0.259761,0.088290,0.285641,0.152568,0.366384,0.158703,-0.764337,-0.922615,-142.994178,4.950380,1.046054,0.721901,25.943279,5.386990,0.235231,0.090058,0.423974,0.212149


In [15]:
for col in sent2_train_season_medians_wide.columns:
    print(pd.Series(sent2_train_season_medians_wide[col].isna().value_counts()))

long
False    893442
Name: count, dtype: int64
lat
False    893442
Name: count, dtype: int64
year
False    893442
Name: count, dtype: int64
species_names
False    893442
Name: count, dtype: int64
B11summer
False    893442
Name: count, dtype: int64
B11winter
False    893442
Name: count, dtype: int64
B02summer
False    893442
Name: count, dtype: int64
B02winter
False    893442
Name: count, dtype: int64
B12summer
False    893442
Name: count, dtype: int64
B12winter
False    893442
Name: count, dtype: int64
B08summer
False    893442
Name: count, dtype: int64
B08winter
False    893442
Name: count, dtype: int64
B04summer
False    893442
Name: count, dtype: int64
B04winter
False    893442
Name: count, dtype: int64
B03summer
False    893442
Name: count, dtype: int64
B03winter
False    893442
Name: count, dtype: int64
NDVIsummer
False    893442
Name: count, dtype: int64
NDVIwinter
False    893442
Name: count, dtype: int64
ARVI2summer
False    893442
Name: count, dtype: int64
ARVI2winter
False   

##### Widen Test

In [16]:
sent2_test_season_medians_wide = sent2_test_season_medians.pivot(index=['long', 'lat', 'year','species_names'], columns=['season']).reset_index()
new_cols = ["".join(x) for x in sent2_test_season_medians_wide.columns]
sent2_test_season_medians_wide.columns = new_cols
sent2_test_season_medians_wide

Unnamed: 0,long,lat,year,species_names,B11summer,B11winter,B02summer,B02winter,B12summer,B12winter,B08summer,B08winter,B04summer,B04winter,B03summer,B03winter,NDVIsummer,NDVIwinter,ARVI2summer,ARVI2winter,BWDRVIsummer,BWDRVIwinter,CVIsummer,CVIwinter,CTVIsummer,CTVIwinter,EVI2summer,EVI2winter,GVMIsummer,GVMIwinter,MSVAIhypersummer,MSVAIhyperwinter,MTVI2summer,MTVI2winter,MNDVIsummer,MNDVIwinter,OSAVIsummer,OSAVIwinter,PVIsummer,PVIwinter,SARVIsummer,SARVIwinter,SLAVIsummer,SLAVIwinter,TSAVI2summer,TSAVI2winter,WDVIsummer,WDVIwinter,WDRVIsummer,WDRVIwinter
0,75.66669580000007,32.25107380000003,2024,Bambusoideae spp.,0.29080,0.25830,0.14230,0.15780,0.22880,0.19190,0.33480,0.34770,0.16720,0.15900,0.1664,0.17030,0.285961,0.372410,0.154575,0.255719,0.405837,0.375668,2.065170,1.906218,0.886545,0.934029,0.209016,0.272798,0.254991,0.354750,0.203063,0.263581,0.154652,0.239437,0.155412,0.285022,0.253745,0.328322,-0.765746,-0.752987,15.826980,20.746814,0.775898,1.000841,37.068751,28.242038,0.195003,0.228759,0.285961,0.372410
1,75.66676390000003,32.25073910000003,2024,Bambusoideae spp.,0.31710,0.27550,0.13660,0.15180,0.24580,0.19890,0.34540,0.33450,0.15810,0.15340,0.1648,0.16380,0.345016,0.372159,0.223668,0.255426,0.421399,0.375694,2.035980,1.912466,0.919247,0.933895,0.253276,0.283844,0.221278,0.337335,0.245149,0.273260,0.204307,0.232773,0.133574,0.262741,0.305154,0.336195,-0.755262,-0.766043,19.636599,15.742295,0.799167,0.971819,29.353192,15.439635,0.218914,0.226362,0.345016,0.372159
2,75.66686810000004,32.25026770000005,2024,Bambusoideae spp.,0.32890,0.28610,0.14420,0.15360,0.25470,0.20680,0.33860,0.37100,0.16220,0.15900,0.1720,0.17120,0.317657,0.400000,0.191659,0.288000,0.388559,0.414411,1.992846,2.012628,0.904244,0.948683,0.219832,0.302408,0.221538,0.348604,0.213420,0.292518,0.181254,0.267172,0.120755,0.282627,0.271958,0.356406,-0.761988,-0.729942,16.993831,34.403773,0.767839,1.027362,57.208149,23.117523,0.201435,0.252455,0.317657,0.400000
3,75.66719240000003,32.249825100000066,2024,Bambusoideae spp.,0.32510,0.27620,0.14340,0.15570,0.25590,0.19410,0.32540,0.34350,0.15900,0.15620,0.1667,0.17140,0.305036,0.406310,0.176892,0.295382,0.364361,0.382651,1.984213,1.934911,0.897238,0.952003,0.206904,0.287520,0.214636,0.348844,0.198928,0.276591,0.177901,0.248658,0.112903,0.277902,0.261741,0.352054,-0.775044,-0.757141,13.576633,15.099199,0.752091,0.996998,31.677736,14.018858,0.188160,0.226670,0.305036,0.406310
4,75.66729090000007,32.24962370000003,2024,Bambusoideae spp.,0.32770,0.27370,0.13640,0.15710,0.25180,0.19400,0.34350,0.33040,0.15520,0.14800,0.1603,0.16480,0.349971,0.426893,0.229466,0.319465,0.423539,0.424003,2.084338,2.123584,0.921938,0.962753,0.255823,0.298361,0.202223,0.341800,0.247436,0.286588,0.210158,0.255555,0.112010,0.266512,0.308859,0.367321,-0.757141,-0.770098,18.811617,14.103185,0.781214,1.019590,29.773170,11.618175,0.219119,0.228862,0.349971,0.426893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148902,78.12497830000007,31.25865170000003,2024,Rhododendron arboreum,0.21160,0.17545,0.16735,0.12860,0.17215,0.14170,0.25400,0.20655,0.15155,0.14475,0.1650,0.13740,0.237574,0.165224,0.097962,0.013313,0.194441,0.223169,1.435463,1.551780,0.855441,0.815613,0.156266,0.092444,0.302340,0.305836,0.149567,0.087003,0.144799,0.073740,0.196189,0.179411,0.200646,0.131142,-0.845664,-0.892596,5.744313,5.542639,0.775235,0.708139,9.573166,7.617956,0.140034,0.093198,0.237574,0.165224
148903,78.12522000000007,31.258880000000033,2024,Rhododendron arboreum,0.22900,0.17825,0.17680,0.12045,0.18470,0.14705,0.29230,0.19560,0.15390,0.15065,0.1825,0.14280,0.225503,0.119334,0.083838,-0.040380,0.198402,0.228580,1.487546,1.417172,0.849130,0.786970,0.163073,0.065693,0.320799,0.277850,0.157876,0.061775,0.146941,0.052782,0.228374,0.141676,0.198501,0.094295,-0.807782,-0.903427,5.810397,5.304682,0.794209,0.643412,-7.820830,7.795563,0.159572,0.078161,0.225503,0.119334
148904,78.12614830000007,31.258660000000077,2024,Rhododendron arboreum,0.21600,0.18220,0.17360,0.13045,0.16720,0.14535,0.29870,0.22275,0.15610,0.13635,0.1640,0.13045,0.280266,0.244736,0.147912,0.106341,0.249835,0.265971,1.628457,1.809724,0.879862,0.862974,0.200419,0.142331,0.358933,0.318520,0.193124,0.134378,0.180929,0.101822,0.275267,0.206402,0.244481,0.196709,-0.801452,-0.876573,7.422811,5.946358,0.899885,0.790678,20.893708,7.559093,0.177327,0.122165,0.280266,0.244736
148905,78.12618260000005,31.25882800000005,2024,Rhododendron arboreum,0.22280,0.20700,0.17600,0.12365,0.17290,0.15390,0.27855,0.24995,0.15610,0.14150,0.1730,0.13975,0.273971,0.277155,0.140546,0.144271,0.223813,0.338736,1.467303,1.827620,0.878410,0.881544,0.181560,0.170599,0.326215,0.303257,0.174169,0.162164,0.169701,0.131382,0.235362,0.198699,0.232748,0.228187,-0.821382,-0.849670,8.518848,6.837432,0.841399,0.809950,13.304565,8.896813,0.159283,0.143542,0.273971,0.277155


In [17]:
for col in sent2_test_season_medians_wide.columns:
    print(pd.Series(sent2_test_season_medians_wide[col].isna().value_counts()))

long
False    148907
Name: count, dtype: int64
lat
False    148907
Name: count, dtype: int64
year
False    148907
Name: count, dtype: int64
species_names
False    148907
Name: count, dtype: int64
B11summer
False    148907
Name: count, dtype: int64
B11winter
False    148907
Name: count, dtype: int64
B02summer
False    148907
Name: count, dtype: int64
B02winter
False    148907
Name: count, dtype: int64
B12summer
False    148907
Name: count, dtype: int64
B12winter
False    148907
Name: count, dtype: int64
B08summer
False    148907
Name: count, dtype: int64
B08winter
False    148907
Name: count, dtype: int64
B04summer
False    148907
Name: count, dtype: int64
B04winter
False    148907
Name: count, dtype: int64
B03summer
False    148907
Name: count, dtype: int64
B03winter
False    148907
Name: count, dtype: int64
NDVIsummer
False    148907
Name: count, dtype: int64
NDVIwinter
False    148907
Name: count, dtype: int64
ARVI2summer
False    148907
Name: count, dtype: int64
ARVI2winter
False   

### Exporting Frames

In [18]:
sent_2_month_medians_path = processed_folder.joinpath("sent_2_month_medians.parquet")
sent_2_season_medians_path = processed_folder.joinpath("sent_2_season_medians.parquet")

sent2_train_season_medians_widened_path = processed_folder.joinpath("sent2_train_season_medians_widened.parquet")
sent2_test_season_medians_widened_path = processed_folder.joinpath("sent2_test_season_medians_widened.parquet")

In [19]:
df_month_medians.to_parquet(sent_2_month_medians_path, index=False)
df_season_medians.to_parquet(sent_2_season_medians_path, index=False)

In [20]:
sent2_train_season_medians_wide.to_parquet(sent2_train_season_medians_widened_path, index=False)
sent2_test_season_medians_wide.to_parquet(sent2_test_season_medians_widened_path, index=False)