In [2]:
import os
import pandas as pd
import numpy as np

In [31]:
def add_date_info(df):
    """Parse the ``date`` column as epoch milliseconds and derive calendar columns.

    The frame is modified in place: ``date`` is overwritten with its datetime
    representation and ``Year`` / ``Month`` columns are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime(..., unit='ms')``
        can interpret.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    timestamps = pd.to_datetime(df['date'], unit='ms')
    df['date'] = timestamps
    df['Year'] = timestamps.dt.year
    df['Month'] = timestamps.dt.month
    return df

def rename_and_drop_columns(df):
    """Label the NDVI column with its month and strip helper columns.

    The month number is read from the first row of ``Month`` and used to
    rename ``NDVI`` to ``"<month>_NDVI"``. The ``system:index``, ``Timestamp``
    and ``Month`` columns are then removed. The input frame is not modified.

    NOTE(review): ``add_date_info`` produces a ``date`` column, not
    ``Timestamp`` -- confirm which schema this helper is meant to receive.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty frame containing ``NDVI``, ``Month``, ``system:index`` and
        ``Timestamp`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed NDVI column and the helper columns dropped.
    """
    first_month = df['Month'].iloc[0]
    renamed = df.rename(columns={'NDVI': f"{first_month}_NDVI"})
    return renamed.drop(columns=['system:index', 'Timestamp', 'Month'])


Load and Process July NDVI Data 

In [4]:
# Folder holding the July NDVI exports (one CSV per file).
directory_path_07_ndvi =r'c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\07_NDVI-20250121T045303Z-001\07_NDVI'

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
july_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the result reproducible across machines.
for file_name in sorted(os.listdir(directory_path_07_ndvi)):
    print(file_name)
    if not file_name.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(directory_path_07_ndvi, file_name)).dropna()
    # Keep only the year of each observation, then discard the export
    # bookkeeping columns and the raw date.
    df = df.assign(Year=pd.to_datetime(df['date']).dt.year)
    df = df.drop(columns=['.geo', 'system:index', 'date'])
    print(f'{file_name} has {df.shape} shape')
    july_frames.append(df)

# Fall back to an empty frame when the folder contains no CSV files.
ndvi_07_df = pd.concat(july_frames, ignore_index=True) if july_frames else pd.DataFrame()
ndvi_07_df = ndvi_07_df.rename(columns={'NDVI': '7_NDVI'})
ndvi_07_df


07-01_NEW_NDVI_2020_export.csv
07-01_NEW_NDVI_2020_export.csv has (720, 3) shape
07-01_NEW_NDVI_2021_export.csv
07-01_NEW_NDVI_2021_export.csv has (668, 3) shape
07-01_NEW_NDVI_2022_export.csv
07-01_NEW_NDVI_2022_export.csv has (478, 3) shape
07-01_NEW_NDVI_2023_export.csv
07-01_NEW_NDVI_2023_export.csv has (542, 3) shape
07-01_NEW_NDVI_2024_export.csv
07-01_NEW_NDVI_2024_export.csv has (487, 3) shape


Unnamed: 0,Field_ID,7_NDVI,Year
0,235Абайское,0.466679,2020
1,181Абайское,0.611387,2020
2,183Абайское,0.634810,2020
3,184Абайское,0.212849,2020
4,191Абайское,0.572347,2020
...,...,...,...
2890,2699Узунколь ПК,0.740203,2024
2891,2701Узунколь ПК,0.834017,2024
2892,2703Узунколь ПК,0.654343,2024
2893,2704Узунколь ПК,0.864275,2024


Load and Process June NDVI Data

In [5]:
# Folder holding the June NDVI exports (one CSV per file).
directory_path_06_ndvi =r'c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\06-NDVI-20250121T045301Z-001\06-NDVI'

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
june_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the result reproducible across machines.
for file_name in sorted(os.listdir(directory_path_06_ndvi)):
    print(file_name)
    if not file_name.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(directory_path_06_ndvi, file_name)).dropna()
    # Keep only the year of each observation, then discard the export
    # bookkeeping columns and the raw date.
    df = df.assign(Year=pd.to_datetime(df['date']).dt.year)
    df = df.drop(columns=['.geo', 'system:index', 'date'])
    print(f'{file_name} has {df.shape} shape')
    june_frames.append(df)

# Fall back to an empty frame when the folder contains no CSV files.
ndvi_06_df = pd.concat(june_frames, ignore_index=True) if june_frames else pd.DataFrame()
ndvi_06_df = ndvi_06_df.rename(columns={'NDVI': '6_NDVI'})
ndvi_06_df

06-01_NEW_NDVI_2020_export.csv
06-01_NEW_NDVI_2020_export.csv has (720, 3) shape
06-01_NEW_NDVI_2021_export.csv
06-01_NEW_NDVI_2021_export.csv has (668, 3) shape
06-01_NEW_NDVI_2022_export.csv
06-01_NEW_NDVI_2022_export.csv has (478, 3) shape
06-01_NEW_NDVI_2023_export.csv
06-01_NEW_NDVI_2023_export.csv has (542, 3) shape
06-01_NEW_NDVI_2024_export.csv
06-01_NEW_NDVI_2024_export.csv has (489, 3) shape


Unnamed: 0,Field_ID,6_NDVI,Year
0,235Абайское,0.259751,2020
1,181Абайское,0.220680,2020
2,183Абайское,0.232390,2020
3,184Абайское,0.505847,2020
4,191Абайское,0.220677,2020
...,...,...,...
2892,2699Узунколь ПК,0.240161,2024
2893,2701Узунколь ПК,0.275308,2024
2894,2703Узунколь ПК,0.185534,2024
2895,2704Узунколь ПК,0.341774,2024


In [6]:
# Outer-join the June and July NDVI tables on (Field_ID, Year) so a field that
# is present in only one month is kept, then order columns and rows.
merge_df = (
    ndvi_06_df
    .merge(ndvi_07_df, on=['Field_ID', 'Year'], how='outer')
    .loc[:, ['Field_ID', 'Year', '6_NDVI', '7_NDVI']]
    .astype({'Year': int})
    .sort_values(by=['Year', 'Field_ID'], ascending=[True, True])
    .reset_index(drop=True)
)
merge_df

Unnamed: 0,Field_ID,Year,6_NDVI,7_NDVI
0,1001Докучаева,2020,0.587879,0.490213
1,1002Докучаева,2020,0.216777,0.521416
2,1003Докучаева,2020,0.303708,0.479431
3,1004Докучаева,2020,0.537144,0.486303
4,1005Докучаева,2020,0.255808,0.529345
...,...,...,...,...
2892,967Раушан-Агро,2024,0.224627,0.701219
2893,969Раушан-Агро,2024,0.216753,0.700184
2894,979Раушан-Агро,2024,0.470643,0.670046
2895,980Раушан-Агро,2024,0.455206,0.702079


In [8]:
# Save the merged June/July NDVI table without the row index, encoded as
# cp1251 (Windows-1251).
merge_df.to_csv(
    r'c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\ML_Data\new_monthly_field_ndvi_2020-2024.csv',
    index=False,
    encoding='cp1251',
)

In [10]:
# csv_path_1 = r"C:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\8_day_NDVI_2020_export.csv"
# csv_path_2 = r"C:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\8_day_NDVI_2021_export.csv"
# csv_path_3 = r"C:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\8_day_NDVI_2022_export.csv"
# csv_path_4 = r"C:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\8_day_NDVI_2023_export.csv"
# csv_paths = [csv_path_1, csv_path_2, csv_path_3, csv_path_4]

# # Create an empty DataFrame to store the concatenated results
# all_ndvi_df = pd.DataFrame()
# for csv_path in csv_paths:
#     ndvi_df = pd.read_csv(csv_path)
#     ndvi_df = add_date_info(ndvi_df)
#     ndvi_df = ndvi_df.sort_values(['date', 'Field_ID'])
#     ndvi_df = ndvi_df.drop(columns=['system:index', '.geo', 'pixel_count', 'date'])
#     all_ndvi_df = pd.concat([all_ndvi_df, ndvi_df], ignore_index=True)
# all_ndvi_df

In [11]:
# pivoted_df = all_ndvi_df.pivot_table(
#     index=['Field_ID', 'Year'],
#     columns='Month',
#     values=['NDVI']
# )

# # Rename columns
# new_columns = [f"{col[1]}_{col[0]}" for col in pivoted_df.columns]
# pivoted_df.columns = new_columns

# # Reset index to make Field_ID and Year regular columns
# pivoted_df = pivoted_df.reset_index()

# print(pivoted_df.isnull().sum())
# print(pivoted_df.shape)

In [12]:
# coords_df = pd.read_csv(r"C:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\Unqiue_Coords_Wheat_Fields\filtered_coords.csv", encoding = 'cp1251')
# pivoted_with_coords = pd.merge(pivoted_df, coords_df, on=['Field_ID'], how='left')
# pivoted_with_coords


In [13]:
# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import StandardScaler
# columns_to_impute = ['6_NDVI']
# imputer = KNNImputer(n_neighbors=5)

# # Initialize the scaler
# scaler = StandardScaler()

# pivoted_with_coords[['lat_scaled', 'lon_scaled']] = scaler.fit_transform(pivoted_with_coords[['Latitude', 'Longitude']])

# # Prepare the features for imputation
# imputation_features = pivoted_with_coords[['lat_scaled', 'lon_scaled'] + columns_to_impute]

# # Perform imputation
# imputed_values = imputer.fit_transform(imputation_features)

# # Update the original DataFrame with imputed values
# pivoted_with_coords[columns_to_impute] = imputed_values[:, 2:]  
# pivoted_with_coords = pivoted_with_coords.drop(columns=['lat_scaled', 'lon_scaled' , 'Latitude', 'Longitude'], errors='ignore')
# print(pivoted_with_coords[columns_to_impute].isnull().sum())
# print(pivoted_with_coords.shape)
# pivoted_with_coords

Load Daily NDVI Values for the 2020-2024 Growing Seasons and Extract Each Field's Maximum NDVI per Season

In [14]:
# Folder holding the dated NDVI exports used to find each field's yearly peak.
directory_path_ndvi =r'c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\MAX_NDVI-20250121T045305Z-001\MAX_NDVI'

# Accumulate per-file frames in lists and concatenate once after the loop;
# concatenating inside the loop re-copies all earlier rows on every pass.
daily_frames = []        # every dated observation from every file
yearly_max_frames = []   # one row per (Field_ID, Year) from each file
# os.listdir returns entries in arbitrary order; sort for reproducibility.
for file_name in sorted(os.listdir(directory_path_ndvi)):
    if not file_name.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(directory_path_ndvi, file_name))
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['Year'] = df['date'].dt.year
    df = df.drop(columns=['.geo', 'system:index', 'imageID'])
    print(f'{file_name} has {df.shape} shape')
    daily_frames.append(df)

    # Sort NDVI descending within each (Field_ID, Year) so the first row of
    # every group is the maximum. NOTE: GroupBy.first() takes the first
    # non-null value per column, so a missing value on the peak row would be
    # filled from a later row of the same group.
    sorted_ndvi_df = df.sort_values(['Field_ID', 'Year', 'NDVI'], ascending=[True, True, False])
    max_df = sorted_ndvi_df.groupby(['Field_ID', 'Year']).first().reset_index()
    print(f'Max {file_name} has {max_df.shape} shape')
    yearly_max_frames.append(max_df)

# Fall back to empty frames when the folder contains no CSV files.
ndvi_df = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()
max_ndvi_df = pd.concat(yearly_max_frames, ignore_index=True) if yearly_max_frames else pd.DataFrame()

max_ndvi_df = max_ndvi_df.sort_values(['Field_ID', 'Year']).reset_index(drop=True)
max_ndvi_df = max_ndvi_df.rename(columns={'NDVI': 'MAX_NDVI'})
print(f'MAX NDVI DF has {max_ndvi_df.shape} shape')
max_ndvi_df

MAX_NDVI_2020_export.csv has (9722, 4) shape
Max MAX_NDVI_2020_export.csv has (720, 4) shape
MAX_NDVI_2021_export.csv has (10173, 4) shape
Max MAX_NDVI_2021_export.csv has (668, 4) shape
MAX_NDVI_2022_export.csv has (3919, 4) shape
Max MAX_NDVI_2022_export.csv has (478, 4) shape
MAX_NDVI_2023_export.csv has (6960, 4) shape
Max MAX_NDVI_2023_export.csv has (542, 4) shape
MAX_NDVI_2024_export.csv has (3843, 4) shape
Max MAX_NDVI_2024_export.csv has (489, 4) shape
MAX NDVI DF has (2897, 4) shape


Unnamed: 0,Field_ID,Year,MAX_NDVI,date
0,1001Докучаева,2020,0.662199,2020-06-20
1,1002Докучаева,2020,0.552762,2020-07-10
2,1002Докучаева,2023,0.689470,2023-07-05
3,1003Докучаева,2020,0.479476,2020-07-15
4,1003Докучаева,2021,0.603534,2021-07-25
...,...,...,...,...
2892,996Докучаева,2023,0.529311,2023-07-25
2893,999Докучаева,2020,0.577168,2020-07-05
2894,999Докучаева,2023,0.509828,2023-07-25
2895,999Докучаева,2024,0.665956,2024-07-04


In [15]:
# Save the per-field, per-year maximum NDVI table without the row index,
# encoded as cp1251 (Windows-1251).
max_ndvi_df.to_csv(
    r'c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\NDVI\ML_Data\Wheat_Max_ndvi_2020-2024.csv',
    index=False,
    encoding='cp1251',
)

Load and Process Climate Data

In [32]:
def add_date_info_climate(df):
    """Parse ``date`` and add ``Year``, ``Month`` and ``DOY`` columns.

    The frame is modified in place: ``date`` is overwritten with its datetime
    representation and the three derived columns are added (``DOY`` is the
    day of the year).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    parsed = pd.to_datetime(df['date'])
    df['date'] = parsed
    derived_columns = (('Year', 'year'), ('Month', 'month'), ('DOY', 'dayofyear'))
    for column, attribute in derived_columns:
        df[column] = getattr(parsed.dt, attribute)
    return df

def drop_column_climate(df):
    """Remove export bookkeeping and intermediate columns that are present.

    Each listed column is dropped independently if it exists. Previously the
    whole drop was gated on ``system:index`` alone: a frame with that column
    but without one of the others raised ``KeyError``, and a frame without it
    kept every other listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Climate frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``system:index``, ``dewpoint_temperature_2m``,
        ``saturation_vapor_pressure``, ``actual_vapor_pressure`` and ``.geo``.
    """
    unwanted_columns = [
        'system:index',
        'dewpoint_temperature_2m',
        'saturation_vapor_pressure',
        'actual_vapor_pressure',
        '.geo',
    ]
    # errors='ignore' skips names that are absent instead of raising KeyError.
    return df.drop(columns=unwanted_columns, errors='ignore')

def vapor_pressure_calc(x):
    """Evaluate the Tetens vapor-pressure formula for a temperature.

    Parameters
    ----------
    x : float or array-like
        Temperature; 273.15 is subtracted before the formula is applied, so
        the input is expected in Kelvin.

    Returns
    -------
    float or array-like
        ``0.6108 * exp(17.27 * T / (T + 237.3))`` with ``T = x - 273.15``.
        Presumably in kPa (the 0.6108 coefficient matches the kPa form of the
        equation) -- confirm against the data source.
    """
    celsius = x - 273.15
    exponent = (17.27 * celsius) / (celsius + 237.3)
    return 0.6108 * np.exp(exponent)

def column_difference(df, col1, col2):
    """Return the element-wise difference ``df[col1] - df[col2]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1 : str
        Column to subtract from.
    col2 : str
        Column to subtract.

    Returns
    -------
    pandas.Series
        The row-by-row difference.
    """
    minuend = df[col1]
    subtrahend = df[col2]
    return minuend - subtrahend

def calculate_relative_humidity(df, col1, col2):
    """Return ``df[col1]`` as a percentage of ``df[col2]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1 : str
        Numerator column.
    col2 : str
        Denominator column.

    Returns
    -------
    pandas.Series
        ``100 * df[col1] / df[col2]``, computed row by row.
    """
    numerator = df[col1]
    denominator = df[col2]
    return 100 * numerator / denominator


In [29]:
# Folder holding the climate exports (one CSV per file).
directory_path = r"c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\Climate Data\WHEAT_CLIMATE_DATA-20250121T045307Z-001\WHEAT_CLIMATE_DATA"

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
climate_frames = []
# os.listdir returns entries in arbitrary order; sort for reproducibility.
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(directory_path, file_name))
    df = add_date_info_climate(df)
    # vapor_pressure_calc is built on np.exp, so it accepts a whole column at
    # once; no per-element .apply is needed.
    df['saturation_vapor_pressure'] = vapor_pressure_calc(df['temperature_2m'])
    df['actual_vapor_pressure'] = vapor_pressure_calc(df['dewpoint_temperature_2m'])
    df['relative_humidity'] = calculate_relative_humidity(df, 'actual_vapor_pressure', 'saturation_vapor_pressure')
    df['vapor_pressure_deficit'] = column_difference(df, 'saturation_vapor_pressure', 'actual_vapor_pressure')
    # The intermediate pressure columns are removed again here.
    df = drop_column_climate(df)
    climate_frames.append(df)

# Fall back to an empty frame when the folder contains no CSV files.
all_climate_df = pd.concat(climate_frames, ignore_index=True) if climate_frames else pd.DataFrame()
all_climate_df

Unnamed: 0,Field_ID,date,surface_solar_radiation_downwards_sum,temperature_2m,temperature_2m_max,temperature_2m_min,total_precipitation_sum,v_component_of_wind_10m,Year,Month,DOY,relative_humidity,vapor_pressure_deficit
0,235Абайское,2020-05-01,6.508537e+08,289.390941,304.662234,273.079732,0.099813,0.635355,2020,5,122,54.604540,0.838209
1,235Абайское,2020-06-01,6.797563e+08,290.574761,304.325191,279.313869,0.017888,-1.172492,2020,6,153,46.424682,1.066421
2,235Абайское,2020-07-01,6.869613e+08,296.390803,308.227766,282.124747,0.023402,-0.572286,2020,7,183,50.769116,1.403384
3,235Абайское,2020-08-01,4.983671e+08,292.808981,307.887137,283.535319,0.100578,0.099662,2020,8,214,60.710184,0.899491
4,181Абайское,2020-05-01,6.505489e+08,289.255293,304.574020,273.029572,0.102224,0.647809,2020,5,122,55.160341,0.820813
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11583,2704Узунколь ПК,2024-08-01,4.864619e+08,290.106597,302.236209,280.750412,0.073213,-0.123774,2024,8,214,72.438302,0.532604
11584,2707Узунколь ПК,2024-05-01,6.061513e+08,282.980471,294.525775,271.572821,0.065404,0.153182,2024,5,122,58.753874,0.500762
11585,2707Узунколь ПК,2024-06-01,6.588655e+08,293.887565,304.282267,281.179992,0.104523,0.309998,2024,6,153,65.237204,0.850714
11586,2707Узунколь ПК,2024-07-01,5.898715e+08,293.400450,303.194584,284.931474,0.089511,-0.884350,2024,7,183,68.899509,0.738568


In [30]:
# Save the long-format climate table (one row per field and date) to the
# current working directory, without the row index.
all_climate_df.to_csv(
    'TS_Wheat_Climate_Data.csv',
    index=False,
)

In [20]:
# Remove the raw date before pivoting. errors='ignore' keeps this cell
# re-runnable: on a second execution 'date' is already gone and a plain drop
# would raise KeyError.
all_climate_df = all_climate_df.drop(columns=['date'], errors='ignore')

# Variables spread into one column per month.
climate_variables = [
    'surface_solar_radiation_downwards_sum',
    'temperature_2m_max', 'temperature_2m_min', 'total_precipitation_sum',
    'v_component_of_wind_10m', 'relative_humidity',
    'vapor_pressure_deficit',
]
pivoted_df = all_climate_df.pivot_table(
    index=['Field_ID', 'Year'],
    columns='Month',
    values=climate_variables,
    aggfunc='mean',
)

# Flatten the (variable, month) column MultiIndex into "<month>_<variable>".
new_columns = [f"{month}_{variable}" for variable, month in pivoted_df.columns]
pivoted_df.columns = new_columns

# Reset index to make Field_ID and Year regular columns
pivoted_df = pivoted_df.reset_index()

print(pivoted_df.isnull().sum())
print(pivoted_df.shape)

pivoted_df

Field_ID                                   0
Year                                       0
5_relative_humidity                        0
6_relative_humidity                        0
7_relative_humidity                        0
8_relative_humidity                        0
5_surface_solar_radiation_downwards_sum    0
6_surface_solar_radiation_downwards_sum    0
7_surface_solar_radiation_downwards_sum    0
8_surface_solar_radiation_downwards_sum    0
5_temperature_2m_max                       0
6_temperature_2m_max                       0
7_temperature_2m_max                       0
8_temperature_2m_max                       0
5_temperature_2m_min                       0
6_temperature_2m_min                       0
7_temperature_2m_min                       0
8_temperature_2m_min                       0
5_total_precipitation_sum                  0
6_total_precipitation_sum                  0
7_total_precipitation_sum                  0
8_total_precipitation_sum                  0
5_v_compon

Unnamed: 0,Field_ID,Year,5_relative_humidity,6_relative_humidity,7_relative_humidity,8_relative_humidity,5_surface_solar_radiation_downwards_sum,6_surface_solar_radiation_downwards_sum,7_surface_solar_radiation_downwards_sum,8_surface_solar_radiation_downwards_sum,...,7_total_precipitation_sum,8_total_precipitation_sum,5_v_component_of_wind_10m,6_v_component_of_wind_10m,7_v_component_of_wind_10m,8_v_component_of_wind_10m,5_vapor_pressure_deficit,6_vapor_pressure_deficit,7_vapor_pressure_deficit,8_vapor_pressure_deficit
0,1001Докучаева,2020,57.588484,49.543625,53.612926,59.324315,6.612215e+08,6.916903e+08,6.884807e+08,5.203218e+08,...,0.028914,0.091383,0.598832,-1.490612,-0.808794,0.270842,0.795641,1.001958,1.286773,0.938937
1,1002Докучаева,2020,57.664972,49.323350,53.081460,58.019088,6.655147e+08,6.973077e+08,6.890515e+08,5.243194e+08,...,0.026434,0.081976,0.616087,-1.592158,-0.887436,0.289672,0.804741,1.020113,1.316964,0.984295
2,1002Докучаева,2023,38.578616,44.984933,53.802362,58.220122,6.749530e+08,6.584740e+08,6.867235e+08,5.416422e+08,...,0.042831,0.072032,-0.566802,-0.827220,0.289480,-1.265735,1.102688,1.259800,1.324034,0.948204
3,1003Докучаева,2020,56.860332,48.891047,53.084527,59.351714,6.605500e+08,6.929765e+08,6.893271e+08,5.185445e+08,...,0.027030,0.109693,0.578130,-1.438730,-0.792869,0.211184,0.805452,1.013111,1.303479,0.932420
4,1003Докучаева,2021,42.504983,42.154358,54.357024,50.316968,7.376825e+08,7.403149e+08,6.393779e+08,5.874854e+08,...,0.109616,0.009179,0.136228,-1.386168,-1.037628,-1.328542,1.236937,1.353457,1.115274,1.302015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2892,996Докучаева,2023,37.053394,44.158082,53.335712,57.827691,6.796764e+08,6.551584e+08,6.862531e+08,5.389480e+08,...,0.039981,0.064675,-0.554081,-0.862318,0.305635,-1.294499,1.131500,1.272067,1.331213,0.948472
2893,999Докучаева,2020,57.538405,49.485660,53.565688,59.328213,6.611801e+08,6.918470e+08,6.885548e+08,5.201614e+08,...,0.028769,0.092708,0.596807,-1.488371,-0.808451,0.266116,0.795749,1.002210,1.287273,0.937665
2894,999Докучаева,2023,37.053394,44.158082,53.335712,57.827691,6.796764e+08,6.551584e+08,6.862531e+08,5.389480e+08,...,0.039981,0.064675,-0.554081,-0.862318,0.305635,-1.294499,1.131500,1.272067,1.331213,0.948472
2895,999Докучаева,2024,57.726065,60.767957,68.036552,70.283301,6.148117e+08,6.700669e+08,6.003586e+08,4.855810e+08,...,0.111294,0.056679,0.026383,0.240172,-0.994239,-0.254151,0.520823,0.987227,0.766400,0.580460


In [21]:
# Save the wide-format climate table (one row per field and year) to the
# current working directory, without the row index.
pivoted_df.to_csv(
    'Wheat_Climate_Data.csv',
    index=False,
)

Load and Process SoilGrid Data

In [23]:
def drop_column_soil(df):
    """Remove the ``system:index`` and ``.geo`` columns that are present.

    Each column is dropped independently if it exists. Previously the drop was
    gated on ``system:index`` alone: a frame with that column but without
    ``.geo`` raised ``KeyError``, and a frame without it kept ``.geo``.

    Parameters
    ----------
    df : pandas.DataFrame
        Soil frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``system:index`` and ``.geo``.
    """
    # errors='ignore' skips names that are absent instead of raising KeyError.
    return df.drop(columns=['system:index', '.geo'], errors='ignore')

In [24]:
# Folder holding the soil exports (one CSV per file).
directory_path = r"c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\Soil INFO\SOIL_INFO-20250121T045307Z-001\SOIL_INFO"

# Read every CSV and concatenate once, instead of growing a DataFrame inside
# the loop. The listing is sorted because groupby().first() below keeps the
# earliest value per field, so the result would otherwise depend on the
# arbitrary order in which os.listdir returns the files.
soil_frames = [
    pd.read_csv(os.path.join(directory_path, file_name))
    for file_name in sorted(os.listdir(directory_path))
    if file_name.endswith('.csv')
]
# Fall back to an empty frame when the folder contains no CSV files.
all_soil_df = pd.concat(soil_frames, ignore_index=True) if soil_frames else pd.DataFrame()
all_soil_df = drop_column_soil(all_soil_df)

# Collapse to one row per field, taking the first non-null value per column.
all_soil_df = all_soil_df.groupby(['Field_ID']).first().reset_index()
all_soil_df

Unnamed: 0,Field_ID,bdod,cec,clay,phh2o,sand,silt,soc
0,1001Докучаева,101.257506,443.201486,295.488666,68.805723,393.932739,310.486957,627.293475
1,1002Докучаева,100.787303,390.000403,299.897608,69.340970,400.911784,299.190271,534.997044
2,1003Докучаева,98.233863,418.670559,265.305704,68.052833,401.421708,333.258040,693.199579
3,1004Докучаева,101.136550,445.344236,272.231308,68.326805,398.160576,329.506292,604.105794
4,1005Докучаева,102.415520,410.786870,303.280465,69.957037,387.238527,309.606016,548.267862
...,...,...,...,...,...,...,...,...
1499,991Докучаева,99.928087,410.888471,234.294501,68.178876,422.286779,343.377829,552.598268
1500,994Докучаева,100.398082,393.641284,276.620690,67.344828,399.073755,324.340517,652.855156
1501,996Докучаева,100.115784,430.122341,309.519534,67.880091,378.381942,312.113389,667.009727
1502,999Докучаева,99.970276,432.350273,292.264699,68.337541,390.925082,316.772295,693.436619


In [25]:
# Save the one-row-per-field soil table without the row index, encoded as
# cp1251 (Windows-1251).
all_soil_df.to_csv(
    r"C:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\Soil INFO\Long_Range_Soil_INFO_2020_2024.csv",
    index=False,
    encoding='cp1251',
)


Merge Data For Model Training Dataset

In [33]:
# Load the yield table and the cleaned sowing-date table; both files are read
# with the cp1251 (Windows-1251) encoding.
field_data = pd.read_csv(
    r"c:\Users\Dgebe N\Documents\Olzha Datasets\Field Analysis\Yield_Field_Info\Long_Range_Wheat_Yield_2020-2024.csv",
    encoding='cp1251',
)
sowing_df = pd.read_csv(
    r"C:\Users\Dgebe N\Documents\Olzha Datasets\Sowing Date\sowing_date_cleaned.csv",
    encoding='cp1251',
)

In [34]:
# Tables keyed by (Field_ID, Year). Two descriptive columns are removed from
# the sowing table first -- presumably because field_data already carries
# them and the merge would otherwise create suffixed duplicates (verify).
sowing_df_dropped = sowing_df.drop(columns=['Подразделение', 'Поле'])
dataframes = [field_data, max_ndvi_df, merge_df, pivoted_df, sowing_df_dropped]

# Start from the first table and inner-join each remaining one, so only
# field-years present in every table survive.
merged_data, *yearly_tables = dataframes
for yearly_table in yearly_tables:
    merged_data = merged_data.merge(yearly_table, on=['Field_ID', 'Year'], how='inner')

# Soil properties have no Year column, so they join on Field_ID alone.
merged_data = merged_data.merge(all_soil_df, on=['Field_ID'], how='inner')
ml_data = merged_data.copy()

# Summarise the merged training table.
print(f'Merged Data Columns: {ml_data.columns}')
print(f'Merged Data Shape: {ml_data.shape}')
print(f'Merged Data Null Sum: \n{ml_data.isnull().sum()}')


Merged Data Columns: Index(['Подразделение', 'Поле', 'Field_ID', 'Year', 'Area', 'Yield', 'Агрофон',
       'Культура', 'MAX_NDVI', 'date', '6_NDVI', '7_NDVI',
       '5_relative_humidity', '6_relative_humidity', '7_relative_humidity',
       '8_relative_humidity', '5_surface_solar_radiation_downwards_sum',
       '6_surface_solar_radiation_downwards_sum',
       '7_surface_solar_radiation_downwards_sum',
       '8_surface_solar_radiation_downwards_sum', '5_temperature_2m_max',
       '6_temperature_2m_max', '7_temperature_2m_max', '8_temperature_2m_max',
       '5_temperature_2m_min', '6_temperature_2m_min', '7_temperature_2m_min',
       '8_temperature_2m_min', '5_total_precipitation_sum',
       '6_total_precipitation_sum', '7_total_precipitation_sum',
       '8_total_precipitation_sum', '5_v_component_of_wind_10m',
       '6_v_component_of_wind_10m', '7_v_component_of_wind_10m',
       '8_v_component_of_wind_10m', '5_vapor_pressure_deficit',
       '6_vapor_pressure_deficit', '7_va

In [35]:
# Save the model-training table to the current working directory without the
# row index, then preview its first rows.
ml_data.to_csv(
    r'new_ml_data.csv',
    index=False,
)
ml_data.head()

Unnamed: 0,Подразделение,Поле,Field_ID,Year,Area,Yield,Агрофон,Культура,MAX_NDVI,date,...,8_vapor_pressure_deficit,DOY_min,DOY_max,bdod,cec,clay,phh2o,sand,silt,soc
0,Абайское,16254,235Абайское,2020,230.0,13.1,Стерня,Тризо,0.497964,2020-07-07,...,0.899491,150,151,100.337298,397.188277,297.142827,67.811299,329.994797,372.765637,704.696368
1,Абайское,16254,235Абайское,2021,230.0,11.0,Стерня,Тризо,0.497984,2021-07-22,...,1.342147,131,135,100.337298,397.188277,297.142827,67.811299,329.994797,372.765637,704.696368
2,Абайское,16254,235Абайское,2024,230.0,19.0,Пар комбинированный,Шортандинская,0.822252,2024-07-16,...,0.569373,157,157,100.337298,397.188277,297.142827,67.811299,329.994797,372.765637,704.696368
3,Абайское,32,181Абайское,2020,240.0,10.6,Стерня,Шортандинская,0.662204,2020-07-10,...,0.892055,154,157,104.324063,395.126847,304.744982,67.587264,327.487417,367.783495,681.533619
4,Абайское,32,181Абайское,2024,240.9,26.2,Стерня,Ликамеро,0.793934,2024-07-04,...,0.560712,152,152,104.324063,395.126847,304.744982,67.587264,327.487417,367.783495,681.533619
