In [None]:
import pandas as pd
import os
import numpy as np
import rasterio
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf

All directory paths are defined at the beginning of the code, making it easier to manage and modify paths if the data location changes. 

In [None]:
# Dataset locations, relative to the notebook's working directory.
# Adjust the four base directories below if your data lives somewhere else;
# every other path in this cell is derived from them.

# --- 2022 season: ground truth, satellite and UAV imagery ---
base_dir_2022 = '2022/2022/DataPublication_final'
labels_path_2022 = os.path.join(base_dir_2022, 'GroundTruth/HYBRID_HIPS_V3.5_ALLPLOTS.csv')
satellite_dir_2022 = os.path.join(base_dir_2022, 'Satellite')
uav_dir_2022 = os.path.join(base_dir_2022, 'UAV')

# --- 2023 season (training part): ground truth and satellite imagery ---
base_dir_2023 = '2023/2023/DataPublication_final'
labels_path_2023 = os.path.join(base_dir_2023, 'GroundTruth/train_HIPS_HYBRIDS_2023_V2.3.csv')
satellite_dir_2023 = os.path.join(base_dir_2023, 'Satellite')

# --- 2023 validation set ---
base_dir_validation = '2023_validacion/2023'
labels_path_validation = os.path.join(base_dir_validation, 'GroundTruth/val_HIPS_HYBRIDS_2023_V2.3.csv')
satellite_dir_validation = os.path.join(base_dir_validation, 'Satellite')

# --- Test set ---
base_dir_test = 'Test'
labels_path_test = os.path.join(base_dir_test, 'GroundTruth/test_HIPS_HYBRIDS_2023_V2.3.csv')
satellite_dir_test = os.path.join(base_dir_test, 'Satellite')


In this section, we load and inspect the CSV files for the 2022 and 2023 maize yield trials.  After loading the data, we display the first few rows to verify the structure and content.

In [None]:
# Read the ground-truth label tables of both seasons into DataFrames.
labels_df_2022 = pd.read_csv(labels_path_2022)
labels_df_2023 = pd.read_csv(labels_path_2023)

# Preview the first rows of each table to check its columns and values.
# (Heading and preview are separated by a newline, exactly as two prints would do.)
print("Data from 2022:", labels_df_2022.head(), sep="\n")
print("\nData from 2023:", labels_df_2023.head(), sep="\n")

This section includes functions to load satellite and UAV images and to associate these images with yield data from the corresponding plots. The extract_images_and_yield function processes the label DataFrame, matches each plot with its corresponding images based on location, experiment, row, and range, and returns a DataFrame containing the paths to the images along with the yield data.

In [1]:

# Read a .tif satellite image through rasterio.
def load_tif_image(image_path):
    """Open the raster at ``image_path`` and return the result of its ``read()``.

    The dataset is closed again before the function returns.
    """
    with rasterio.open(image_path) as dataset:
        bands = dataset.read()
    return bands
# Read a UAV image file through matplotlib.
def load_uav_image(image_path):
    """Return the pixel data that ``plt.imread`` decodes from ``image_path``."""
    pixels = plt.imread(image_path)
    return pixels

# Function to pair image files with the yield of the plot they belong to.
def extract_images_and_yield(labels_df, satellite_paths, uav_paths=None):
    """Link every plot in ``labels_df`` to its image paths and yield value.

    A path is assigned to a plot when it contains the plot's ``location`` and
    the identifier ``"{experiment}_{range}_{row}"``. The identifier must not be
    preceded by a letter/digit nor followed by a digit, so that plot ``X_2_1``
    is not confused with ``X_2_10`` or ``1X_2_1`` (a plain substring test
    would accept both).

    The image files themselves are not opened here: only their paths are
    recorded, and the pixels are read later by the preprocessing functions.

    Args:
        labels_df: DataFrame with the columns 'location', 'row', 'range',
            'experiment' and 'yieldPerAcre'.
        satellite_paths: Sequence of satellite image paths to search.
        uav_paths: Optional sequence of UAV image paths (2022 only). Skipped
            when None or empty.

    Returns:
        DataFrame with one record per (plot, matching image) and the columns
        'image_path', 'yield_per_acre' and 'year'. The columns are present
        even when nothing matched.

    Raises:
        ValueError: If a 'row' or 'range' value cannot be converted to int
            (for example NaN).
    """
    import re  # Local import: `re` is not part of the notebook's import cell.

    records = []

    for _, plot_data in labels_df.iterrows():
        location = plot_data['location']
        row = str(int(plot_data['row']))  # e.g. 3.0 -> "3"
        range_no = str(int(plot_data['range']))  # e.g. 12.0 -> "12"
        experiment = plot_data['experiment']
        yield_per_acre = plot_data['yieldPerAcre']

        # Whole-identifier match; compiled once per plot and reused for all paths.
        plot_pattern = re.compile(
            r"(?<![0-9A-Za-z])"
            + re.escape(f"{experiment}_{range_no}_{row}")
            + r"(?!\d)"
        )

        # Satellite images: the year is inferred from the path text.
        for path in satellite_paths:
            if location in path and plot_pattern.search(path):
                records.append({
                    'image_path': path,
                    'yield_per_acre': yield_per_acre,
                    'year': 2023 if '2023' in path else 2022
                })

        # UAV images (applicable only for 2022).
        if uav_paths:
            for path in uav_paths:
                if location in path and plot_pattern.search(path):
                    records.append({
                        'image_path': path,
                        'yield_per_acre': yield_per_acre,
                        'year': 2022
                    })

    return pd.DataFrame(records, columns=['image_path', 'yield_per_acre', 'year'])

The yield data is loaded, image paths are retrieved, and the data from both years is extracted and combined into a single dataset, ready for analysis.

In [None]:

# The label tables (labels_df_2022 / labels_df_2023) were read in an earlier
# cell; this message only marks the start of the extraction stage.
print("Loading yield data...")

# 2022: collect every '.TIF' file below the satellite folder and every '.PNG'
# file below the UAV folder (the suffix test is case-sensitive).
print("Obtaining image paths for 2022...")
satellite_paths_2022 = [
    os.path.join(root, file)
    for root, _, files in os.walk(satellite_dir_2022)
    for file in files
    if file.endswith('.TIF')
]
uav_paths_2022 = [
    os.path.join(root, file)
    for root, _, files in os.walk(uav_dir_2022)
    for file in files
    if file.endswith('.PNG')
]

# 2023: only satellite imagery is collected.
print("Obtaining image paths for 2023...")
satellite_paths_2023 = [
    os.path.join(root, file)
    for root, _, files in os.walk(satellite_dir_2023)
    for file in files
    if file.endswith('.TIF')
]

# Match the 2022 plots against both image sources.
print("Extracting data for 2022...")
data_2022 = extract_images_and_yield(labels_df_2022, satellite_paths_2022, uav_paths_2022)
print(f"2022 data extracted: {len(data_2022)} records.")

# Match the 2023 plots against the satellite images.
print("Extracting data for 2023...")
data_2023 = extract_images_and_yield(labels_df_2023, satellite_paths_2023)
print(f"2023 data extracted: {len(data_2023)} records.")

# Stack both years into one table with a fresh 0..n-1 index.
print("Combining data from 2022 and 2023...")
combined_data = pd.concat([data_2022, data_2023], ignore_index=True)
print(f"Combined data: {len(combined_data)} total records.")

This section introduces a function to filter out rows with missing yield_per_acre values from the combined dataset. The function is then applied to ensure only complete records are kept for further analysis, with the total number of valid records printed afterward.

In [None]:
# Keep only the records that carry a yield value.
def filter_data_with_yield(data):
    """Return the rows of ``data`` whose 'yield_per_acre' is not missing.

    The input frame is left untouched; the surviving rows keep their index labels.
    """
    has_yield = data['yield_per_acre'].notna()
    return data[has_yield]

# Drop the records of the combined 2022/2023 table that have no yield value
# (they cannot be used as regression targets) and report how many remain.
print("Filtering data to remove records with NaN yield values...")
filtered_data = filter_data_with_yield(combined_data)
print(f"Filtered data: {len(filtered_data)} records.")

This section separates the filtered dataset to include only satellite images, using a string filter on the image paths. The number of satellite image records is then printed to verify the separation process.

In [None]:
# Keep only the records whose image path contains the text 'Satellite';
# every other record (e.g. images stored under the UAV folder) is left out.
satellite_data = filtered_data.loc[filtered_data['image_path'].str.contains('Satellite')]

# Report how many satellite records were selected.
print(f"Total satellite images: {len(satellite_data)}")

This code provides a function to display a random sample of image data, showing the image path, yield per acre, and year for each entry. By running this function on the satellite dataset, we can visually validate that the image paths are correctly associated with the corresponding yield data, helping ensure that the dataset is properly structured for further analysis.

In [None]:
# Function to print a random sample of records: image path, yield per acre, and year.
def show_sample_info(data, sample_size=5, title="Sample Data", random_state=None):
    """Print a random sample of ``data`` for a quick visual sanity check.

    Args:
        data: DataFrame with the columns 'image_path', 'yield_per_acre' and 'year'.
        sample_size: Maximum number of records to print. When ``data`` has
            fewer rows, all of them are printed instead of raising an error.
        title: Label printed in front of every example.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the selection reproducible. ``None`` keeps it random.

    Returns:
        None. The records are written to standard output.
    """
    # Never ask for more rows than exist; DataFrame.sample would raise otherwise.
    n_rows = min(sample_size, len(data))
    sample_data = data.sample(n=n_rows, random_state=random_state)

    # Number the examples 1..n. The DataFrame index label is not used because
    # it is an arbitrary row label of the source table, not a running count.
    for example_no, (_, row) in enumerate(sample_data.iterrows(), start=1):
        print(f"{title} - Example {example_no}:")
        print(f"Image Path: {row['image_path']}")
        print(f"Yield per Acre: {row['yield_per_acre']}")
        print(f"Year: {row['year']}")
        print("-" * 50)

# Print a few randomly chosen satellite records (the function's default
# sample_size is used) to check by eye that each image path is paired with
# a plausible yield value and year.
print("\nSatellite Data Sample:")
show_sample_info(satellite_data, title="Satellite Data")


Note: You can skip this part and move directly to model training using the complete training data if you have access to it. This section is primarily for cases where only a single dataset is available, and it is necessary to create training, validation, and test sets from it.

In this part of the notebook, we prepare the data for modeling by splitting the satellite data into training, validation, and test sets. This step allows us to train the initial version of the model and assess its performance, even when only the training dataset is available.

In [None]:

# Select the satellite records again (same rule as before: the image path must
# contain 'Satellite'), so this section can be run straight from `filtered_data`.
satellite_data = filtered_data.loc[filtered_data['image_path'].str.contains('Satellite')]

# Report the size of the table that is about to be split.
print(f"Total satellite data: {len(satellite_data)}")

# Function to split the dataset into training, validation, and test sets.
def split_data(data, test_size=0.2, val_size=0.1, random_state=42):
    """Split ``data`` into training, validation and test subsets.

    ``test_size`` and ``val_size`` are fractions of the *whole* dataset. The
    test subset is split off first; the validation subset is then taken from
    the remainder, so its fraction is rescaled to ``val_size / (1 - test_size)``.

    Args:
        data: Any object accepted by ``train_test_split`` (e.g. a DataFrame).
        test_size: Fraction of all records reserved for testing.
        val_size: Fraction of all records reserved for validation.
        random_state: Seed used for both splits (default 42).

    Returns:
        Tuple ``(train_data, val_data, test_data)``.

    Raises:
        ValueError: If a fraction is not strictly between 0 and 1, or if the
            two fractions together leave no data for training.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must be fractions in (0, 1) whose sum is below 1; "
            f"got test_size={test_size}, val_size={val_size}"
        )

    remaining, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
    # Express the validation share relative to what is left after removing the test set.
    val_fraction = val_size / (1 - test_size)
    train_data, val_data = train_test_split(remaining, test_size=val_fraction, random_state=random_state)
    return train_data, val_data, test_data

# Split the satellite records with the function's default fractions
# (test_size=0.2, val_size=0.1 of the whole table) and report the subset sizes.
satellite_train, satellite_val, satellite_test = split_data(satellite_data)
print(f"Satellite data - Training: {len(satellite_train)}, Validation: {len(satellite_val)}, Test: {len(satellite_test)}")


This code prepares and trains a Convolutional Neural Network (CNN) to predict maize yield from satellite images. The images are preprocessed by normalizing and resizing them, then fed into the model. 

The model is trained on the satellite training dataset, validated on the validation set, and finally evaluated on the test set to assess its performance. The Root Mean Squared Error (RMSE) metric is used to evaluate how well the model predicts the yield from the images.

Note: If you have complete training data, you can skip the splitting step and proceed directly to training the model with the full dataset.

In [None]:
# Function to load and preprocess TIF images.
def load_and_preprocess_tif(image_path, target_size=(128, 128)):
    """Read a TIF image and turn it into a normalised, fixed-size model input.

    Steps: read all bands with rasterio, move the band axis to the end
    (channels-last), convert to float32, min-max scale over the whole image,
    and resize with ``tf.image.resize``.

    Args:
        image_path: Path of the TIF file to read.
        target_size: ``(height, width)`` of the returned image. Defaults to
            ``(128, 128)``.

    Returns:
        The resized image as returned by ``tf.image.resize``
        (height, width, bands).

    Raises:
        Whatever ``rasterio.open`` raises when the file is missing or cannot
        be read; this function never returns None.
    """
    with rasterio.open(image_path) as src:
        image = src.read()

    # Bands come first from rasterio; the CNN expects them last.
    image = np.moveaxis(image, 0, -1).astype(np.float32)

    lowest = image.min()
    value_range = image.max() - lowest
    if value_range == 0:
        # A constant image has no contrast; dividing by zero would fill it
        # with NaN, so map it to all zeros instead.
        image = np.zeros_like(image)
    else:
        image = (image - lowest) / value_range

    return tf.image.resize(image, list(target_size))

# Turn a table of records into model-ready image and label arrays.
def load_images_and_labels(data):
    """Load the image of every record in ``data`` together with its yield.

    Each row's 'image_path' is passed through ``load_and_preprocess_tif`` and
    paired with the row's 'yield_per_acre', keeping the row order.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays built from those values.
    """
    pairs = [
        (load_and_preprocess_tif(record['image_path']), record['yield_per_acre'])
        for _, record in data.iterrows()
    ]
    images = [image for image, _ in pairs]
    labels = [label for _, label in pairs]
    return np.array(images), np.array(labels)

# Read and preprocess the images of each split. Every call returns a pair of
# arrays: the images and the matching yield labels.
print("Loading and preparing satellite image data...")
satellite_train_imgs, satellite_train_labels = load_images_and_labels(satellite_train)
satellite_val_imgs, satellite_val_labels = load_images_and_labels(satellite_val)
satellite_test_imgs, satellite_test_labels = load_images_and_labels(satellite_test)

# Function to define a CNN model for regression.
def create_cnn_model(input_shape):
    """Build and compile a small CNN that regresses one value per image.

    Architecture: three Conv2D(3x3, ReLU) + MaxPooling(2x2) stages with 32, 64
    and 128 filters, then Flatten, Dense(256, ReLU), Dropout(0.5) and a single
    linear output unit.

    Args:
        input_shape: Shape of one input image, ``(height, width, channels)``.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        mean-squared-error loss and a RootMeanSquaredError metric.
    """
    model = tf.keras.Sequential([
        # Declare the input with an explicit Input layer. Passing `input_shape=`
        # to the first Conv2D is deprecated in current Keras and emits a warning.
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='linear')  # Regression output for predicting yield
    ])

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model


In [None]:
# Create the CNN model for satellite images.
# The input shape is read from the loaded training images instead of being
# hard-coded, so it always agrees with the preprocessing output. With 6-band
# TIFs resized to 128x128 this is (128, 128, 6).
input_shape = tuple(satellite_train_imgs.shape[1:])
satellite_cnn_model = create_cnn_model(input_shape)

# Train on the training split; the validation split is evaluated after every epoch.
satellite_cnn_model.fit(
    satellite_train_imgs,
    satellite_train_labels,
    validation_data=(satellite_val_imgs, satellite_val_labels),
    epochs=10,
    batch_size=32,
)

# Evaluate on the held-out test split. evaluate() returns the loss followed by
# the compiled metric, i.e. the MSE and then the RMSE.
test_loss, test_rmse = satellite_cnn_model.evaluate(satellite_test_imgs, satellite_test_labels)
print(f"RMSE on satellite test data: {test_rmse}")

In this section, all available data (the training, validation, and test splits) is combined into a single dataset for training the CNN model. The model is created with the previously defined create_cnn_model function and trained on the entire dataset to leverage all available data for better model performance. Note that the code does not handle missing image files: any image that cannot be opened raises an error.

Note: The functions load_and_preprocess_tif, load_images_and_labels, and create_cnn_model are used here as defined earlier, ensuring consistency and reusability of the code.

In [None]:
# Combine the training, validation, and test datasets into one.
all_data = pd.concat([satellite_train, satellite_val, satellite_test])
all_images, all_labels = load_images_and_labels(all_data)

# Create the CNN model for satellite images.
# The input shape is read from the loaded images instead of being hard-coded,
# so it always agrees with the preprocessing output. With 6-band TIFs resized
# to 128x128 this is (128, 128, 6).
input_shape = tuple(all_images.shape[1:])
# NOTE: this rebinds `satellite_cnn_model`; any model trained earlier under
# that name is replaced by a freshly initialised one.
satellite_cnn_model = create_cnn_model(input_shape)

# Train the CNN model using the combined dataset. No validation data is passed,
# so only the training loss and RMSE are reported.
satellite_cnn_model.fit(all_images, all_labels, epochs=10, batch_size=32)

In this section, we prepare the validation dataset by constructing image paths, loading and preprocessing the images, and making yield predictions using a previously trained CNN model. The predictions are then added to the original CSV file, which is saved with the updated yield predictions.

This process is crucial for validating the model's performance on a separate validation dataset, ensuring the model generalizes well to unseen data. The final CSV contains both the original data and the predicted yield values for further analysis or comparison.

In [None]:

# Derive the expected image file location from one row of a labels CSV.
def build_image_path(row, base_path):
    """Return the path ``<base_path>/TP<block>/<location>-TP<block>-<experiment>_<range>_<row>.TIF``.

    'block', 'experiment', 'row' and 'range' are converted to integers first,
    so values such as ``1.0`` become ``"1"``. The resulting path is also
    printed so it can be checked by eye; the function does not test whether
    the file exists.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'location', 'block', 'experiment', 'row' and 'range'.
        base_path: Folder that contains the ``TP<n>`` sub-folders.
    """
    def as_int_text(key):
        # Numeric cell -> integer -> text, e.g. 3.0 -> "3".
        return str(int(row[key]))

    location = row['location']
    block = "TP" + as_int_text('block')  # Block numbers map to folders TP1, TP2, ...
    experiment = as_int_text('experiment')
    row_value = as_int_text('row')
    range_value = as_int_text('range')

    image_name = f"{location}-{block}-{experiment}_{range_value}_{row_value}.TIF"
    image_path = os.path.join(base_path, block, image_name)

    # Echo the path so it can be verified.
    print(f"Constructed image path: {image_path}")

    return image_path

# Load the CSV containing validation data.
csv_path = labels_path_validation  # Update to correct CSV path
csv_data = pd.read_csv(csv_path)

# Base path where images are stored.
base_image_path = satellite_dir_validation  # Update to the correct base path

# Filter the data to include only TP1, TP2, TP3 blocks.
# .copy() makes the filtered table independent of the frame it was sliced
# from, so the column assignment below does not raise SettingWithCopyWarning.
valid_blocks = ['TP1', 'TP2', 'TP3']
csv_data = csv_data[csv_data['block'].apply(lambda x: f"TP{int(x)}" in valid_blocks)].copy()

# Prepare images and make predictions.
predicciones = []
for _, row in csv_data.iterrows():
    image_path = build_image_path(row, base_image_path)
    if not os.path.isfile(image_path):
        # load_and_preprocess_tif raises on a missing file instead of returning
        # None, so the missing-image case has to be detected before loading.
        predicciones.append(np.nan)
        continue
    image = load_and_preprocess_tif(image_path)
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    predicted_yield = satellite_cnn_model.predict(image)[0][0]  # Make the prediction
    predicciones.append(predicted_yield)

# Update the CSV with predicted yield values.
csv_data['yieldPerAcre'] = predicciones

# Save the updated CSV.
# NOTE: this writes back to the file that was read above, replacing its
# 'yieldPerAcre' column and keeping only the TP1-TP3 rows.
csv_data.to_csv(csv_path, index=False)

This section prepares the test dataset by using previously defined functions to construct image paths, load and preprocess images, and make yield predictions using the trained CNN model. The predictions are then added to the original test CSV file, and the updated file is saved. This process is crucial for evaluating the model's performance on the test data and generating results that can be analyzed or submitted for further evaluation.

Note: The functions build_image_path and load_and_preprocess_tif are reused as defined earlier, ensuring consistency and efficiency in the code.

In [None]:


# Load the CSV containing test data.
csv_path = labels_path_test  # Update to the correct CSV path
csv_data = pd.read_csv(csv_path)

# Base path where images are stored.
base_image_path = satellite_dir_test  # Update to the correct base path

# Filter the data to include only TP1, TP2, TP3 blocks.
# .copy() makes the filtered table independent of the frame it was sliced
# from, so the column assignment below does not raise SettingWithCopyWarning.
valid_blocks = ['TP1', 'TP2', 'TP3']
csv_data = csv_data[csv_data['block'].apply(lambda x: f"TP{int(x)}" in valid_blocks)].copy()

# Prepare images and make predictions.
predicciones = []
for _, row in csv_data.iterrows():
    image_path = build_image_path(row, base_image_path)
    if not os.path.isfile(image_path):
        # load_and_preprocess_tif raises on a missing file instead of returning
        # None, so the missing-image case has to be detected before loading.
        predicciones.append(np.nan)
        continue
    image = load_and_preprocess_tif(image_path)
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    predicted_yield = satellite_cnn_model.predict(image)[0][0]  # Make the prediction
    predicciones.append(predicted_yield)

# Update the CSV with predicted yield values.
csv_data['yieldPerAcre'] = predicciones

# Save the updated CSV.
# NOTE: this writes back to the file that was read above, replacing its
# 'yieldPerAcre' column and keeping only the TP1-TP3 rows.
csv_data.to_csv(csv_path, index=False)