# K-Fold Validation 

In [None]:
#!/usr/bin/env python
# coding: utf-8

# Import libraries
from fastai.vision.all import *
import torch
from ipywidgets import IntProgress
from glob import glob
from fastai.vision.augment import *
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import cv2
from pathlib import Path
from functools import partial
from tqdm import tqdm
from fastai.losses import CrossEntropyLossFlat
from fastai import *
from fastai.data.all import *
import optuna
import fastai.optimizer as optim
import joblib
import argparse
from optuna.integration import FastAIPruningCallback
from torchvision import models
from fastai.vision import models as fastai_models

# Custom functions
from msi_utils_Image import *
from kfold_utils_Image import *


if __name__ == "__main__":
    # K-fold cross-validation of a DenseNet-121 regressor on the RGB images.
    # For every fold a fresh model is trained and validated, and its
    # validation predictions are collected. Per-fold metrics and the pooled
    # predictions are written to CSV at the end.

    # Load the datasets
    df_all = pd.read_csv('/path/Train_Val_Holdout.csv')
    df_train_val = pd.read_csv('/path/Train_Val_AllMixed.csv')

    # The yield range does not depend on the fold, so compute it once. It is
    # used to express each fold's RMSE as a percentage of the yield range.
    df_ymin, df_ymax = df_all['Yield'].min(), df_all['Yield'].max()

    # KFOLD VALIDATION: per-fold metric accumulators
    val_loss = []
    rmse_kfold = []
    rmse_pct_kfold = []
    r2_kfold = []

    # One prediction table per fold; concatenated once after the loop.
    fold_frames = []
    split_list = kfold_splitter(df=df_train_val)

    # Callbacks (shared by every fold)
    csvlogger = CSVLogger(f"/path/metrics_kfold.csv", append=True)
    early_stopping = EarlyStoppingCallback(monitor='valid_loss', patience=20, min_delta=0.01)
    cbs = [csvlogger, early_stopping]

    # Path to where the images are located
    path = Path('/path/train_images')

    n_folds = 5
    for fold in range(n_folds):
        getter = get_fold(split_list, fold=fold)

        rgb_fold = DataBlock(
            blocks=(ImageBlock, RegressionBlock),
            get_items=get_image_files_from_df,
            get_y=get_y,
            splitter=getter,
            item_tfms=[FlipItem, Resize(360, None)],
            batch_tfms=[Normalize],
        )

        rgb_dl = rgb_fold.dataloaders(path, bs=64)

        # Fresh pretrained backbone for each fold so no weights leak between folds
        model_rgb = models.densenet121(pretrained=True)

        # Replace the classifier head with a single-output linear layer
        num_classes = 1
        model_rgb.classifier = nn.Linear(model_rgb.classifier.in_features, num_classes)

        learn_rgb = Learner(
            rgb_dl,
            model_rgb,
            opt_func=RAdam,
            loss_func=root_mean_squared_error,
            metrics=[rmse, R2Score()],
        )

        # Silence both the progress bar and the per-epoch console log.
        # `with a() and b():` would only enter b(), because `and` evaluates to
        # its second operand; the comma form enters both context managers.
        with learn_rgb.no_bar(), learn_rgb.no_logging():
            learn_rgb.fit(100, cbs=cbs, lr=0.0001289, wd=0.000137)

        # validate() returns the loss followed by the metrics passed to the Learner
        val_loss_k, rmse_k, r2score_k = learn_rgb.validate()
        val_loss.append(val_loss_k)
        rmse_kfold.append(rmse_k)
        rmse_pct_kfold.append((rmse_k / (df_ymax - df_ymin)) * 100)
        r2_kfold.append(r2score_k)

        # Extract the validation predictions for this fold
        ypred, yval = learn_rgb.get_preds()

        # Image id = file name without extension. Distinct loop names are used
        # so that neither the fold counter nor `path` is overwritten.
        images_id = [Path(item).stem for item in rgb_dl.valid_ds.items]

        vis_df = pd.DataFrame()
        vis_df['items'] = images_id
        vis_df['items'] = vis_df['items'].str.replace('id_', '', regex=False)
        vis_df['predictions'] = ypred.flatten().numpy()
        vis_df['target_yield'] = yval.flatten().numpy()
        vis_df = vis_df.merge(df_train_val, how='left', left_on='items', right_on='Barcode')

        fold_frames.append(vis_df)

    # DataFrame.append was removed in pandas 2.0, so concatenate once instead.
    # The reindex keeps the column order the original append-to-empty-frame
    # produced: 'predictions' and 'target_yield' first, then everything else.
    lead_cols = ['predictions', 'target_yield']
    kfold_preds = pd.concat(fold_frames)
    kfold_preds = kfold_preds.reindex(
        columns=lead_cols + [c for c in kfold_preds.columns if c not in lead_cols]
    )

    # Allows you to save the predictions, and then calculate the desired metrics
    kfold_preds.to_csv(f"/path/predictions_kfold.csv", index=False)

    # Per-fold summary table
    d = {"validation loss": val_loss, "rmse": rmse_kfold, "rmse %": rmse_pct_kfold, "r2score": r2_kfold}

    fastkfold = pd.DataFrame(data=d)
    fastkfold['rmse %'] = fastkfold['rmse %'].apply(lambda x: np.mean(x))
    # NOTE(review): this is the same path that was given to CSVLogger above, so
    # the per-epoch training log appears to be overwritten here - confirm this
    # is intended or give one of the two files a different name.
    fastkfold.to_csv(f"/path/metrics_kfold.csv", index=False)




# Final Image Model Development

In [None]:
#!/usr/bin/env python
# coding: utf-8

# Import libraries
from fastai.vision.all import *
import torch
from ipywidgets import IntProgress
from glob import glob
from fastai.vision.augment import *
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import cv2
from pathlib import Path
from functools import partial
from tqdm import tqdm
from fastai.losses import CrossEntropyLossFlat
from fastai import *
from fastai.data.all import *
import optuna
import fastai.optimizer as optim
import joblib
import argparse
from optuna.integration import FastAIPruningCallback
from torchvision import models
from fastai.vision import models as fastai_models

# Custom functions
from msi_utils_Image import *
from kfold_utils_Image import *

if __name__ == "__main__":
    # Train the final DenseNet-121 yield regressor on a fixed 70/30 random
    # split, save the model, and write the validation predictions to CSV.

    # Path to where the images are located
    path = Path('/path/train_images')
    # Load the dataset
    df = pd.read_csv('/path/Train_Val.csv')

    rgb_fold = DataBlock(
        blocks=(ImageBlock, RegressionBlock),
        get_items=get_image_files_from_df,
        get_y=get_y,
        splitter=RandomSplitter(valid_pct=0.3, seed=42),
        item_tfms=[FlipItem, Resize(360, None)],
        batch_tfms=[Normalize],
    )

    rgb_dl = rgb_fold.dataloaders(path, bs=64)

    # Callbacks
    csvlogger = CSVLogger(f"metrics.csv", append=True)
    # NOTE(review): with at_end=True fastai saves the model when training ends;
    # confirm that this is wanted rather than keeping the best-valid_loss
    # checkpoint under the same file name.
    save_callback = SaveModelCallback(monitor='valid_loss', fname='/path/model', reset_on_fit=False, at_end=True)
    cbs = [csvlogger, save_callback]

    # Pretrained DenseNet-121 backbone
    model_rgb = models.densenet121(pretrained=True)

    # Replace the classifier head with a single-output linear layer
    num_classes = 1
    model_rgb.classifier = nn.Linear(model_rgb.classifier.in_features, num_classes)

    learn_rgb = Learner(
        rgb_dl,
        model_rgb,
        opt_func=RAdam,
        loss_func=root_mean_squared_error,
        metrics=[rmse, R2Score()],
    )

    # Silence both the progress bar and the per-epoch console log.
    # `with a() and b():` would only enter b(), because `and` evaluates to its
    # second operand; the comma form enters both context managers.
    with learn_rgb.no_bar(), learn_rgb.no_logging():
        learn_rgb.fit(100, cbs=cbs, lr=0.0001289, wd=0.000137)

    # Extract the validation predictions
    ypred, yval = learn_rgb.get_preds()

    # Image id = file name without extension. A separate loop name is used so
    # that `path` (the image directory) is not overwritten.
    images_id = [Path(item).stem for item in rgb_dl.valid_ds.items]

    vis_df = pd.DataFrame()
    vis_df['items'] = images_id
    vis_df['items'] = vis_df['items'].str.replace('id_', '', regex=False)
    vis_df['predictions'] = ypred.flatten().numpy()
    vis_df['target_yield'] = yval.flatten().numpy()
    vis_df = vis_df.merge(df, how='left', left_on='items', right_on='Barcode')

    # DataFrame.append was removed in pandas 2.0. Appending to an empty frame
    # with columns ['predictions', 'target_yield'] only moved those two columns
    # to the front, so reproduce that ordering directly.
    lead_cols = ['predictions', 'target_yield']
    preds = vis_df.reindex(
        columns=lead_cols + [c for c in vis_df.columns if c not in lead_cols]
    )

    # Allows you to save the predictions, and then calculate the desired metrics
    preds.to_csv(f"/path/predictions.csv")


# Holdout Evaluation

In [None]:
#!/usr/bin/env python
# coding: utf-8

##### RUN BELOW CODE IF WANTING TO INITIALISE AND LOAD THE MODEL FOR FURTHER EVALUATION #####
##### IF MODEL IS ALREADY LOADED/INITIALISED, SKIP THIS BIT ######

# Import libraries
from fastai.vision.all import *
import torch
from ipywidgets import IntProgress
from glob import glob
from fastai.vision.augment import *
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import cv2
from pathlib import Path
from functools import partial
from tqdm import tqdm
from fastai.losses import CrossEntropyLossFlat
from fastai import *
from fastai.data.all import *
import optuna
import fastai.optimizer as optim
import joblib
import argparse
from optuna.integration import FastAIPruningCallback
from torchvision import models
from fastai.vision import models as fastai_models

# Custom functions
from msi_utils_Image import *
from kfold_utils_Image import *

# Holdout set: CSV read into df_test and the directory assigned to path_test
df_test = pd.read_csv('/path/Holdout.csv')
path_test = Path('/path/holdout_images')

if __name__ == "__main__":
	# Rebuild the DataBlock, DataLoaders and Learner, then load the saved
	# weights from '/path/model' so the model can be evaluated further.

	# Path to where the images are located
	path = Path('/path/train_images')
	# Load the dataset
	# NOTE(review): df is not used again in this cell.
	df = pd.read_csv('/path/Train_Val.csv')
    
	rgb_fold = DataBlock(blocks=(ImageBlock, RegressionBlock),
    					get_items=get_image_files_from_df,
    					get_y=get_y,
    					splitter=RandomSplitter(valid_pct=0.3, seed=42),
    					item_tfms=[FlipItem, Resize(360, None)],
    					batch_tfms=[Normalize])
    
	rgb_dl = rgb_fold.dataloaders(path, bs=64)

	# DenseNet-121 backbone that the saved weights are loaded into below
	model_rgb = models.densenet121(pretrained=True)
    
	# Replace the classifier head with a single-output linear layer
	num_classes = 1
	model_rgb.classifier = nn.Linear(model_rgb.classifier.in_features, num_classes)
    
	# Wrap the model and DataLoaders in a Learner
	learn_rgb = Learner(rgb_dl,
						model_rgb,
						opt_func=RAdam,
						loss_func=root_mean_squared_error,  # loss comes from a star import (presumably the project utils); not cross-entropy
						metrics=[rmse, R2Score()])  # metrics are rmse and R2Score, not accuracy
        
	# Load the saved model from '/path/model'
	learn_rgb.load('/path/model')

In [None]:
# Holdout set: CSV read into df_test and the directory assigned to path_test.
# These two assignments repeat the ones in the initialisation cell, which the
# notebook says may be skipped when the model is already loaded.
df_test = pd.read_csv('/path/Holdout.csv')
path_test = Path('/path/holdout_images')

In [None]:
# Part 1 - Build DataLoaders over the holdout images and predict on the first
# of the two subsets they contain (index 0).
test_dls = rgb_fold.dataloaders(path_test)
part1_items = test_dls[0].items
learn_rgb.dls.loaders.append(rgb_dl.test_dl(part1_items, with_labels=True))
dl_testing = learn_rgb.dls.test_dl(part1_items, with_labels=True)
predicts, targets = learn_rgb.get_preds(dl=dl_testing)

# Image id for every item: take the last '/'-separated path component, remove
# any '.npy', and keep characters 3..17 of what is left.
images_id = [
    str(item).split(sep='/')[-1].replace('.npy', '')[3:18]
    for item in dl_testing.items
]


In [None]:
# Table of ids, predictions and targets, left-joined with the holdout CSV rows
# on Items == Replicate.
test_results = pd.DataFrame().assign(
    Items=images_id,
    Predictions=predicts.flatten().tolist(),
    Target_yield=targets,
)
test_results = test_results.merge(df_test, how='left', left_on='Items', right_on='Replicate')

In [None]:
# Part 2 - Predict on the second subset (index 1) of the holdout DataLoaders
part2_items = test_dls[1].items
learn_rgb.dls.loaders.append(rgb_dl.test_dl(part2_items, with_labels=True))
dl_testing = learn_rgb.dls.test_dl(part2_items, with_labels=True)
predicts, targets = learn_rgb.get_preds(dl=dl_testing)

# Image id for every item: take the last '/'-separated path component, remove
# any '.npy', and keep characters 3..17 of what is left.
images_id = [
    str(item).split(sep='/')[-1].replace('.npy', '')[3:18]
    for item in dl_testing.items
]

# Table of ids, predictions and targets, left-joined with the holdout CSV rows
# on Items == Replicate.
test_results1 = pd.DataFrame().assign(
    Items=images_id,
    Predictions=predicts.flatten().tolist(),
    Target_yield=targets,
)
test_results1 = test_results1.merge(df_test, how='left', left_on='Items', right_on='Replicate')

In [None]:
# Stack the two partial holdout tables into one.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's own index.
test_df = pd.concat([test_results, test_results1])

In [None]:
# Write the combined holdout predictions to CSV without the index column.
# NOTE(review): this file name differs only by letter case from the
# '/path/predictions.csv' written by the final-model script; on a
# case-insensitive filesystem the two would be the same file - confirm.
test_df.to_csv('/path/Predictions.csv', index=False)