# Tabular Model Development

## Tabular K-Fold Evaluation

In [None]:
#!/usr/bin/env python
# coding: utf-8

# Import libraries
from fastai.tabular.all import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold

if __name__ == "__main__":
	# Load the combined train/validation table evaluated with k-fold below.
	df_train_val = pd.read_csv('/path/Train_Val_AllMixed.csv')

	# Seeded random split from fastai (30% of the rows go to validation).
	# The k-fold loop further down passes its own per-fold indices to
	# TabularPandas, so this split is not consumed by it.
	splitter = RandomSplitter(valid_pct=0.3, seed=42)
	splits = splitter(range_of(df_train_val))

	# Holds the predictions and targets gathered across the folds.
	kfold_preds = pd.DataFrame(columns=['predictions', 'target_yield'])

	def kfold_splitter(df, column='Year', splits=5, shuffle=True, random_state=None):
	    """Build stratified k-fold train/validation index lists.

	    The folds are stratified on ``df[column]`` using scikit-learn's
	    ``StratifiedKFold``.

	    Args:
	        df: DataFrame to split; only its index and ``column`` are read.
	        column: Name of the column used as the stratification label.
	        splits: Number of folds.
	        shuffle: Whether the rows are shuffled before being split.
	        random_state: Optional seed that makes shuffled folds reproducible
	            between runs. It is ignored when ``shuffle`` is False because
	            scikit-learn rejects a seed without shuffling.

	    Returns:
	        A ``(train_idx, val_idx)`` tuple of two lists, each holding one
	        fastai ``L`` of row positions per fold.
	    """
	    from sklearn.model_selection import StratifiedKFold

	    kfold = StratifiedKFold(n_splits=splits,
	                            shuffle=shuffle,
	                            random_state=random_state if shuffle else None)
	    train_idx = []
	    val_idx = []

	    for train_index, val_index in kfold.split(df.index, df[column]):
	        train_idx.append(L(train_index, use_list=True))
	        val_idx.append(L(val_index, use_list=True))

	    return train_idx, val_idx

	# Preprocessing steps handed to TabularPandas for every fold.
	procs = [Categorify, Normalize, FillMissing]
	# Continuous features: every column from position 21 onwards ...
	# NOTE(review): confirm that the target 'Yield' and the names listed below
	# do not also sit at position 21 or later, otherwise the target would be
	# used as a feature or a column would be listed twice.
	cont_vars = df_train_val.columns[21:].tolist()
	# ... plus these explicitly named columns.
	additional_cont_vars = ['JulianPlantDatePerYear', 'Year', 'DTA', 'DTS', 'Moist', 'Population', 'Range', 'Row']
	cont_names =  cont_vars + additional_cont_vars 
	# Columns passed to TabularPandas as categorical features.
	cat_names = ['Pedigree1', 'Pedigree2', 'Stock', 'Test']

	# Per-fold metric accumulators; the training loop appends one value per fold.
	val_loss = []
	rmse_kfold = []
	rmse_pct_kfold =[]
	r2_kfold=[]


	# Callbacks shared by the training run of every fold.
	# Early stopping monitors the validation loss (patience=20, min_delta=0.01).
	early_stopping = EarlyStoppingCallback(monitor='valid_loss', patience=20, min_delta=0.01)
	# Training metrics of all folds are appended to a single CSV file.
	csvlogger = CSVLogger(f'/path/Metrics_kfold.csv', append=True)

	# Per-fold train/validation indices, built with kfold_splitter's defaults.
	train_index, val_index = kfold_splitter(df_train_val)

	# The yield range does not depend on the fold, so compute it once. It is
	# used to express each fold's RMSE as a percentage of the observed range.
	df_ymin, df_ymax = df_train_val['Yield'].min(), df_train_val['Yield'].max()

	# Prediction frames of the individual folds; concatenated after the loop.
	fold_preds = []

	# Iterate over the folds that kfold_splitter actually produced instead of
	# assuming there are exactly five of them.
	for fold_train_idx, fold_val_idx in zip(train_index, val_index):
		init = TabularPandas(df_train_val,
		                   procs,
		                   cat_names=cat_names,
		                   cont_names=cont_names,
		                   y_names='Yield',
		                   y_block=RegressionBlock(),
		                   splits=(fold_train_idx, fold_val_idx))

		data_init = init.dataloaders(bs=64)

		config = tabular_config(ps=0.5, embed_p=0.5)

		# A new learner is created for every fold, so each fold trains its
		# own model.
		learn_tab = tabular_learner(data_init,
		                            config=config,
		                            layers=[200,100],
		                            metrics=[rmse, R2Score()],
		                            opt_func=ranger,
		                            y_range=[0,20],
		                            wd=1.425482107813348e-06)

		learn_tab.fit_one_cycle(100, lr_max=0.00018479913871295546, cbs=[csvlogger, early_stopping])

		# validate() yields the loss followed by the learner's metrics, here
		# rmse and R2Score in that order.
		val_loss_k, rmse_k, r2score_k = learn_tab.validate()
		val_loss.append(val_loss_k)
		rmse_kfold.append(rmse_k)
		rmse_pct_kfold.append(((rmse_k/(df_ymax - df_ymin))*100))
		r2_kfold.append(r2score_k)

		# Extract this fold's predictions and targets.
		ypred, yval = learn_tab.get_preds()
		preds = pd.DataFrame()
		preds['predictions'] = ypred.flatten()
		preds['target_yield'] = yval.numpy()  # Convert yval tensor to NumPy array
		fold_preds.append(preds)

	# DataFrame.append was removed in pandas 2.0; build the combined frame with
	# a single concat instead of growing it inside the loop.
	kfold_preds = pd.concat(fold_preds, ignore_index=True)


	# Write the collected k-fold predictions to disk without the row index.
	kfold_preds.to_csv('/path/Preds_kfold.csv', index=False)

	# Stratified k-fold metrics: one row per fold, one column per metric.
	d = {
		"validation loss": val_loss,
		"rmse": rmse_kfold,
		"rmse %": rmse_pct_kfold,
		"r2score": r2_kfold,
	}
	dnnkfold = pd.DataFrame(d)

	# Pass every "rmse %" entry through np.mean before the summary is saved.
	dnnkfold['rmse %'] = dnnkfold['rmse %'].apply(lambda pct: np.mean(pct))
	dnnkfold.to_csv('/path/Summary_Metrics_kfold.csv', index=False)



## Tabular Final Model Development

In [None]:
# Import libraries
from fastai.tabular.all import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the combined train/validation table used to fit the final DNN.
df_train_val = pd.read_csv('/path/Train_Val.csv')


In [None]:
# Seeded random split from fastai: 30% of the rows are set aside for validation.
splitter = RandomSplitter(valid_pct=0.3, seed=42)
splits = splitter(range_of(df_train_val))
# Bare expression so the notebook displays the resulting split.
splits

In [None]:
# Preprocessing steps applied by TabularPandas.
procs = [Categorify, Normalize, FillMissing]
# Continuous features: every column from position 21 onwards ...
# NOTE(review): confirm that the target 'Yield' and the names listed below do
# not also sit at position 21 or later, otherwise the target would be used as
# a feature or a column would be listed twice.
cont_vars = df_train_val.columns[21:].tolist()

# ... plus these explicitly named columns.
additional_cont_vars = ['JulianPlantDatePerYear', 'Year', 'DTA', 'DTS', 'Moist', 'Population', 'Range', 'Row']
cont_names =  cont_vars + additional_cont_vars 
# Columns passed to TabularPandas as categorical features.
cat_names = ['Pedigree1', 'Pedigree2', 'Stock', 'Test']

# Regression dataset on 'Yield', split with the random train/validation split.
init = TabularPandas(df_train_val,
                   procs,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names='Yield',
                   y_block=RegressionBlock(),
                   splits=splits)

# DataLoaders with a batch size of 64.
data_init = init.dataloaders(bs=64)



In [None]:
# Callbacks used while training the final model.
#early_stopping = EarlyStoppingCallback(monitor='valid_loss', patience=3, min_delta=0.01) #Uncomment if you want to implement early stopping.
# Training metrics are appended to a CSV file.
csvlogger = CSVLogger(f'/path/Metrics.csv', append=True)
# Saves the model under the name 'Tabular_Model', tracking the validation loss.
# NOTE(review): with at_end=True fastai appears to save again when training
# finishes, under the same name -- confirm whether the final-epoch weights or
# the best-valid_loss weights are meant to end up in 'Tabular_Model'.
save_callback = SaveModelCallback(monitor='valid_loss', fname='Tabular_Model', reset_on_fit=False, at_end=True)


In [None]:
# Model configuration: dropout settings ps=0.5 and embed_p=0.5.
config = tabular_config(ps=0.5, embed_p=0.5)

In [None]:
# Tabular learner with two hidden layers (200 and 100 units), outputs bounded
# to [0, 20] via y_range, the ranger optimizer, and RMSE / R2 reported as metrics.
learn_tab = tabular_learner(data_init,
                            config=config,
                            layers=[200,100],
                            metrics=[rmse, R2Score()],
                            opt_func=ranger,
                            y_range=[0,20],
                            wd=1.425482107813348e-06)

In [None]:
# Train for 100 epochs with the one-cycle schedule, passing the CSV logger and
# the model-saving callback. (A stray ")" after lr_max previously made this
# line a syntax error.)
learn_tab.fit_one_cycle(100, lr_max=0.00018479913871295546, cbs=[csvlogger, save_callback])

In [None]:
# Show results for up to 10 unshuffled samples of the validation set (ds_idx=1).
learn_tab.show_results(ds_idx=1, shuffle=False, max_n=10)

In [None]:
# Table holding the model predictions next to the true yield values.
preds = pd.DataFrame(columns=['predictions', 'target_yield'])

# Extract predictions and targets from the learner (get_preds with its default
# arguments) and store them in preds.
ypred, yval = learn_tab.get_preds()
preds['predictions'] = ypred.flatten()
preds['target_yield'] = yval.numpy()  # Convert yval tensor to NumPy array


In [None]:
# Save the predictions to a CSV file.
preds.to_csv('/path/Preds.csv', index=False)  # index=False keeps the row index out of the file


## Holdout Evaluation

In [None]:
# Reload the model weights written by SaveModelCallback.
# NOTE(review): the callback saved under fname='Tabular_Model' while this call
# uses '/path/Tabular_Model' -- confirm both resolve to the same file.
learn_tab.load('/path/Tabular_Model') #Using saved model from SaveModelCallback function.

In [None]:
# Load the holdout dataset used for the final evaluation.
df_test = pd.read_csv('/path/Holdout.csv')

In [None]:
# Predict the grain yield for the holdout dataset.
# A test DataLoader is built from the learner's DataLoaders; reorder=False is
# passed to get_preds.
dl = learn_tab.dls.test_dl(df_test)
test_preds = learn_tab.get_preds(dl=dl, reorder=False)


In [None]:
# Assemble the holdout results: sample barcode, true yield and predicted yield.
df_test2019 = pd.DataFrame()
df_test2019['ID_Predictions'] = df_test['Barcode']
# test_preds is (predictions, targets).
# NOTE(review): the targets are presumably only available when the holdout
# file contains the 'Yield' column -- confirm, otherwise test_preds[1] may be None.
df_test2019['Target_yield'] = test_preds[1].flatten()
df_test2019['Predictions'] = test_preds[0].flatten()


In [None]:
# Save the holdout predictions. Unlike the other to_csv calls in this notebook,
# index=False is not passed here, so the row index is written as well.
df_test2019.to_csv('/path/predictions.csv')

## Feature Importance

In [None]:
from fastinference.tabular import *

In [None]:
# SHAP interpretation of the trained learner, given the holdout DataFrame.
exp = ShapInterpretation(learn_tab, df_test)


In [None]:
# SHAP interpretation of the trained learner without an explicit dataset.
# NOTE(review): fastinference then chooses the rows itself -- check its docs
# for which data is sampled.
exp1 = ShapInterpretation(learn_tab)


In [None]:
exp.summary_plot(plot_type='bar', max_display=10) # Run this to see the top ten features as a bar plot.
 

In [None]:
exp1.summary_plot(plot_type='bar', max_display=20) # Run this to see the top 20 features as a bar plot.
