## Data set preparation for ML model development - correlation based variable pool with RTMA data

### Set up/check environment

In [None]:
# Check environment: print information about the conda installation in use.
# The leading `!` is an IPython shell escape, so this line runs `conda info` as a shell command.
!conda info

In [None]:
# Import packages 
import pandas as pd
import random
import numpy as np
import sklearn
from sklearn import datasets
from datetime import datetime
from itertools import cycle
import glob2
import os
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelBinarizer as lb

# Allow up to 1000 rows to be shown when a dataframe is printed or displayed
pd.options.display.max_rows = 1000

In [None]:
# Load the weather data set from CSV; the path is blank here and must be filled in before running
weather_csv_path = ''
weather_df = pd.read_csv(weather_csv_path)

### Perform correlation-based feature selection

In [None]:
# Reduce the predictor pool using point-biserial and Kendall correlation analyses
# to pick strongly and significantly correlated weather predictors.

# Create the new column 'row_shading', the product of canopy closure ('canopy_avg')
# and row spacing ('spacing (m)'), and place it directly after the 'target' column.
row_shading = weather_df['canopy_avg'] * weather_df['spacing (m)']
# DataFrame.insert mutates weather_df in place and raises ValueError if the column
# already exists, so only insert once; this keeps the cell safe to re-run.
if 'row_shading' not in weather_df.columns:
    insert_index = weather_df.columns.get_loc('target') + 1
    weather_df.insert(insert_index, 'row_shading', row_shading)

# `data` is a second name for the same frame (not a copy), so it includes 'row_shading'
data = weather_df

# select the binary target variable and the list of continuous variables
target_variable = 'target'

# RTMA: 
continuous_variables = ['canopy_avg', 'row_shading', 
                        'MA_MeanAT', 'MA_MeanDP', 'MA_MeanRH', 'MA_MeanWS', 'MA_MeanRH86', 'MA_MeanATD', 'MA_MeanRF_IBM',
                        'MA_MaxAT','MA_MaxDP', 'MA_MaxRH', 'MA_MaxWS', 'MA_MaxRH86','MA_MaxRF_IBM',
                        'MA_MinAT', 'MA_MinDP', 'MA_MinRH', 'MA_MinWS', 'MA_MinRH86','MA_MinATD', 
                        'MA_SumAT', 'MA_SumDP','MA_SumRH', 'MA_SumWS', 'MA_SumRH86', 'MA_SumATD', 'MA_SumRF_IBM']

In [None]:
# Point-biserial correlation between each continuous variable and the target
pointbiserial_stats = [
    stats.pointbiserialr(data[name], data[target_variable])
    for name in continuous_variables
]
# each result unpacks into (correlation, p-value); regroup them into two parallel lists
correlations, p_values = (list(values) for values in zip(*pointbiserial_stats))

# Collect the results in a table, most significant (smallest p-value) first
correlation_results = pd.DataFrame(
    {
        'Variable': continuous_variables,
        'Correlation': correlations,
        'P-value': p_values,
    }
)
correlation_results = correlation_results.sort_values('P-value', ascending=True)

# show the correlation table
display(correlation_results)

In [None]:
# Kendall correlation between each continuous variable and the target
kendall_stats = [
    stats.kendalltau(data[name], data[target_variable])
    for name in continuous_variables
]
# each result unpacks into (correlation, p-value); regroup them into two parallel lists
correlations, p_values = (list(values) for values in zip(*kendall_stats))

# Collect the results in a table, most significant (smallest p-value) first
correlation_results = pd.DataFrame(
    {
        'Variable': continuous_variables,
        'Correlation': correlations,
        'P-value': p_values,
    }
)
correlation_results = correlation_results.sort_values('P-value', ascending=True)

# show the correlation table
display(correlation_results)

In [None]:
# Columns carried into the training/testing data set: the target, the categorical
# soil type, and the predictors picked as highest correlated (unique functions)
# and most biologically informative.
selected_columns = [
    'target',
    'soil type',  # must remove categorical data for augmentation
    'row_shading',
    'MA_MaxRH86',
    'MA_SumATD',
    'MA_SumRF_IBM',
    'MA_MaxWS',
]
weather_reduced_df = weather_df.loc[:, selected_columns]

In [None]:
# Select the numerical predictors that will be max-absolute rescaled (needed because
# of major scale differences between them).
# iloc[:, 2:] skips the first two columns of weather_reduced_df, i.e. 'target' and
# the categorical 'soil type'.
# .copy() makes this an independent frame rather than a slice of weather_reduced_df,
# so rescaling it later neither raises SettingWithCopyWarning nor touches the source.
dt_max_scaled = weather_reduced_df.iloc[:, 2:].copy()
display(dt_max_scaled.head())

### Normalize weather-only data set

In [None]:
# Max-absolute rescaling: divide every column by its largest absolute value.
# A column whose largest absolute value is 0 is divided by 1 instead, so it stays 0
# rather than becoming NaN through 0/0.
max_abs_per_column = dt_max_scaled.abs().max().replace(0, 1)
# Build a new frame in one vectorized step instead of assigning column by column;
# this avoids SettingWithCopyWarning if dt_max_scaled is a slice of another frame.
dt_max_scaled = dt_max_scaled / max_abs_per_column

# plot normalized data
# sns.set is an alias of sns.set_theme, so a single call with font_scale is enough
sns.set_theme(font_scale=0.5)
# note: catplot returns a seaborn FacetGrid (not a matplotlib Axes)
ax = sns.catplot(data=dt_max_scaled, kind='bar')
ax.set_xticklabels(rotation=90, ha="right")
# trailing semicolon hides the object repr in the notebook output
ax.set(title='Max absolute rescaled data');

In [None]:
# Separate predictors (x) from the response (y)
# x: every rescaled column whose name is not 'target'
is_predictor_column = dt_max_scaled.columns != 'target'
xdat = dt_max_scaled.loc[:, is_predictor_column]
# y: the 'target' column of the full weather frame, kept as a one-column dataframe
ydat = weather_df.loc[:, 'target'].to_frame()

### Weather data load out 

### Data splitting
* Training: 80%, testing: 20%  
* Stratified by the binary apothecia-threshold target

In [None]:
# Return the categorical soil type variable to the predictor frame before
# stratification and load out.
# The column is selected by name rather than by position, and any 'soil type' column
# already present in xdat is dropped first, so re-running this cell does not produce
# duplicated 'soil type_x' / 'soil type_y' columns.
soil_type = weather_reduced_df['soil type']
xdat = pd.merge(
    soil_type,
    xdat.drop(columns='soil type', errors='ignore'),
    left_index=True,
    right_index=True,
)

In [None]:
# Encode the categorical soil type as integer codes (ML cannot handle categorical).
# Values that are not keys of this mapping are left unchanged by replace().
soil_type_codes = {'sand': 0, 'loamy sand': 1, 'loam': 2}
# Assign the result back to the column: calling replace(..., inplace=True) on
# xdat['soil type'] acts on a temporary object and does not update xdat when pandas
# Copy-on-Write is active. infer_objects() turns an object column that now holds only
# integers into a proper integer dtype.
xdat['soil type'] = xdat['soil type'].replace(soil_type_codes).infer_objects()

In [None]:
# 80/20 train/test split with a fixed seed, stratified by ydat
split_options = dict(test_size=0.20, random_state=42, stratify=ydat)
x_train, x_test, y_train, y_test = train_test_split(xdat, ydat, **split_options)

In [None]:
# Show the training and testing data sets for inspection,
# in the order: x_train, y_train, x_test, y_test
for split_frame in (x_train, y_train, x_test, y_test):
    display(split_frame)

In [None]:
# Check that the row indexes of x and y match for the training and the testing split.
# The prints allow a visual comparison; the assertions below make a mismatch fail loudly
# instead of relying on eyeballing the output.
print('x training indexes:', x_train.index)
print('y training indexes:', y_train.index)

print('x testing indexes:', x_test.index)
print('y testing indexes:', y_test.index)

assert x_train.index.equals(y_train.index), 'x_train and y_train indexes differ'
assert x_test.index.equals(y_test.index), 'x_test and y_test indexes differ'

In [None]:
# Stratification check: the share of positive cases (target = 1) should be about
# the same in the training split (printed first) and the testing split (printed second)
for split_targets in (y_train, y_test):
    positive_share = sum(split_targets['target']) / len(split_targets)
    print(positive_share)

### Weather only - data load out

In [None]:
# Write each split to CSV (header row kept, index column dropped).
# The output paths are blank here and must be filled in before running.
csv_outputs = [
    (x_train, ''),
    (y_train, ''),
    (x_test, ''),
    (y_test, ''),
]
for split_frame, csv_path in csv_outputs:
    split_frame.to_csv(csv_path, index=False, header=True)