In [None]:
# Target column analysed by this notebook. Only the first entry is read below:
# it selects the export folders and decides which data frames are kept.
target_features = ['GrainYield']
# target_features = ['Days2Maturity']

# Importing Libraries

In [None]:
import os
import csv
import numpy as np
import pandas as pd
from copy import copy
from datetime import datetime as dt

# Dictionaries
import json
from pprint import pprint

# Iterate in loops
import itertools
from itertools import zip_longest

# Numerical integration (trapezoidal rule and Simpson's rule)
from numpy import trapz
from scipy.integrate import simps

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# To display df nicely in loops
from IPython.display import display 
# Let pandas show up to 100 rows and 100 columns before truncating the output
pd.set_option('display.max_columns', 100)
pd.options.display.max_rows = 100

# # For displaying max rows in series
# pd.options.display.max_rows = 10

# Importing data

In [None]:
# Show the current working directory as the cell output.
# The relative './Data/...' paths declared further down are resolved against it.
os.getcwd()
# os.listdir()

## Finding Username folder to make general path for multi PC use

In [None]:
# Take the user name and user folder from the account's home directory rather
# than from the working directory, so the cell also works when the notebook is
# not started below C:\Users\<name> or is run on a non-Windows machine.
home_dir = os.path.expanduser('~')
username = os.path.basename(home_dir.rstrip('/\\'))
# Forward slashes and exactly one trailing slash, e.g. 'C:/Users/<name>/'
user_path = home_dir.replace('\\', '/').rstrip('/') + '/'
username, user_path

## Declaring Import paths

In [None]:
# Folder holding the JSON column lists, and folder holding the merged CSV files
main_path = r'./Data/'
path = r'./Data/3. merged data/'

# Each supported target feature writes its results to its own folder
export_paths_by_target = {
    'GrainYield': './Data/4. results/',
    'Days2Maturity': './Data/4. results_dm/',
}
if target_features[0] not in export_paths_by_target:
    # Fail here with a clear message instead of a NameError on export_path below
    raise ValueError(
        f'Unsupported target feature {target_features[0]!r}; '
        f'expected one of {sorted(export_paths_by_target)}'
    )
export_path = export_paths_by_target[target_features[0]]
export_path_comparability = export_path + 'comparability/'

# Create the folders if they do not exist already
for folder in (path, export_path, export_path_comparability):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Dictionary mapping each folder name to the files it contains,
# in case there are multiple types of data
dict_paths = {}
def explore(starting_path):
    """Record the files of every folder below ``starting_path`` in ``dict_paths``.

    Each folder visited by ``os.walk`` (the starting folder included) is stored
    under its own name, whether or not the path ends with a separator and
    whichever separator the operating system uses. Folders that share the same
    name overwrite each other; the last one visited wins.

    Parameters
    ----------
    starting_path : str
        Root folder of the tree to scan.
    """
    for dirpath, _dirnames, filenames in os.walk(starting_path):
        # abspath drops a trailing separator and normalises the path, so
        # basename is always the folder's own name (never its parent's)
        folder_name = os.path.basename(os.path.abspath(dirpath))
        dict_paths[folder_name] = filenames
# Fill dict_paths with the files found under the merged-data folder
explore(path)

# Data Preparation
## Creating list of complete files

In [None]:
# Walk the merged-data folder tree and collect every file twice:
# as a full path (used to read the file) and as a bare file name
# (used by the duplicate-name check).
files_with_address, files_list = [], []

for dirpath, dirnames, filenames in os.walk(path):
    files_list += filenames
    files_with_address.extend(os.path.join(dirpath, name) for name in filenames)

print(len(files_with_address), 'files found in the directory')

## Data Checking/control

### Check for duplicate filenames

In [None]:
# Stop the notebook when two files in the folder tree share the same name
unique_names = set(files_list)
n_duplicates = len(files_list) - len(unique_names)

print('Total number of files are :', len(files_list))

print('Number of unique file names are:', len(unique_names))

print('There is/are', n_duplicates,'duplicate file name/names.')
if n_duplicates > 0:
    # Collect the offending names so the error says which files to look at
    seen_names, repeated_names = set(), set()
    for name in files_list:
        if name in seen_names:
            repeated_names.add(name)
        seen_names.add(name)
    raise NameError(f'Duplicate file names found: {sorted(repeated_names)}')

# Importing data files to Pandas

In [None]:
# Read every file into its own data frame. Each frame is stored as a
# notebook-level variable named after the file, and the names are kept in all_df.

# Spaces and dashes become underscores; brackets and dots are removed
name_cleanup = str.maketrans({' ': '_', '-': '_', '(': None, ')': None, '.': None})

all_df = []
for data in files_with_address:
    file_stem = os.path.splitext(os.path.basename(data))[0]
    df_name = file_stem.translate(name_cleanup)

    # Two files that map to the same variable name would overwrite each other
    if df_name in all_df:
        print(f'A file with the same name {df_name} has already been imported. \n Please check if there is duplication of data.')
        raise NameError
    all_df.append(df_name)

    frame = pd.read_csv(data, index_col=False)
    locals()[df_name] = frame
    print(df_name, '=====', frame.shape)

In [None]:
# Number of data frames created by the import loop
print('Total imported', len(all_df))

## Importing the weather variable, yield column, spectral index and base index column lists

In [None]:
# Load the Vollebekk weather columns from their JSON file.
# The with-block closes the file even if reading or parsing fails.
with open(main_path+'vollebekk_weather_columns.json', "r") as a_file:
    weather_cols_vollebekk = json.load(a_file)

# Number of entries that were loaded
pprint(len(weather_cols_vollebekk))

In [None]:
# Load the Staur weather columns from their JSON file.
# The with-block closes the file even if reading or parsing fails.
with open(main_path+'staur_weather_columns.json', "r") as a_file:
    weather_cols_staur = json.load(a_file)

# Number of entries that were loaded
pprint(len(weather_cols_staur))

In [None]:
# Load the yield columns from their JSON file.
# The with-block closes the file even if reading or parsing fails.
with open(main_path+"yield_columns.json", "r") as a_file:
    yield_cols = json.load(a_file)
print(yield_cols)

In [None]:
# Load the complete set of spectral index columns from its JSON file.
# The with-block closes the file even if reading or parsing fails.
with open(main_path+"spectral_indices_columns.json", "r") as a_file:
    spectral_indices_all = json.load(a_file)
print(spectral_indices_all)

In [None]:
# Load the base index columns from their JSON file.
# The with-block closes the file even if reading or parsing fails.
with open(main_path+"base_indices_columns.json", "r") as a_file:
    base_indices = json.load(a_file)
print(base_indices)

## Defining categories of features

In [None]:
# TODO: add a check for duplicate columns in the data frames

# Bare references: they do nothing except raise a NameError when the
# JSON cells above have not been run yet.
base_indices
spectral_indices_all

# Spectral indices left out of the plots and correlations below
drop_indices = ['EVI', 'GLI', 'MTCI']
spectral_indices = list(filter(lambda index_name: index_name not in drop_indices, spectral_indices_all))

# All Staur weather columns are also present in the Vollebekk weather data,
# so they can be used as the general weather features.
weather_features = weather_cols_staur.copy()
environment_var = weather_features + ['Staur_Env'] + ['Vollebekk_Env']

# Dropping DataFrames which do not have the target feature

In [None]:
# Keep only the data frames that contain the target feature column
all_df_dm = []
for df in all_df:
    # Only the column labels are inspected, so the frame does not need copying
    if target_features[0] in locals()[df].columns:
        all_df_dm.append(df)

In [None]:
# Split the retained data-frame names by the tag found in their name:
# 'Simps' or 'Trapz'. Names carrying neither tag end up in neither list.
all_df_simps, all_df_trapz = [], []
for candidate in all_df_dm:
    if 'Simps' in candidate:
        all_df_simps.append(candidate)
    if 'Trapz' in candidate:
        all_df_trapz.append(candidate)

# Dropping unnecessary columns


In [None]:
# Reduce every data frame to the columns used in the analysis: base indices,
# all spectral indices, environment variables, 'Name' and the target feature.
selected_columns = base_indices + spectral_indices_all + environment_var + ['Name', target_features[0]]
for df in all_df_dm:
    before = locals()[df].copy()
    after = before[selected_columns]
    locals()[df] = after
    # Shape before ==> shape after
    print(df, before.shape, '==>', after.shape)

# Dealing with NaN values

## Dropping Missing values

In [None]:
# Remove every row that has a missing value in at least one column
for df in all_df_dm:
    with_gaps = locals()[df].copy()
    complete_rows = with_gaps.dropna(how='any')
    locals()[df] = complete_rows
    print(len(with_gaps) - len(complete_rows), ' rows dropped in ', df)

# Data Normalization

## Plot one index for different fields to check comparability

In [None]:
# One figure per band/index with one box per Simpson-integrated data set,
# so the value ranges of the different fields can be compared side by side.
# Every figure is saved as '<index>_box.jpg' in the comparability folder.
for col in base_indices+spectral_indices:
    fig, ax = plt.subplots(figsize=(8, 5))

    for position, df in enumerate(all_df_simps):
        name_parts = df.split('_')
        # Tick label: first five characters of the first name part + second part
        ax.boxplot(
            locals()[df][col].values,
            positions=[position],
            labels=[name_parts[0][:5]+'_'+name_parts[1]],
        )

    # Print the band/index name inside the plot for reference
    ax.text(.98, .98, col, ha='right', va='top', weight=100, color='blue', fontsize ='xx-large', transform=ax.transAxes)

    # No legend: the boxes are identified by their x tick labels
    fig.tight_layout()
    fig.savefig(export_path_comparability+col+'_box.jpg', dpi=250, bbox_inches='tight')
    plt.show()

In [None]:
# One figure per band/index: the values of every Simpson-integrated data set
# are sorted in ascending order and drawn as one line per data set.
for col in base_indices+spectral_indices:
    fig, ax = plt.subplots(figsize=(8, 5))

    for df in all_df_simps:
        name_parts = df.split('_')
        ordered_values = sorted(locals()[df][col].values)
        ax.plot(ordered_values, label=name_parts[0]+'_'+name_parts[1])

    # Print the band/index name inside the plot for reference
    ax.text(.87, .6, col, ha='center', va='top', weight=100, color='blue', fontsize ='xx-large', transform=ax.transAxes)

    ax.legend(loc=1)
    plt.tight_layout()
    # Export is switched off for this plot; to enable it use
    # fig.savefig(export_path_comparability+col+'_sorted.jpg', dpi=250, bbox_inches='tight')
    plt.show()

In [None]:
# One figure per band/index: the values of every Simpson-integrated data set
# are drawn in their stored row order, one line per data set.
for col in base_indices+spectral_indices:
    fig, ax = plt.subplots(figsize=(8, 5))

    for df in all_df_simps:
        name_parts = df.split('_')
        row_order_values = locals()[df][col].values
        ax.plot(row_order_values, label=name_parts[0]+'_'+name_parts[1])

    # Print the band/index name inside the plot for reference
    ax.text(.87, .6, col, ha='center', va='top', weight=100, color='blue', fontsize ='xx-large', transform=ax.transAxes)

    ax.legend(loc=1)
    plt.tight_layout()
    # Export is switched off for this plot; to enable it use
    # fig.savefig(export_path_comparability+col+'_random.jpg', dpi=250, bbox_inches='tight')
    plt.show()

## Normalizing the data using Z-Score from scipy

In [None]:
from scipy.stats import zscore

# Columns that keep their original scale: yield columns and environment variables
unscaled_cols = set(yield_cols+environment_var)

# Standardise the remaining columns of every data frame, each frame on its own
for df in all_df_dm:
    temp_df = locals()[df].copy()
    # zscore needs numbers; text columns (e.g. an identifier) are left as they are
    scalable_cols = set(temp_df.select_dtypes(include=['number', 'bool']).columns)
    for col in temp_df.columns:
        if col in unscaled_cols or col not in scalable_cols:
            continue
        temp_df[col] = zscore(temp_df[col])
    locals()[df] = temp_df
    print(df)

## Checking comparability after normalization

In [None]:
# Same box plots as before normalization: one figure per band/index with one
# box per Simpson-integrated data set. Every figure is saved as
# '<index>_norm_box.jpg' in the comparability folder.
for col in base_indices+spectral_indices:
    fig, ax = plt.subplots(figsize=(8, 5))

    for position, df in enumerate(all_df_simps):
        name_parts = df.split('_')
        # Tick label: first five characters of the first name part + second part
        ax.boxplot(
            locals()[df][col].values,
            positions=[position],
            labels=[name_parts[0][:5]+'_'+name_parts[1]],
        )

    # Print the band/index name inside the plot for reference
    ax.text(.98, .98, col, ha='right', va='top', weight=100, color='blue', fontsize ='xx-large', transform=ax.transAxes)

    # No legend: the boxes are identified by their x tick labels
    fig.tight_layout()
    fig.savefig(export_path_comparability+col+'_norm_box.jpg', dpi=250, bbox_inches='tight')
    plt.show()

# Correlation

## Scatter Plot

In [None]:
# For every Simpson-integrated data set, plot each base index against the
# target feature: a filled 2-D density estimate with the raw points on top.
for df in all_df_simps:
    temp_df = locals()[df][base_indices+spectral_indices+[target_features[0]]].copy()
    for col in base_indices:
        # Name of the data set, printed above its figure
        print(df)
        df_a = temp_df[col]
        df_b = temp_df[target_features[0]]

        fig, ax = plt.subplots(1, figsize=(12,8))
        # x and y are passed by keyword so the estimate is bivariate whichever
        # argument seaborn takes first; `fill` is the current name of `shade`.
        # clip: the density is not evaluated outside these limits.
        sns.kdeplot(x=df_a, y=df_b, cmap='Blues',
                    fill=True, thresh=0.05, clip=(-1,300), ax=ax)
        ax.scatter(df_a, df_b, color='orangered')
        plt.show()

## Heat Map

In [None]:
# Correlation heat map of the target feature and the spectral indices,
# one figure per Simpson-integrated data set.
for df in all_df_simps:
    print(df)
    corr = locals()[df][[target_features[0]]+spectral_indices].corr()

    fig, ax = plt.subplots(figsize=(15,8))

    # Mask the diagonal and upper triangle so each pair is shown only once
    mask = np.triu(np.ones_like(corr, dtype=bool))

    sns.heatmap(
        corr, mask=mask,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
        ax=ax
    )

    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right'
    )
    # Export is switched off for this plot; to enable it use
    # fig.savefig(export_path+df+'_corr.jpg', dpi=250, bbox_inches='tight')

    plt.show()

In [None]:
# Correlation heat map of the target feature, base indices and spectral indices
# over all Simpson-integrated data sets combined, except the 'Robot' ones.
temp_list = [x for x in all_df_simps if not 'Robot' in x]

# List of the actual data frames (not their names) to concatenate
df_list = []
for x in temp_list:
    df_list.append(locals()[x])

df_ = pd.concat(df_list)

data = df_[[target_features[0]]+base_indices+spectral_indices]
corr = data.corr()

fig, ax = plt.subplots(figsize=(15,8))

# Mask the diagonal and upper triangle so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(
    corr, mask=mask,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax
)

ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
# Export is switched off for this plot; to enable it use
# fig.savefig(export_path+'all_mix_corr.jpg', dpi=250, bbox_inches='tight')

plt.show()

## Restart the kernel

In [None]:
# Ask the notebook front end to restart the kernel by emitting a small script as the cell output.
# NOTE(review): the script uses the `Jupyter` JavaScript object, which presumably exists only in the
# classic Notebook interface — confirm whether it has any effect in JupyterLab or VS Code.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")