# Weather and Genomics Data

Note: Half datasets, with separate files for east and west subplots have been merged manually in excel

In [1]:
%%time

import os
import math
import datetime
import numpy as np
import pandas as pd
from copy import copy

# Dictionaries
import json
from pprint import pprint

# Iterate in loops
from itertools import zip_longest

# Simpsons integration
from numpy import trapz
from scipy.integrate import simps

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# To display df nicely in loops
from IPython.display import display 
# display(df1.head()) 
# display(df2.head())

# Display rows and columns Pandas
pd.options.display.max_columns = 100
pd.set_option('display.max_rows',100)

# # For displaying max rows in series
# pd.options.display.max_rows = 10

Wall time: 1.74 s


In [2]:
# Prints the current working directory
os.getcwd()
# os.listdir()

'C:\\Users\\fahad\\MegaSync\\NMBU\\GitHub\\vPheno'

## Finding Username folder to make general path for multi PC use

In [3]:
username = str(os.getcwd()).split('\\')[2]
user_path = r'C:/Users/'+username+'/'
username, user_path

('fahad', 'C:/Users/fahad/')

## Importing Data

In [4]:
main_path = r'./Data/'
path = r'./Data/2. renamed_merged/'
export_path = r'./Data/3. merged data/'
# temp_export_path = r'./Data/3. Temp_Data/'
weather_data_vollebekk = user_path+r'\\MegaSync\NMBU\Master Thesis\Data\Weather\Weather_Data_Ås-Vollebekk.csv'
weather_data_staur = user_path+r'\\MegaSync\NMBU\Master Thesis\Data\Weather\Weather_Data_Ilseng-Staur.csv'
genomics_data = user_path+r'\\MegaSync\NMBU\Master Thesis\Data\Genomics\\'

# Create export_path folder if not exists already
os.makedirs(path, exist_ok=True)
os.makedirs(export_path, exist_ok=True)
# os.makedirs(temp_export_path, exist_ok=True)

os.listdir(path)

['Graminor_2019_all.csv',
 'Graminor_2020_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all_lodg.csv',
 'Robot_2020_all.csv',
 'Staur_2019_all.csv',
 'Staur_2020_all_lodg.csv']

## Data Preparation
### Creating list of complete files

In [5]:
# Get the list of all files in directory tree at given path

files_with_address = []
files_list = []

for (dirpath, dirnames, filenames) in os.walk(path):
    files_with_address += [os.path.join(dirpath, file) for file in filenames]
    files_list.extend(filenames)
    
print(len(files_with_address), 'files found in the directory')
# files_with_address
files_list

7 files found in the directory


['Graminor_2019_all.csv',
 'Graminor_2020_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all_lodg.csv',
 'Robot_2020_all.csv',
 'Staur_2019_all.csv',
 'Staur_2020_all_lodg.csv']

## Data Checking/control

### Check for duplicate filenames

In [6]:
print('Total number of files are :', len(files_list))

print('Number of unique file names are:', len(set(files_list)))

print('There is/are', len(files_list) - len(set(files_list)),'duplicate file name/names.')
if len(files_list) - len(set(files_list)) > 0:
    raise NameError

Total number of files are : 7
Number of unique file names are: 7
There is/are 0 duplicate file name/names.


## Importing data files to Pandas

In [7]:
   
%%time

all_df = []
for data in files_with_address:
    file_name = os.path.splitext(os.path.basename(data))[0]

    # Replce all invalid characters in the name
    file_name = file_name.replace(" ", "_")
    file_name = file_name.replace("-", "_")
    file_name = file_name.replace(")", "")
    file_name = file_name.replace("(", "")
    df_name = file_name.replace(".", "")
    # Test: Check if the same date is already present in the current dict key
    if df_name in all_df:
        print(f'A file with the same name {df_name} has already been imported. \n Please check if there is duplication of data.')
        raise NameError
    all_df.append(df_name)

    locals()[df_name] = pd.read_csv(data, index_col=False)
    print(df_name, '=====', locals()[df_name].shape)
# all_df

Graminor_2019_all ===== (600, 381)
Graminor_2020_all ===== (800, 381)
Masbasis_2019_all ===== (528, 280)
Masbasis_2020_all_lodg ===== (659, 418)
Robot_2020_all ===== (96, 485)
Staur_2019_all ===== (1328, 181)
Staur_2020_all_lodg ===== (1504, 212)
Wall time: 712 ms


In [8]:
print(f'Total imported {len(all_df)}')
all_df

Total imported 7


['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

## Summary of imported data

In [9]:
for df in all_df:
    temp_df = locals()[df].copy()
    print(df, temp_df.shape)

Graminor_2019_all (600, 381)
Graminor_2020_all (800, 381)
Masbasis_2019_all (528, 280)
Masbasis_2020_all_lodg (659, 418)
Robot_2020_all (96, 485)
Staur_2019_all (1328, 181)
Staur_2020_all_lodg (1504, 212)


## GrainYield data 

In [10]:
# Masbasis_2019_Simps.info(null_counts=True)
for df in all_df:
    temp_df = locals()[df].copy()
#     print('*************', df, '**************')
#     print(locals()[df].info())

    print (df)
    print('Grain Yield data missing for ', temp_df['GrainYield'].isna().sum(), 'out of ', temp_df.shape[0])
# Graminor_2019_Simps.info()

Graminor_2019_all
Grain Yield data missing for  1 out of  600
Graminor_2020_all
Grain Yield data missing for  1 out of  800
Masbasis_2019_all
Grain Yield data missing for  6 out of  528
Masbasis_2020_all_lodg
Grain Yield data missing for  116 out of  659
Robot_2020_all
Grain Yield data missing for  0 out of  96
Staur_2019_all
Grain Yield data missing for  0 out of  1328
Staur_2020_all_lodg
Grain Yield data missing for  568 out of  1504


# Looking for zero values in data

In [11]:
# Masbasis_2019_Simps.info(null_counts=True)
for df in all_df:
    temp_df = locals()[df].copy()
    for col in temp_df.columns.to_list():
        if (temp_df[col]==0).sum() >0:
#             print(temp_df.columns.to_list())
            print(df, col, (temp_df[col]==0).sum())

Masbasis_2020_all_lodg Lodging 502
Staur_2020_all_lodg Lodging 157


# Importing Yield columns and Spectral Indices

In [12]:
a_file = open(main_path+"yield_columns.json", "r")
output_str = a_file.read()

# The file is imported as string
# Converting it to dictionary
yield_cols = json.loads(output_str)
a_file.close()
print(yield_cols)

['GrainYield', 'CodeName', 'Block', 'Line', 'Entry', 'Replicates', 'Maturity_Date', 'Days2Maturity', 'Pedigree', 'Lodging', 'Heading_Date', 'iBlock', 'Days2Heading', 'Name']


In [13]:
a_file = open(main_path+"spectral_indices_columns.json", "r")
output_str = a_file.read()

# The file is imported as string
# Converting it to dictionary
spectral_indices = json.loads(output_str)
a_file.close()
print(spectral_indices)

['NDVI', 'MTCI', 'DVI', 'GDVI', 'MTCI_CI', 'EXG', 'EXGR', 'RDVI', 'TDVI', 'GNDVI', 'NDRE', 'SCCI', 'EVI', 'TVI', 'VARI', 'GARI', 'GCI', 'GLI', 'NLI', 'MNLI', 'SAVI', 'GSAVI', 'OSAVI', 'GOSAVI', 'MSAVI2', 'MSR', 'GRVI', 'WDRVI', 'SR']


In [14]:
a_file = open(main_path+"base_indices_columns.json", "r")
output_str = a_file.read()

# The file is imported as string
# Converting it to dictionary
base_indices = json.loads(output_str)
a_file.close()
print(base_indices)

['Blue', 'Green', 'Red', 'RedEdge', 'NIR']


Zeros only present in Lodging columns where present in the dataset.

# Finding yield columns

In [15]:
# ToDo: Add check for duplicate columns in the df

base_indices

spectral_indices

yield_cols

id_cols_new = ['Plot_ID']

# Counter for location of column in columns list

# Dict for saving the name and location of the yield column/s
loc_yield_cols = {}
for df in all_df:
    loc = 0
    for cols in locals()[df].columns.tolist():
        for y_col in yield_cols:
            if not cols.find(y_col):
                loc_yield_cols[cols+'_'+df] = loc
                print(f'\"{cols}\" column in {df} is the yield column\n as it contains the text \"{y_col}\". It is located at location {loc}')
        loc += 1

    yield_cols_found = list(loc_yield_cols.keys())
    target_cols=yield_cols_found[0]
loc_yield_cols

"GrainYield" column in Graminor_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Block" column in Graminor_2019_all is the yield column
 as it contains the text "Block". It is located at location 7
"iBlock" column in Graminor_2019_all is the yield column
 as it contains the text "iBlock". It is located at location 8
"Entry" column in Graminor_2019_all is the yield column
 as it contains the text "Entry". It is located at location 9
"Name" column in Graminor_2019_all is the yield column
 as it contains the text "Name". It is located at location 10
"Pedigree" column in Graminor_2019_all is the yield column
 as it contains the text "Pedigree". It is located at location 11
"GrainYield" column in Graminor_2020_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Block" column in Graminor_2020_all is the yield column
 as it contains the text "Block". It is located at location 7
"iBlock" column in Graminor_

{'GrainYield_Graminor_2019_all': 6,
 'Block_Graminor_2019_all': 7,
 'iBlock_Graminor_2019_all': 8,
 'Entry_Graminor_2019_all': 9,
 'Name_Graminor_2019_all': 10,
 'Pedigree_Graminor_2019_all': 11,
 'GrainYield_Graminor_2020_all': 6,
 'Block_Graminor_2020_all': 7,
 'iBlock_Graminor_2020_all': 8,
 'Entry_Graminor_2020_all': 9,
 'Name_Graminor_2020_all': 10,
 'Pedigree_Graminor_2020_all': 11,
 'GrainYield_Masbasis_2019_all': 6,
 'Replicates_Masbasis_2019_all': 7,
 'Block_Masbasis_2019_all': 8,
 'Name_Masbasis_2019_all': 9,
 'Line_Masbasis_2019_all': 10,
 'Days2Heading_Masbasis_2019_all': 11,
 'Days2Maturity_Masbasis_2019_all': 12,
 'GrainYield_Masbasis_2020_all_lodg': 6,
 'Replicates_Masbasis_2020_all_lodg': 7,
 'Block_Masbasis_2020_all_lodg': 8,
 'Name_Masbasis_2020_all_lodg': 9,
 'Line_Masbasis_2020_all_lodg': 10,
 'Maturity_Date_Masbasis_2020_all_lodg': 11,
 'Days2Heading_Masbasis_2020_all_lodg': 12,
 'Days2Maturity_Masbasis_2020_all_lodg': 13,
 'Lodging_Masbasis_2020_all_lodg': 14,
 'G

# Finding dates between heading and maturity

In [16]:
yield_cols

['GrainYield',
 'CodeName',
 'Block',
 'Line',
 'Entry',
 'Replicates',
 'Maturity_Date',
 'Days2Maturity',
 'Pedigree',
 'Lodging',
 'Heading_Date',
 'iBlock',
 'Days2Heading',
 'Name']

In [17]:
for df in all_df:
    temp_df = locals()[df].copy()
    if 'Days2Maturity' in temp_df.columns:
        print(df)
all_df

Masbasis_2019_all
Masbasis_2020_all_lodg
Robot_2020_all
Staur_2019_all


['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

## Declaring the important dates for each field

In [18]:
# Dates listed in dict in order; sowing, heading, maturity
# The order of fields must be the same as in all_df list
# sowing_dict = {
#     'Graminor_2019': ['240419', 'XX', 'XX'],
#     'Graminor_2020': ['150420', 'XX', 'XX'],
#     'Masbasis_2019': ['190519', 'XX', 'XX'],
#     'Masbasis_2020': ['150520', 'XX', 'XX'],
#     'Robot_2020': ['200420', '170620', '310720'],
#     'Staur_2019': ['040619', 'XX', 'XX'],
#     'Staur_2020': ['210420', 'XX', 'XX'],
# }

sowing_dict = {
    'Graminor_2019': '240419',
    'Graminor_2020': '150420',
    'Masbasis_2019': '190519',
    'Masbasis_2020': '150520',
    'Robot_2020': '200420',
    'Staur_2019': '040619',
    'Staur_2020': '210420',
}


## Filtering df which have Days2Maturity and Days2Heading 

In [19]:
# If the dataset had Days 2 heading and days to maturity columns then create the
# following dictionary with the respective sowing dates of each field as value
all_df_sowing = {}

for df in all_df:
    temp_df = locals()[df].copy()
    field_temp = df.split('_')[0]+'_'+df.split('_')[1]
#     if 'Days2Heading' in temp_df.columns and 'Days2Maturity' in temp_df.columns:
#         print(df)
#         all_df_sowing[df] = sowing_dict[field_temp]
    all_df_sowing[df] = sowing_dict[field_temp]

all_df_sowing

{'Graminor_2019_all': '240419',
 'Graminor_2020_all': '150420',
 'Masbasis_2019_all': '190519',
 'Masbasis_2020_all_lodg': '150520',
 'Robot_2020_all': '200420',
 'Staur_2019_all': '040619',
 'Staur_2020_all_lodg': '210420'}

## Average DH and DM

In [20]:
# Creating a dictionary with average Days2Heading and Days2Maturity for fields whose data is available
dict_avg_dh_dm = {}
df_dh_dm = []
for df in all_df_sowing.keys():
    temp_df = locals()[df].copy()

#     print('Days2Heading')
#     print(locals()[df].Days2Heading.min(), ':      ', locals()[df].Days2Heading.max(), ':     ', locals()[df].Days2Heading.mean())
#     print('Days2Maturity')
#     print(locals()[df].Days2Maturity.min(), ':      ', locals()[df].Days2Maturity.max(), ':     ', locals()[df].Days2Maturity.mean())

    if 'Days2Heading' in temp_df.columns and 'Days2Maturity' in temp_df.columns:
        df_dh_dm.append(df)
        dict_avg_dh_dm[df] = [locals()[df].Days2Heading.mean(), locals()[df].Days2Maturity.mean()]
dict_avg_dh_dm


{'Masbasis_2019_all': [68.18939393939394, 108.64393939393939],
 'Masbasis_2020_all_lodg': [66.28983308042488, 87.94881170018282],
 'Robot_2020_all': [61.09375, 110.84375],
 'Staur_2019_all': [48.53333333333333, 101.25757575757575]}

In [21]:
from statistics import mean

list_dh = []
list_dm = []
for field, dhdm in dict_avg_dh_dm.items():
    list_dh.append(dhdm[0])
    list_dm.append(dhdm[1])
mean_dh = mean(list_dh)
mean_dm = mean(list_dm)
print(f'Average Days2Heading is {mean_dh}')
print(f'Average Days2Maturity is {mean_dm}')


Average Days2Heading is 61.026577588288035
Average Days2Maturity is 102.17351921292449


# Integration

In [22]:
from scipy.integrate import simps
from numpy import trapz
from scipy.integrate import cumulative_trapezoid
from scipy.integrate import romb

## Testing different alternatives

In [23]:
# simps(temp_entries_dropna, days_sow)

In [24]:
# days_sow

# days = [50, 64, 72, ((72 + 87) / 2), 87]
# band = [21, 14, 9, ((9 + 2) / 2), 2]

# days2 = [50, 64, 72, 87]
# band2 = [21, 14, 9, 2]

# simps(band, days), simps(band2, days2), 

In [25]:
# temp_entries_dropna

In [26]:
all_df

['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

## Creating df with Plot_ID and Grain_Yield only
## Calculating AUC and creating new df with calculated values

In [27]:
simp_df_all = []
trapz_df_all = []
samples_record_simps = {}
for df, sowing in all_df_sowing.items():

    temp_df = locals()[df].copy()
    cols = temp_df.columns
    
    # Creating a list of columns which other than the indices (ID and yield columns)
    # Making a temp list of yield columns since all entries from yield cols are not present in every df
    temp_yield_cols = [x for x in temp_df.columns if x in yield_cols]
    non_indices_cols = id_cols_new+temp_yield_cols
#     print(non_indices_cols)
    
    df_auc_simps = temp_df[non_indices_cols].copy()
    df_auc_trapz = temp_df[non_indices_cols].copy()
#     display(df_auc.head())

    # Calculating AUC and creating new df with calculated values
    temp_samples = {}
    for col_name in base_indices+spectral_indices:
        df_simp = []
        df_trapz = []
        # Making temp_cols list avoids problems finding and differentiating 'OSAVI' and 'GOSAVI'
        temp_cols = [x for x in cols if col_name.split('_') == x.split('_')[:-1]]
        temp_dates = [datetime.datetime.strptime(date.split('_')[-1], '%d%m%y').date() for date in temp_cols]

        # Calculating the days from sowing,i.e. age of the crop in days
        sowing_date = datetime.datetime.strptime(sowing, '%d%m%y').date() 
        
        temp_samples_list = []
        for sample in range(temp_df.shape[0]):
            # Number of days since sowing for each entry
            days_sow = [(x-sowing_date).days for x in temp_dates]
            # The respective value of the index in question 
            temp_entries = [temp_df[x][sample] for x in temp_cols]

            #### DROPPING DATES OUTSIDE HEADING AND MATURITY DATES ####
            
            # Determining Days2Heading values
            if 'Days2Heading' in temp_df.columns:
                # Days to heading for current sample 
                DH = temp_df.Days2Heading[sample]
                # If DH is missing then use the smallest of Mean DH from fields whose DH is available
                if str(DH)=='nan':
                    DH = round(min(list_dh))
            else:
                DH = round(min(list_dh))
                
                
                
                
            if 'Days2Maturity' in temp_df.columns:
                # Days to maturity for current sample 
                DM = temp_df.Days2Maturity[sample]
                # If DM is missing then use the largest of Mean DM from fields whose DM is available
                if str(DM)=='nan':
                    DM = round(max(list_dm))
            else:
                DM = round(max(list_dm))  
                    
            DH = int(DH)
            DM = int(DM)
            # Making sure that the maturity comes after heading
            if DM < DH:
                print(DM, DH)
            assert DM > DH
#             print(DM, DH)
            heading_date = sowing_date + datetime.timedelta(days=DH)
            maturity_date = sowing_date + datetime.timedelta(days=DM)
            
            # Replacing the respective values of items in temp_entries with np.nan which correspond 
            # to dates not in between heading and maturity for that specific sub-plot
            temp_entries_filtered = [y if heading_date <= x <= maturity_date else np.nan for x,y in zip(temp_dates, temp_entries)]
            
            # Dropping missing(nan) values from the entries
            temp_entries_dropna = [x for x in temp_entries_filtered if str(x) != 'nan']
            days_before = days_sow.copy()
            # Checking if the number of items in temp_entries_filtered and days_sow is the same
            # If not, i.e., there are missing values(nan) in temp_entries_filtered then drop the
            # respective entries from days_sow list
            if not len(temp_entries_dropna) == len(days_sow):
                # Dictionary comprehension
                # Creating dictionary(dict comprehension) where temp_entries_filtered are not nan
                dict_dropna = {i: [temp_entries_filtered[i], days_sow[i]] for i in range(len(temp_entries_filtered))\
                       if not str(temp_entries_filtered[i]) == 'nan' }
                
                # Checking if the previously created temp_entries_dropna is the same as the new that will
                # be created from dict_dropna (Unnecessary check but curious to check if any problems arise)
                assert temp_entries_dropna == [dict_dropna[i][0] for i in dict_dropna.keys()]
                
                # Creating new temp entries and days_sow after dropping nan and respective entries in days_sow
                temp_entries_dropna = [dict_dropna[i][0] for i in dict_dropna.keys()]
                days_sow = [dict_dropna[i][1] for i in dict_dropna.keys()]

            # Checking if the lists have the same number of entries
            if len(temp_entries_dropna) != len(days_sow):
                print(df, col_name, temp_entries, days_before, temp_entries_dropna, days_sow, dict_dropna)
            assert len(temp_entries_dropna) == len(days_sow)
            
            simps_value = simps(temp_entries_dropna, days_sow)
            trapz_value = trapz(temp_entries_dropna, days_sow)
            
            df_simp.append(simps_value)
            df_trapz.append(trapz_value)
            
#             if simps_value == 0:
#                 print(temp_entries_filtered, days_before, temp_entries_dropna, days_sow)
            # Adding values to a list for reference and record to verify the results and identify problems later
            temp_samples_list.append([temp_df['Plot_ID'][sample], simps_value, trapz_value, temp_cols, temp_dates, sowing_date, DH, DM, heading_date, maturity_date, temp_entries, days_before, temp_entries_dropna, days_sow])
        temp_samples[col_name] = temp_samples_list

        # Insert the new column at the end, but before GrainYield
        df_auc_simps.insert(len(df_auc_simps.columns)-1, col_name, df_simp)
        df_auc_trapz.insert(len(df_auc_trapz.columns)-1, col_name, df_trapz)
        
    samples_record_simps[df.split('_')[0]+'_'+df.split('_')[1]] = temp_samples

    # Adding the new name of the df to a list named simp_df_all
    simp_df = df.split('_')[0]+'_'+df.split('_')[1]+'_Simps'
    trapz_df = df.split('_')[0]+'_'+df.split('_')[1]+'_Trapz'

    simp_df_all.append(simp_df)
    trapz_df_all.append(trapz_df)
    print(simp_df, df_auc_simps.shape, trapz_df, df_auc_trapz.shape)
    locals()[simp_df] = df_auc_simps.copy()
    locals()[trapz_df] = df_auc_trapz.copy()
# simp_df_all, trapz_df_all

Graminor_2019_Simps (600, 41) Graminor_2019_Trapz (600, 41)
Graminor_2020_Simps (800, 41) Graminor_2020_Trapz (800, 41)
Masbasis_2019_Simps (528, 42) Masbasis_2019_Trapz (528, 42)
Masbasis_2020_Simps (659, 44) Masbasis_2020_Trapz (659, 44)
Robot_2020_Simps (96, 43) Robot_2020_Trapz (96, 43)
Staur_2019_Simps (1328, 45) Staur_2019_Trapz (1328, 45)
Staur_2020_Simps (1504, 42) Staur_2020_Trapz (1504, 42)


## Looking for Zero values in data

In [28]:
# Masbasis_2019_Simps.info(null_counts=True)
for df in simp_df_all+trapz_df_all:
    temp_df = locals()[df][base_indices+spectral_indices].copy()
    for col in temp_df.columns.to_list():
        if (temp_df[col]==0).sum() >0:
#             print(temp_df.columns.to_list())
            print(df, col, (temp_df[col]==0).sum())

Masbasis_2019_Simps Blue 19
Masbasis_2019_Simps Green 19
Masbasis_2019_Simps Red 19
Masbasis_2019_Simps RedEdge 19
Masbasis_2019_Simps NIR 19
Masbasis_2019_Simps NDVI 19
Masbasis_2019_Simps MTCI 19
Masbasis_2019_Simps DVI 19
Masbasis_2019_Simps GDVI 19
Masbasis_2019_Simps MTCI_CI 19
Masbasis_2019_Simps EXG 19
Masbasis_2019_Simps EXGR 19
Masbasis_2019_Simps RDVI 19
Masbasis_2019_Simps TDVI 19
Masbasis_2019_Simps GNDVI 19
Masbasis_2019_Simps NDRE 19
Masbasis_2019_Simps SCCI 19
Masbasis_2019_Simps EVI 19
Masbasis_2019_Simps TVI 19
Masbasis_2019_Simps VARI 19
Masbasis_2019_Simps GARI 19
Masbasis_2019_Simps GCI 19
Masbasis_2019_Simps GLI 19
Masbasis_2019_Simps NLI 19
Masbasis_2019_Simps MNLI 19
Masbasis_2019_Simps SAVI 19
Masbasis_2019_Simps GSAVI 19
Masbasis_2019_Simps OSAVI 19
Masbasis_2019_Simps GOSAVI 19
Masbasis_2019_Simps MSAVI2 19
Masbasis_2019_Simps MSR 19
Masbasis_2019_Simps GRVI 19
Masbasis_2019_Simps WDRVI 19
Masbasis_2019_Simps SR 19
Masbasis_2020_Simps Blue 1
Masbasis_2020_Simp

In [29]:
# Plot_ID, simps_value, trapz_value, temp_cols, temp_dates, sowing_date, DH, DM, heading_date, maturity_date, temp_entries, days_before, temp_entries_dropna, days_sow])

list_problem = []
pprint(samples_record_simps['Masbasis_2019'].keys())
for keya, data in samples_record_simps['Masbasis_2019'].items():
    for x in data:
        if x [1]==0:
            temp_entries = x[10]
            days_sow = x[11]
#             simps(xx,dd)
            sowing_date = x[5]
            DH = x[6]
            DM = x[7]
            temp_dates = x[4]
            heading_date= x[8]
            maturity_date = x[9]
#             print(x[0], x[5])
#             print(x)
            list_problem.append(x[0])
            temp_entries_filtered = [x if heading_date <= x <= maturity_date else np.nan for x,y in zip(temp_dates, temp_entries)]
            print(heading_date.strftime('%d%m%y'),[x.strftime('%d%m%y') for x in temp_dates],  maturity_date.strftime('%d%m%y'))
            print('************************')

dict_keys(['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'DVI', 'GDVI', 'MTCI_CI', 'EXG', 'EXGR', 'RDVI', 'TDVI', 'GNDVI', 'NDRE', 'SCCI', 'EVI', 'TVI', 'VARI', 'GARI', 'GCI', 'GLI', 'NLI', 'MNLI', 'SAVI', 'GSAVI', 'OSAVI', 'GOSAVI', 'MSAVI2', 'MSR', 'GRVI', 'WDRVI', 'SR'])
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 060919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 120919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 070919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
******************

310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 060919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 060919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 040919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 060919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 120919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 070919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
310719 ['070819', '290719', '220719', '1

************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 070919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 050919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 120919
************************
310719 ['070819

020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 050919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 120919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 060919
************************
310719 ['070819', '290719', '220719', '1

300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 100919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 070919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
020819 ['070819', '290719', '220719', '1

300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
310719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 100919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 070919
************************
020819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 080919
************************
300719 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 140919
************************
010819 ['070819', '290719', '220719', '150719', '050719', '280619', '260619', '060619'] 090919
************************
020819 ['070819', '290719', '220719', '1

In [30]:

temp_entries_filtered = [y if heading_date <= x <= maturity_date else np.nan for x,y in zip(temp_dates, temp_entries)]
temp_entries_filtered
# 
sowing_date, DH, DM, heading_date, maturity_date, temp_dates

(datetime.date(2019, 5, 19),
 72,
 108,
 datetime.date(2019, 7, 30),
 datetime.date(2019, 9, 4),
 [datetime.date(2019, 8, 7),
  datetime.date(2019, 7, 29),
  datetime.date(2019, 7, 22),
  datetime.date(2019, 7, 15),
  datetime.date(2019, 7, 5),
  datetime.date(2019, 6, 28),
  datetime.date(2019, 6, 26),
  datetime.date(2019, 6, 6)])

In [31]:
temp_entries

[3.661298923199796,
 12.443592660173952,
 25.23857708843455,
 24.91080496083957,
 50.45753398347596,
 28.845165982512942,
 26.549500501056094,
 18.087622942260765]

In [32]:
blu_cols = [x for x in Masbasis_2019_all.columns if 'Blue' in x]

Masbasis_2019_all['Plot_ID']
Masbasis_2019_all.iloc[95:96,:][blu_cols]

# list_problem

Unnamed: 0,Blue_070819,Blue_290719,Blue_220719,Blue_150719,Blue_050719,Blue_280619,Blue_260619,Blue_060619
95,0.027306,0.016146,0.014434,0.023842,0.009863,0.015405,0.021012,0.024337


In [33]:
idx_found = []
for x in list_problem:
    idx_found.append(int(Masbasis_2019_all[Masbasis_2019_all['Plot_ID']==x].index.values))
#     print()
Masbasis_2019_all.iloc[idx_found,:][['Plot_ID']+blu_cols]
# idx_found

Unnamed: 0,Plot_ID,Blue_070819,Blue_290719,Blue_220719,Blue_150719,Blue_050719,Blue_280619,Blue_260619,Blue_060619
95,1230,0.027306,0.016146,0.014434,0.023842,0.009863,0.015405,0.021012,0.024337
104,1239,0.025600,0.015743,0.014608,0.028146,0.013328,0.019248,0.025816,0.028921
128,1263,0.023095,0.018011,0.015735,0.032513,0.016847,0.021034,0.030060,0.034576
158,1327,0.021836,0.015360,0.012360,0.024240,0.016347,0.017926,0.024122,0.024535
189,1358,0.021469,0.014657,0.013210,0.035064,0.018151,0.022021,0.030716,0.040643
...,...,...,...,...,...,...,...,...,...
403,1708,0.026613,0.020416,0.014928,0.025384,0.012373,0.016434,0.025196,0.025509
454,1759,0.025247,0.018615,0.014826,0.034066,0.014901,0.023356,0.033556,0.039553
471,1810,0.027780,0.022841,0.016675,0.028434,0.014408,0.021508,0.030092,0.029373
488,1827,0.022435,0.018452,0.014112,0.026933,0.016467,0.023138,0.031967,0.027474


In [34]:
# df['Plot_ID']==x
type(x)

numpy.int64

## Dropping Missing values

In [35]:
df_dh_dm

['Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all']

In [36]:
# Masbasis_2019_Simps.info(null_counts=True)
for df in simp_df_all:
    temp_df = locals()[df].copy()
#     print('*************', df, '**************')
#     print(locals()[df].info())

    print (df)
    print('Grain Yield data missing for ', temp_df['GrainYield'].isna().sum(), 'out of ', temp_df.shape[0])
# Graminor_2019_Simps.info()

Graminor_2019_Simps
Grain Yield data missing for  1 out of  600
Graminor_2020_Simps
Grain Yield data missing for  1 out of  800
Masbasis_2019_Simps
Grain Yield data missing for  6 out of  528
Masbasis_2020_Simps
Grain Yield data missing for  116 out of  659
Robot_2020_Simps
Grain Yield data missing for  0 out of  96
Staur_2019_Simps
Grain Yield data missing for  0 out of  1328
Staur_2020_Simps
Grain Yield data missing for  568 out of  1504


Yield data of Staur 2020 is disjoint with the banda data. There are 568 subplots whose yield is not available in the yield file and there are 10 subplots in the yield data which are not in the bands data sheets.


There are atleast 116 missing grain yield values in Msbasis 2020 yield dataset.

In [37]:
simp_df_all_dropna = []
for df in simp_df_all:
    temp_df = locals()[df].copy()
    rows = temp_df.shape[0]
#     print(temp_df.shape)
    temp_df.dropna(subset=['Blue'],inplace = True)
    print(df, 'Dropped entries', rows- temp_df.shape[0],':', rows, temp_df.shape[0])
    new_df = df+'_dropna'
    locals()[new_df] = temp_df.copy()
    simp_df_all_dropna.append(new_df)
simp_df_all_dropna

Graminor_2019_Simps Dropped entries 0 : 600 600
Graminor_2020_Simps Dropped entries 0 : 800 800
Masbasis_2019_Simps Dropped entries 0 : 528 528
Masbasis_2020_Simps Dropped entries 0 : 659 659
Robot_2020_Simps Dropped entries 0 : 96 96
Staur_2019_Simps Dropped entries 0 : 1328 1328
Staur_2020_Simps Dropped entries 0 : 1504 1504


['Graminor_2019_Simps_dropna',
 'Graminor_2020_Simps_dropna',
 'Masbasis_2019_Simps_dropna',
 'Masbasis_2020_Simps_dropna',
 'Robot_2020_Simps_dropna',
 'Staur_2019_Simps_dropna',
 'Staur_2020_Simps_dropna']

In [38]:
trapz_df_all_dropna = []
for df in trapz_df_all:
    temp_df = locals()[df].copy()
    rows = temp_df.shape[0]
#     print(temp_df.shape)
    temp_df.dropna(subset=['Blue'],inplace = True)
    print(df, 'Dropped entries', rows- temp_df.shape[0],':', rows, temp_df.shape[0])
    new_df = df+'_dropna'
    locals()[new_df] = temp_df.copy()
    trapz_df_all_dropna.append(new_df)
trapz_df_all_dropna

Graminor_2019_Trapz Dropped entries 0 : 600 600
Graminor_2020_Trapz Dropped entries 0 : 800 800
Masbasis_2019_Trapz Dropped entries 0 : 528 528
Masbasis_2020_Trapz Dropped entries 0 : 659 659
Robot_2020_Trapz Dropped entries 0 : 96 96
Staur_2019_Trapz Dropped entries 0 : 1328 1328
Staur_2020_Trapz Dropped entries 0 : 1504 1504


['Graminor_2019_Trapz_dropna',
 'Graminor_2020_Trapz_dropna',
 'Masbasis_2019_Trapz_dropna',
 'Masbasis_2020_Trapz_dropna',
 'Robot_2020_Trapz_dropna',
 'Staur_2019_Trapz_dropna',
 'Staur_2020_Trapz_dropna']

Old:  
Masbasis_2019_Trapz Dropped entries 0 : 528 528  
Masbasis_2020_Trapz Dropped entries 112 : 659 547  
Robot_2020_Trapz Dropped entries 0 : 96 96  
Staur_2019_Trapz Dropped entries 1166 : 1328 162  

In [39]:
def describe_nan(df):
    return pd.DataFrame([(i, df[df[i].isna()].shape[0],df[df[i].isna()].shape[0]/df.shape[0]) for i in df.columns], columns=['column', 'nan_counts', 'nan_rate'])
for df in simp_df_all_dropna+trapz_df_all_dropna:
    print(df)
    display(describe_nan(locals()[df][base_indices+spectral_indices+['GrainYield']]).nan_counts.sum())
#     display(describe_nan(locals()[df][base_indices+spectral_indices+['GrainYield']]))

Graminor_2019_Simps_dropna


1

Graminor_2020_Simps_dropna


1

Masbasis_2019_Simps_dropna


6

Masbasis_2020_Simps_dropna


116

Robot_2020_Simps_dropna


0

Staur_2019_Simps_dropna


0

Staur_2020_Simps_dropna


568

Graminor_2019_Trapz_dropna


1

Graminor_2020_Trapz_dropna


1

Masbasis_2019_Trapz_dropna


6

Masbasis_2020_Trapz_dropna


116

Robot_2020_Trapz_dropna


0

Staur_2019_Trapz_dropna


0

Staur_2020_Trapz_dropna


568

# Weather Data

## Correcting datetime format

### Vollebekk

In [40]:
weather_vollebekk = pd.read_csv(weather_data_vollebekk)

# Converting date time to python datetime
weather_vollebekk['Time measured'] = pd.to_datetime(weather_vollebekk['Time measured'], infer_datetime_format=True)
# weather_vollebekk['Time measured'] = weather_vollebekk['Time measured'].dt.normalize()

# Removing timezone info from datetime sice other date data is without timezone info
weather_vollebekk['Time measured'] = pd.Series(x.replace(tzinfo=None) for x in weather_vollebekk['Time measured'])

weather_vollebekk.columns

Index(['Time measured', 'Middeltemperatur i 2m høyde (TM)',
       'Maksimum lufttemperatur i 2m høyde (TX)',
       'Minimums lufttemperatur i 2m høyde (TN)', 'Nedbør (RR)',
       'Relativ luftfuktighet i 2m', 'Relativ luftfuktighet i 2m.1',
       'Bladfuktighet i 2m høyde (BT)', ' 10 min glidende middel (FF2)',
       ' vindkast (FG2)', 'Vindhastighet i 2m'],
      dtype='object')

### Staur

In [41]:
weather_staur = pd.read_csv(weather_data_staur)

# Converting date time to python datetime
weather_staur['Time measured'] = pd.to_datetime(weather_staur['Time measured'], infer_datetime_format=True)
# weather_staur['Time measured'] = weather_staur['Time measured'].dt.normalize()

# Removing timezone info from datetime sice other date data is without timezone info
weather_staur['Time measured'] = pd.Series(x.replace(tzinfo=None) for x in weather_staur['Time measured'])

weather_staur.columns

Index(['Time measured', 'Middeltemperatur i 2m høyde (TM)',
       'Maksimum lufttemperatur i 2m høyde (TX)',
       'Minimums lufttemperatur i 2m høyde (TN)', 'Nedbør (RR)',
       'Relativ luftfuktighet i 2m', 'Relativ luftfuktighet i 2m.1',
       ' vindkast (FG2)', 'Vindhastighet i 2m'],
      dtype='object')

## Translating column names

In [42]:
# Translated the column heading using google translate


weather_vollebekk.columns = ['Time measured', 'Average temperature at 2m altitude (TM)',
                             'Maximum air temperature at 2m altitude (TX)',
                             'Minimum air temperature at 2m altitude (TN)', 'Precipitation (RR)',
                             'Relative humidity in 2m - 1', 'Relative humidity in 2m - 2',
                             'Leaf moisture at 2m height (BT)', '10 min lubricant (FF2)',
                             'Wind gust (FG2)', 'Wind speed in 2m']


# Translated the column heading using google translate

weather_staur.columns = ['Time measured', 'Average temperature at 2m altitude (TM)',
                         'Maximum air temperature at 2m altitude (TX)',
                         'Minimum air temperature at 2m altitude (TN)', 'Precipitation (RR)',
                         'Relative humidity in 2m - 1', 'Relative humidity in 2m - 2',
                         'Wind gust (FG2)', 'Wind speed in 2m']

## Craeting a list of min and max date in every field

In [43]:
max_min_dates = {}
for df in all_df:
    temp_df = locals()[df].copy()
    dates = [x.split('_')[1] for x in temp_df.columns if 'Blue' in x]
    df_name_temp = df.split('_')[0]+'_'+df.split('_')[1]
    sowing_date_temp = datetime.datetime.strptime(sowing_dict[df_name_temp], '%d%m%y')
    min_date_temp = min([datetime.datetime.strptime(x, '%d%m%y') for x in dates ])
    max_date_temp = max([datetime.datetime.strptime(x, '%d%m%y') for x in dates ])
    max_min_dates[df] = [sowing_date_temp, min_date_temp, max_date_temp]
# max_min_dates

## Calculating average DH and DM

In [44]:
dict_avg_dh_dm

{'Masbasis_2019_all': [68.18939393939394, 108.64393939393939],
 'Masbasis_2020_all_lodg': [66.28983308042488, 87.94881170018282],
 'Robot_2020_all': [61.09375, 110.84375],
 'Staur_2019_all': [48.53333333333333, 101.25757575757575]}

In [45]:
print(f'Average Days2Heading is {mean_dh}')
print(f'Average Days2Maturity is {mean_dm}')

Average Days2Heading is 61.026577588288035
Average Days2Maturity is 102.17351921292449


## Filtering weather data

In [46]:
# Weather data for days from sowing date. Largerst of the average number of Days2Maturity 
# is a good measaure to use
# Could have used max_date(last date availabel for the field) but that date does not 
# correspond to the actual crop maturity.
# So using approxipame maturity time is a better measure of the affect of weather on yield
days_delta = max(list_dm)

weather_dfs = []
weathers_processed_df = []
for df, dates in max_min_dates.items():
    df_weather_temp = pd.DataFrame()

    sowing_date_temp = dates[0]
    min_date_temp = dates[1]
    max_date_temp = dates[2]
    
    if 'Staur' in df:
        # Filtering the weather date from sowing_date to max_date the data is available for
        temp_weather = weather_staur.loc[(weather_staur['Time measured'] >= sowing_date_temp) &\
                              (weather_staur['Time measured'] <= sowing_date_temp + datetime.timedelta(days=days_delta))]
        # Filling the missing values with the average value of the area for the given complete weather data
        temp_weather.fillna(weather_staur.mean(), inplace = True)

    else:
        # Filtering the weather date from sowing_date to max_date the data is available for
        temp_weather = weather_vollebekk.loc[(weather_vollebekk['Time measured'] >= sowing_date_temp) &\
                              (weather_vollebekk['Time measured'] <= sowing_date_temp + datetime.timedelta(days=days_delta))]
        # Filling the missing values with the average value of the area for the given complete weather data
        temp_weather.fillna(weather_vollebekk.mean(), inplace = True)

#     print(df, sowing_date_temp.date(), min_date_temp.date(), max_date_temp.date())
#     print(temp_weather.shape)

#     # See info to find hoe many missing values and in which column
#     display(temp_weather.info())

    # Filling themissing values with the average of the column
    # Applying Only on columns with NaN values
    for i in temp_weather.columns[temp_weather.isnull().any(axis=0)]:
        temp_weather[i].fillna(temp_weather[i].mean(),inplace=True)
    # Drop the time measures column
    temp_weather.drop(['Time measured'], axis=1, inplace=True)
    
    df_weat_temp = df.split('_')[0]+'_'+df.split('_')[1]+'_weather_all'
    locals()[df_name_temp] = temp_weather.copy()
    weather_dfs.append(df_name_temp)
    
    
    
    # Aggregating the weather data using several statistical methods
    mean_df = temp_weather.mean().to_frame().transpose().add_prefix('MEAN ')
    median_df = temp_weather.median().to_frame().transpose().add_prefix('MEDIAN ')
    # Different for mode since mode returns a df, instead of series object
    mode_df = temp_weather.mode().transpose().iloc[:,0].to_frame().transpose().add_prefix('MODE ')
    sum_df = temp_weather.sum().to_frame().transpose().add_prefix('SUM ')
    min_df = temp_weather.min().to_frame().transpose().add_prefix('MIN ')
    max_df = temp_weather.max().to_frame().transpose().add_prefix('MAX ')
    std_df = temp_weather.std().to_frame().transpose().add_prefix('STD_DEV ')
    # Reset index in quantile since it takes quantile of index as well
    quantile_25 = temp_weather.quantile(q=0.25).to_frame().transpose().add_prefix('QUANTILE_25 ').reset_index(drop=True)
    quantile_50 = temp_weather.quantile(q=0.5).to_frame().transpose().add_prefix('QUANTILE_50 ').reset_index(drop=True)
    quantile_75 = temp_weather.quantile(q=0.75).to_frame().transpose().add_prefix('QUANTILE_75 ').reset_index(drop=True)

    single_row_df = pd.concat([mean_df, median_df, mode_df, sum_df, min_df, max_df, std_df, quantile_25, quantile_50, quantile_75], axis=1)
    
    if single_row_df.isna().sum().sum() > 1:
        print(df)
        print(single_row_df.isna().sum())
        raise ValueError
#     display(single_row_df)

    df_processed_temp = df.split('_')[0]+'_'+df.split('_')[1]+'_weather_agg'
    locals()[df_processed_temp] = single_row_df.copy()
    weathers_processed_df.append(df_processed_temp)
weathers_processed_df

  temp_weather.fillna(weather_vollebekk.mean(), inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  temp_weather.fillna(weather_staur.mean(), inplace = True)


['Graminor_2019_weather_agg',
 'Graminor_2020_weather_agg',
 'Masbasis_2019_weather_agg',
 'Masbasis_2020_weather_agg',
 'Robot_2020_weather_agg',
 'Staur_2019_weather_agg',
 'Staur_2020_weather_agg']

In [47]:
Masbasis_2019_weather_agg.shape

(1, 100)

In [48]:
# Staur_2020_weather_agg.shape

In [49]:
vollebekk_fields_weather = [x for x in weathers_processed_df if 'Staur' not  in x]
staur_fields_weather = [x for x in weathers_processed_df if 'Staur' in x]

weather_cols_vollebekk = locals()[vollebekk_fields_weather[0]].columns.tolist()
weather_cols_staur = locals()[staur_fields_weather[0]].columns.tolist()

## Exporting Weather Indices for later use

In [50]:
# Make sure the folder/dir is there. If not, create one
os.makedirs(main_path, exist_ok=True)
import json
a_file = open(main_path+'vollebekk_weather_columns.json', "w")
json.dump(weather_cols_vollebekk, a_file)
a_file.close()

# a_file = open("Data\vollebekk_weather_columns.json", "r")
# output_str = a_file.read()

# # The file is imported as string
# # Converting it to python format
# weather_cols_vollebekk = json.loads(output_str)
# a_file.close()
# print(weather_cols_vollebekk)

In [51]:
# Make sure the folder/dir is there. If not, create one
os.makedirs(main_path, exist_ok=True)
import json
a_file = open(main_path+'staur_weather_columns.json', "w")
json.dump(weather_cols_staur, a_file)
a_file.close()

# a_file = open("Data\staur_weather_columns.json", "r")
# output_str = a_file.read()

# # The file is imported as string
# # Converting it to python format
# weather_cols_staur = json.loads(output_str)
# a_file.close()
# print(weather_cols_staur)

## Adding weather data to the simps integrated df

In [52]:
df_to_export = []
for df in simp_df_all_dropna+trapz_df_all_dropna:
    temp_df = locals()[df].copy()
    
    field_name = df.split('_')[0]+'_'+df.split('_')[1]
    integration_type = df.split('_')[2]
    
    single_row_name = field_name+'_weather_agg'
    single_row_df = locals()[single_row_name]
    # Replicating the single_row data multiple times to make the df equal to the number of rows in the original df
    rows_df = temp_df.shape[0] 
    new_df = pd.DataFrame(np.repeat(single_row_df.values, rows_df, axis=0), columns=single_row_df.columns)
    
    pd.concat([temp_df, new_df], axis=1)
    merged_df = pd.concat([temp_df, new_df], axis=1)
    
    locals()[field_name+'_'+integration_type] = merged_df.copy()
    df_to_export.append(field_name+'_'+integration_type)

In [53]:
Graminor_2019_Simps.columns

Index(['Plot_ID', 'GrainYield', 'Block', 'iBlock', 'Entry', 'Name', 'Blue',
       'Green', 'Red', 'RedEdge',
       ...
       'QUANTILE_75 Average temperature at 2m altitude (TM)',
       'QUANTILE_75 Maximum air temperature at 2m altitude (TX)',
       'QUANTILE_75 Minimum air temperature at 2m altitude (TN)',
       'QUANTILE_75 Precipitation (RR)',
       'QUANTILE_75 Relative humidity in 2m - 1',
       'QUANTILE_75 Relative humidity in 2m - 2',
       'QUANTILE_75 Leaf moisture at 2m height (BT)',
       'QUANTILE_75 10 min lubricant (FF2)', 'QUANTILE_75 Wind gust (FG2)',
       'QUANTILE_75 Wind speed in 2m'],
      dtype='object', length=141)

## Summary of processed data

In [54]:
for df in df_to_export:
    temp_df = locals()[df].copy()
    temp_cols = temp_df.columns.tolist()
    # Bands, Indices, Plot_ID and GrainYield columns only
    chk_cols = [x for x in temp_cols if x not in weather_cols_staur if x not in yield_cols]+['Plot_ID', 'GrainYield']
    nan_found = False
    for col in chk_cols:
        if temp_df[col].isna().sum() > 0:
            nan_found = True
            print(df,': Missing', col,':', temp_df[col].isna().sum())
    if not nan_found:
        print(f'No nan values found in any column in {df}')
    
    print('*************************')

Graminor_2019_Simps : Missing GrainYield : 1
*************************
Graminor_2020_Simps : Missing GrainYield : 1
*************************
Masbasis_2019_Simps : Missing GrainYield : 6
*************************
Masbasis_2020_Simps : Missing GrainYield : 116
*************************
No nan values found in any column in Robot_2020_Simps
*************************
No nan values found in any column in Staur_2019_Simps
*************************
Staur_2020_Simps : Missing GrainYield : 568
*************************
Graminor_2019_Trapz : Missing GrainYield : 1
*************************
Graminor_2020_Trapz : Missing GrainYield : 1
*************************
Masbasis_2019_Trapz : Missing GrainYield : 6
*************************
Masbasis_2020_Trapz : Missing GrainYield : 116
*************************
No nan values found in any column in Robot_2020_Trapz
*************************
No nan values found in any column in Staur_2019_Trapz
*************************
Staur_2020_Trapz : Missing GrainYield 

In [55]:
df_to_export

['Graminor_2019_Simps',
 'Graminor_2020_Simps',
 'Masbasis_2019_Simps',
 'Masbasis_2020_Simps',
 'Robot_2020_Simps',
 'Staur_2019_Simps',
 'Staur_2020_Simps',
 'Graminor_2019_Trapz',
 'Graminor_2020_Trapz',
 'Masbasis_2019_Trapz',
 'Masbasis_2020_Trapz',
 'Robot_2020_Trapz',
 'Staur_2019_Trapz',
 'Staur_2020_Trapz']

# Creating Environment Variable to differentiate Vollebek and Staur

In [56]:
for df in df_to_export:
    temp_df = locals()[df].copy()
    if 'Staur' in df:
        temp_df['Staur_Env'] = int(1)
        temp_df['Vollebekk_Env'] = int(0)
    else:
        temp_df['Staur_Env'] = int(0)
        temp_df['Vollebekk_Env'] = int(1)
    locals()[df] = temp_df.copy()

In [57]:
Graminor_2020_Trapz

Unnamed: 0,Plot_ID,GrainYield,Block,iBlock,Entry,Name,Blue,Green,Red,RedEdge,NIR,NDVI,MTCI,DVI,GDVI,MTCI_CI,EXG,EXGR,RDVI,TDVI,GNDVI,NDRE,SCCI,EVI,TVI,VARI,GARI,GCI,GLI,NLI,MNLI,SAVI,GSAVI,OSAVI,GOSAVI,MSAVI2,MSR,GRVI,WDRVI,SR,Pedigree,MEAN Average temperature at 2m altitude (TM),MEAN Maximum air temperature at 2m altitude (TX),MEAN Minimum air temperature at 2m altitude (TN),MEAN Precipitation (RR),MEAN Relative humidity in 2m - 1,MEAN Relative humidity in 2m - 2,MEAN Leaf moisture at 2m height (BT),MEAN 10 min lubricant (FF2),MEAN Wind gust (FG2),...,MAX Minimum air temperature at 2m altitude (TN),MAX Precipitation (RR),MAX Relative humidity in 2m - 1,MAX Relative humidity in 2m - 2,MAX Leaf moisture at 2m height (BT),MAX 10 min lubricant (FF2),MAX Wind gust (FG2),MAX Wind speed in 2m,STD_DEV Average temperature at 2m altitude (TM),STD_DEV Maximum air temperature at 2m altitude (TX),STD_DEV Minimum air temperature at 2m altitude (TN),STD_DEV Precipitation (RR),STD_DEV Relative humidity in 2m - 1,STD_DEV Relative humidity in 2m - 2,STD_DEV Leaf moisture at 2m height (BT),STD_DEV 10 min lubricant (FF2),STD_DEV Wind gust (FG2),STD_DEV Wind speed in 2m,QUANTILE_25 Average temperature at 2m altitude (TM),QUANTILE_25 Maximum air temperature at 2m altitude (TX),QUANTILE_25 Minimum air temperature at 2m altitude (TN),QUANTILE_25 Precipitation (RR),QUANTILE_25 Relative humidity in 2m - 1,QUANTILE_25 Relative humidity in 2m - 2,QUANTILE_25 Leaf moisture at 2m height (BT),QUANTILE_25 10 min lubricant (FF2),QUANTILE_25 Wind gust (FG2),QUANTILE_25 Wind speed in 2m,QUANTILE_50 Average temperature at 2m altitude (TM),QUANTILE_50 Maximum air temperature at 2m altitude (TX),QUANTILE_50 Minimum air temperature at 2m altitude (TN),QUANTILE_50 Precipitation (RR),QUANTILE_50 Relative humidity in 2m - 1,QUANTILE_50 Relative humidity in 2m - 2,QUANTILE_50 Leaf moisture at 2m height (BT),QUANTILE_50 10 min lubricant (FF2),QUANTILE_50 Wind gust (FG2),QUANTILE_50 Wind speed in 2m,QUANTILE_75 Average temperature at 2m altitude (TM),QUANTILE_75 Maximum air temperature at 2m altitude (TX),QUANTILE_75 Minimum air temperature at 2m altitude (TN),QUANTILE_75 Precipitation (RR),QUANTILE_75 Relative humidity in 2m - 1,QUANTILE_75 Relative humidity in 2m - 2,QUANTILE_75 Leaf moisture at 2m height (BT),QUANTILE_75 10 min lubricant (FF2),QUANTILE_75 Wind gust (FG2),QUANTILE_75 Wind speed in 2m,Staur_Env,Vollebekk_Env
0,101,654.708159,1,1,1,Zebra,0.357189,0.708058,0.984795,2.430557,7.496881,12.572435,58.557849,6.512086,6.788823,58.557849,0.074131,-0.596524,9.076398,11.169290,13.967147,8.393775,10.821504,-19.348822,379.655671,0.710513,10.683550,158.460087,-2.460994,8.371224,4.663654,9.654279,10.269130,9.655557,10.462211,26.579387,276.873029,175.460087,0.660661,282.990792,Ralle/Dragon,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
1,102,709.595446,1,1,19,GN18666,0.344390,0.746604,0.986895,2.390526,7.615864,12.365737,60.106876,6.628969,6.869260,60.106876,0.161924,-0.473125,9.093451,11.253830,13.853989,8.466149,11.068005,-20.306163,388.126495,0.426309,11.037205,152.906807,-2.309750,7.859990,4.802248,9.679191,10.251524,9.596091,10.398152,26.749668,278.593065,169.906807,0.302249,284.877001,GN04528/GN03509,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
2,103,707.100569,1,1,11,GN15590,0.356473,0.752264,0.995796,2.344988,7.229487,11.938677,55.034116,6.233690,6.477222,55.034116,0.152260,-0.489590,8.631207,10.584490,13.436760,7.988492,10.848835,-19.273311,364.280159,0.300482,10.823468,140.660170,-2.333807,6.492268,4.283645,9.156730,9.736268,9.147707,9.958952,25.741403,263.308171,157.660170,-0.432195,269.948962,Demonstrant/SW51114,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
3,104,724.958634,1,1,5,Mirakel,0.435815,0.883467,1.153322,2.752884,7.641620,11.831737,50.754890,6.488298,6.758153,50.754890,0.177797,-0.553387,8.788554,10.903625,13.133611,7.517882,10.059720,-20.355972,378.503650,1.089963,11.080420,151.488482,-2.108609,6.974546,4.489474,9.339851,9.930690,9.204469,9.971205,26.442453,307.957912,168.488482,-0.069058,314.608892,SW38337/NK98533//NK98535,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
4,105,740.978368,1,1,24,GN18751,0.430810,1.005594,1.062782,3.008847,9.132592,12.956361,54.878328,8.069809,8.126997,54.878328,0.517597,0.035296,10.314404,13.103610,13.600388,8.294736,10.383674,-25.556727,481.901037,2.293918,11.577263,149.919642,-1.471260,9.973359,6.673209,10.967576,11.147025,10.456513,10.765142,29.686309,305.940892,166.919642,1.151561,311.740343,QUARNA/GN03531,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,896,663.899810,2,10,23,GN20696,-1.142463,-2.812421,-1.026365,-6.107954,-22.535456,-23.562207,-77.449385,-21.509091,-19.723035,-77.449385,-3.456015,-4.831525,-21.710500,-25.477091,-18.711007,-13.960441,-14.780271,108.062768,-1361.987713,-16.836558,-17.967621,-164.783556,-2.530717,-23.976822,-18.848564,-21.284327,-18.221899,-19.194051,-15.863818,-55.451528,-436.028628,-189.783556,-8.204379,-441.058704,Saar/2*Avle,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
796,897,722.332448,2,10,20,GN20693,-1.176186,-2.769116,-1.337967,-5.972457,-21.741130,-21.874550,-80.045897,-20.403164,-18.972015,-80.045897,-3.024079,-3.920042,-20.634375,-24.704812,-18.183498,-13.302102,-14.865297,91.003022,-1281.435797,-13.526451,-18.960994,-166.719680,-1.285292,-20.776402,-17.865826,-20.415924,-17.983221,-18.177217,-15.581737,-54.012397,-405.156055,-191.719680,-5.184949,-411.568731,ONPMSYDER 5/GN08588,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
797,898,782.997343,2,10,6,GN20679,-1.264409,-2.927311,-1.254880,-6.076431,-22.796005,-22.414157,-81.481755,-21.541125,-19.868694,-81.481755,-3.335332,-4.505811,-21.293259,-25.062794,-18.036576,-13.751230,-15.182898,99.714485,-1359.364713,-15.546076,-18.953399,-156.422154,-1.829875,-21.777077,-18.529335,-20.818410,-17.994052,-18.530202,-15.490224,-55.247198,-404.327994,-181.422154,-6.254764,-410.299373,SW71139/GN06600,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1
798,899,712.352941,2,10,9,GN20682,-1.095302,-2.852831,-1.179844,-6.577272,-24.908074,-23.074687,-80.804596,-23.728230,-22.055243,-80.804596,-3.430516,-4.631565,-22.742353,-26.608663,-19.070187,-14.048091,-15.142762,176.331984,-1490.613275,-15.031171,-21.456872,-174.613269,-2.166557,-23.725325,-20.109137,-22.085015,-19.395115,-19.402941,-16.572476,-58.544844,-436.892297,-199.613269,-7.740546,-442.282497,GN07574/SW71139,12.990991,19.154054,6.290991,2.777477,64.209009,90.356757,1.83964,19.667568,12.218018,...,16.7,42.4,93.9,99.7,3.8,30.1,18.3,19.2,4.790607,5.415597,5.258682,6.632615,14.891081,8.833558,0.730414,6.88387,4.135922,4.421542,9.6,15.7,1.9,0.0,50.55,86.25,1.4,15.35,7.95,8.25,13.3,19.3,6.9,0.0,63.9,93.5,1.7,21.3,14.0,14.4,15.8,22.05,10.8,1.3,73.9,96.9,2.2,24.5,15.4,16.55,0,1


# Temp: Exporting data for modeling

In [58]:
os.makedirs(export_path, exist_ok=True)
for df in df_to_export:
    locals()[df].to_csv(export_path+df+'.csv', index=False)

# END OF SECTION

In [129]:
ERROR

NameError: name 'ERROR' is not defined

# Import Genomics Data

In [None]:
genomics_data

## Importing Genomics Data

In [None]:
# If the dataset had Days 2 heading and days to maturity columns then create the
# following dictionary with the respective sowing dates of each field as value

for df in all_df:
    temp_df = locals()[df].copy()
    field_temp = df.split('_')[0]+'_'+df.split('_')[1]
    if 'Line' in temp_df.columns:
        print(df)
#         all_df_dates_filtered[df] = sowing_dict[field_temp]
# all_df_dates_filtered

In [None]:
# Importing Yield data with line information

In [None]:
# Vollebekk 2019: Graminor_2019_x_19TvPhenores_x_Vollebekk_res
# Masbasis 2020: Masbasis_x_20BMLGI1_2020_tm_x_data
# Robot 2020: Robot_x_ROBOT_2020_x_raw
# Masbasis 2019: Masbasis_2019_x_Field_data_2019

In [None]:
a_file = open(main_path+'yield_df.json', "r")
output_str = a_file.read()
# The file is imported as string

# Converting it to dictionary
output_dict = json.loads(output_str)
a_file.close()

pprint(output_dict)

### Checking number of unique cultivars in the field

In [None]:
# plots_data = pd.read_excel(files_with_address[0],engine='openpyxl')
# # Pandas converts 'NA' string to NaN. Need to change those to 
# # some string to get a count as NaNs are not counted as unique values

# plots_data.Name.fillna('-', inplace=True)
# plots_data.CodeName.fillna('-', inplace=True)

# # Creating a new column as multiple plots were named 'NA' but the 
# # CodeName was different for each one of them
# plots_data['NameCode'] = plots_data.Name+plots_data.CodeName

# plots_data
# len(plots_data.NameCode.unique())
# plots_data.NameCode.value_counts()
# # plots_data.NameCode.value_counts().sum()
# # plots_data

# ToDo: Dropping NAN

## Finding NAN values
### ToDo: Test: Raise error if missing values found

In [None]:
# Finding number of missing values in each dataframe
df_with_nan = []
missing_values = False
for df in all_df:
    if locals()[df].isna().sum().sum() > 0:
        print(f'Total missing values in {df} are {locals()[df].isna().sum().sum()}')
        missing_values = True
        df_with_nan.append(df)
#     if len(df_with_nan) > 0:
#         raise ValueError
if not missing_values:
    print('No missing value found in any dataframe')

In [None]:
Graminor_2019_all.isnull().sum().sort_values()

In [None]:
df_with_nan

In [None]:
# Finding which column has NAN values
for df in df_with_nan:
    print(f'{df}:\n {locals()[df].shape[1]-locals()[df].dropna(axis=1).shape[1]} columns or {locals()[df].shape[0]-locals()[df].dropna().shape[0]} rows to be dropped,')

## ToDo: Automate: Drop rows with missing values in df_with_nan

In [None]:
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} Before dropping')
# Graminor_eastwest_020719_NIR_half_missing.dropna(inplace=True)
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} After dropping')


## ORRR

## ToDo: Droppping df with Nan from the all_df_std

In [130]:
print(f'Number of items in all_df is {len(all_df)}')

Number of items in all_df is 7


In [131]:
# for df in df_with_nan:
#     all_df.remove(df)

###  ToDo: Update field_year_dict and sorted_field_year_dict after dropping the dataset

In [132]:
print(f'Number of items in all_df now is {len(all_df)}')

Number of items in all_df now is 7


# Data Trends

## Normal Distribution of data

ToDo:  
see the distribution of data if it is normal  
else make transpose to make it normal  
dist in Gausion function   
in each field  
what if the data is normal dist?  
the use some transpose to box pox   
try diff funct to see which one iis able to make data normal  
make heat map of whole if not normal  
see which parts are not normal and exculde them  
ls_means in R to make the normalisation/transpose  
pearson corr bw yield and indices for diff dates  


In [133]:
x_labels

NameError: name 'x_labels' is not defined

### Yeo-Johnson Transformation

In [134]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        

        for field_sample, dates in sorted_field_year_dict_yield.items():
            x_labels = []
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
                x_label = date.strftime('%d-%m-%y')+':'+str(len(locals()[field_df][col]))
                
                x_labels.append(x_label)
                x_labels= list(set(x_labels))
            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='yeo-johnson', standardize=False)

            temp_arr = pt.fit_transform(temp_df)
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]
            
            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=-35)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_yeo-johnson')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### Box-Cox Transformation

In [None]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        for field_sample, dates in sorted_field_year_dict_yield.items():
            
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
            x_labels = temp_df.columns.tolist()

            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='box-cox', standardize=False)

            # Taking absolute values of the dataframe(avoiding negative values)
            temp_arr = pt.fit_transform(temp_df.abs())
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]

            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=90)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_box-cox')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



### ToDo: Identify Dates and index with problems

### Ecxclude the problematic data/dates
or
### Take average values where the problematic data is

Take average of data for date 20200708 and 20200624  
Masbasis  
Cleanup  
Remove dates which have drop  

## ToDo: Remove outliers

### Find AUC for all dates of one field
See if it covers tha gaps under the dates,i.e.

Since data points are different  
Flying time is different  
Cover the gaps between the dates  

Since the data collection is not uniform throughout the year so AUC will give a single value instead of multiple values for one field year which will be representative of all the dates 

#### Option 1: Use Scipy

In [135]:
import scipy
scipy.__version__

'1.6.1'

In [136]:
from scipy import integrate
from scipy.integrate import simps

In [137]:
from scipy.integrate import simpson

In [138]:
x = np.arange(0, 10)
y = np.arange(0, 10)


In [139]:
# integrate.simpson(y, x)
integrate.simps(y, x)

40.5

In [140]:
y = np.power(x, 3)
y

array([  0,   1,   8,  27,  64, 125, 216, 343, 512, 729], dtype=int32)

In [141]:
integrate.simpson(y, x)
# integrate.simps(y, x)


1642.5

In [142]:
integrate.quad(lambda x: x**3, 0, 9)[0]

1640.25

In [143]:
integrate.simpson(y, x, even='first')
# integrate.simps(y, x, even='first')

1644.5

#### Option 2

In [144]:
data
# plot: Plot ID
# x: Number of days after sowing or actual date
# y: Value of the index


[[1101,
  -42.16505560740016,
  -42.16505560740016,
  ['SR_070819',
   'SR_290719',
   'SR_220719',
   'SR_150719',
   'SR_050719',
   'SR_280619',
   'SR_260619',
   'SR_060619'],
  [datetime.date(2019, 8, 7),
   datetime.date(2019, 7, 29),
   datetime.date(2019, 7, 22),
   datetime.date(2019, 7, 15),
   datetime.date(2019, 7, 5),
   datetime.date(2019, 6, 28),
   datetime.date(2019, 6, 26),
   datetime.date(2019, 6, 6)],
  datetime.date(2019, 5, 19),
  66,
  107,
  datetime.date(2019, 7, 24),
  datetime.date(2019, 9, 3),
  [2.810675695103792,
   6.5593366620962446,
   18.18864257606032,
   23.358257610937272,
   31.396227438670703,
   25.55585566110504,
   25.900747439808864,
   13.153452166550789],
  [80, 71, 64, 57, 47, 40, 38, 18],
  [2.810675695103792, 6.5593366620962446],
  [80, 71]],
 [1102,
  -55.52096155158504,
  -55.52096155158504,
  ['SR_070819',
   'SR_290719',
   'SR_220719',
   'SR_150719',
   'SR_050719',
   'SR_280619',
   'SR_260619',
   'SR_060619'],
  [datetime.date

In [145]:
# x: Days from sowing to data collection
# May 5 2019 Masbasis and Graminor
# Robot: 

data={'plot':['1','1','2','2','3','3'],'x':['5','6','7','8','9','10'],'y':['0.9','0.8','0.7','0.6','0.5','0.4'] }

ACC=[]
A=pd.DataFrame(data, columns=['plot','x','y'])
AA=0

for item in range(len(A)-1):
    if A['plot'][item]== A['plot'][item+1]:
        Ans=(float((A['y'][item]))+float((A['y'][item+1])))*((float((A['x'][item+1]))-float((A['x'][item]))))/2
        AA+=Ans
        print(AA)
        ACC.append(AA)

0.8500000000000001
1.5
1.95


### Alternative

In [146]:
df1=Data.set_index(['Plot'])
ACC=[]

for item in Numbers_final:
    df2=df1[df1.index==item]
    df2=df2.filter(['Blue', 'Green', 'Red', 'RedEdge', 'NIR','NDVI', 'MTCI', 'EVI', 'DVI', 'RVI', 'VARI', 'EXG', 'EXGR', 'GLI', 'GNDVI', 'GVI','Time','timepoint'], axis=1)
    df2=df2.sort_values(by='timepoint')
    df3=df2.reset_index()

AA=0
for j in range(0,3):
    Ans=(float((df3['GVI'][j]))+float((df3['GVI'][j+1])))*((float((df3['timepoint'][j+1]))-float((df3['timepoint'][j]))))/2
    AA+=Ans

    print(AA)
    ACC.append(AA)



DA=pd.DataFrame(ACC)
DD=pd.DataFrame(Numbers_final)
DDA=pd.concat([DD, DA], axis=1)
DDA.to_excel('Staur_Accumulative_GVI_2019.xlsx')

NameError: name 'Data' is not defined

### Time series data vs the AUC

# ToDo: Model Training


Make model for one year at a time and try to predict yield of another field  

TODO: Train on Masbasis 2019 an 2020  
Test on Staur  

Use data until august for yield prediction since it is most relavant  
Use all data for predicting date to maturity  

Data Collection:  
Data collection usually starts after heading  
2019 has the data before hading as well. To use that, dont use dates before heading  

NDVI is resistant to shadows  

DAT390 Report: Do the report with Robot Data only  

TODO: Use AUC for each index for prediction  

TODO:   
Time series data vs the AUC  