# Weather and Genomics Data

Note: Half datasets, with separate files for east and west subplots have been merged manually in excel

In [1]:
%%time

import os
import math
import datetime
import numpy as np
import pandas as pd
from copy import copy

# Dictionaries
import json
from pprint import pprint

# Iterate in loops
from itertools import zip_longest

# Simpsons integration
from numpy import trapz
from scipy.integrate import simps

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# To display df nicely in loops
from IPython.display import display 
# display(df1.head()) 
# display(df2.head())

# Display rows and columns Pandas
pd.options.display.max_columns = 100
pd.set_option('display.max_rows',100)

Wall time: 1.78 s


In [2]:
# Prints the current working directory
os.getcwd()
# os.listdir()

'C:\\Users\\fahad\\MegaSync\\NMBU\\GitHub\\vPheno'

## Finding Username folder to make general path for multi PC use

In [3]:
username = str(os.getcwd()).split('\\')[2]
user_path = r'C:/Users/'+username+'/'
username, user_path

('fahad', 'C:/Users/fahad/')

## Importing Data

In [4]:
main_path = r'./Data/'
path = r'./Data/renamed_merged/'
export_path = './Data/results/'

# Create export_path folder if not exists already
os.makedirs(path, exist_ok=True)
os.makedirs(export_path, exist_ok=True)

os.listdir(path)

['Graminor_2019_all.csv',
 'Graminor_2020_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all_lodg.csv',
 'Robot_2020_all.csv',
 'Staur_2019_all.csv',
 'Staur_2020_all_lodg.csv']

## Data Preparation
### Creating list of complete files

In [5]:
# Get the list of all files in directory tree at given path

files_with_address = []
files_list = []

for (dirpath, dirnames, filenames) in os.walk(path):
    files_with_address += [os.path.join(dirpath, file) for file in filenames]
    files_list.extend(filenames)
    
print(len(files_with_address), 'files found in the directory')
# files_with_address
files_list

7 files found in the directory


['Graminor_2019_all.csv',
 'Graminor_2020_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all_lodg.csv',
 'Robot_2020_all.csv',
 'Staur_2019_all.csv',
 'Staur_2020_all_lodg.csv']

## Data Checking/control

### Check for duplicate filenames

In [6]:
print('Total number of files are :', len(files_list))

print('Number of unique file names are:', len(set(files_list)))

print('There is/are', len(files_list) - len(set(files_list)),'duplicate file name/names.')
if len(files_list) - len(set(files_list)) > 0:
    raise NameError

Total number of files are : 7
Number of unique file names are: 7
There is/are 0 duplicate file name/names.


## Importing data files to Pandas

In [7]:
   
%%time

all_df = []
for data in files_with_address:
    file_name = os.path.splitext(os.path.basename(data))[0]

    # Replce all invalid characters in the name
    file_name = file_name.replace(" ", "_")
    file_name = file_name.replace("-", "_")
    file_name = file_name.replace(")", "")
    file_name = file_name.replace("(", "")
    df_name = file_name.replace(".", "")
    # Test: Check if the same date is already present in the current dict key
    if df_name in all_df:
        print(f'A file with the same name {df_name} has already been imported. \n Please check if there is duplication of data.')
        raise NameError
    all_df.append(df_name)

    locals()[df_name] = pd.read_csv(data, index_col=False)
    print(df_name, '=====', locals()[df_name].shape)
# all_df

Graminor_2019_all ===== (600, 378)
Graminor_2020_all ===== (400, 378)
Masbasis_2019_all ===== (528, 278)
Masbasis_2020_all_lodg ===== (659, 416)
Robot_2020_all ===== (96, 484)
Staur_2019_all ===== (1328, 346)
Staur_2020_all_lodg ===== (1504, 209)
Wall time: 492 ms


In [8]:
print(f'Total imported {len(all_df)}')
all_df

Total imported 7


['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

# Finding yield columns

In [9]:
# ToDo: Add check for duplicate columns in the df

general_col_names = ['Plot_ID', 'Blue', 'Green', 'Red', 'RedEdge', 'NIR']

base_indices = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR']

spectral_indices = ['NDVI', 'MTCI', 'DVI', 'GDVI', 'MTCI_CI', 'EXG', 'EXGR', 'RDVI',
                    'TDVI', 'GNDVI', 'NDRE', 'SCCI', 'EVI', 'TVI', 'VARI', 'GARI',
                    'GCI', 'GLI', 'NLI', 'MNLI', 'SAVI', 'GSAVI', 'OSAVI', 'GOSAVI',
                    'MSAVI2', 'MSR', 'GRVI', 'WDRVI', 'SR']
# list_agg_df
yield_cols = ['GrainYield', 'Name', 'CodeName', 'Pedigree', 'Line', 'Heading_Date',
              'Maturity_Date', 'Days2Heading', 'Days2Maturity', 'Lodging']

id_cols_new = ['Plot_ID']

# Counter for location of column in columns list

# Dict for saving the name and location of the yield column/s
loc_yield_cols = {}
for df in all_df:
    loc = 0
    for cols in locals()[df].columns.tolist():
        for y_col in yield_cols:
            if not cols.find(y_col):
                loc_yield_cols[cols+'_'+df] = loc
                print(f'\"{cols}\" column in {df} is the yield column\n as it contains the text \"{y_col}\". It is located at location {loc}')
        loc += 1

    yield_cols_found = list(loc_yield_cols.keys())
    target_cols=yield_cols_found[0]
loc_yield_cols

"GrainYield" column in Graminor_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Name" column in Graminor_2019_all is the yield column
 as it contains the text "Name". It is located at location 7
"Pedigree" column in Graminor_2019_all is the yield column
 as it contains the text "Pedigree". It is located at location 8
"GrainYield" column in Graminor_2020_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Name" column in Graminor_2020_all is the yield column
 as it contains the text "Name". It is located at location 7
"Pedigree" column in Graminor_2020_all is the yield column
 as it contains the text "Pedigree". It is located at location 8
"GrainYield" column in Masbasis_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Name" column in Masbasis_2019_all is the yield column
 as it contains the text "Name". It is located at location 7
"Line" column in Mas

{'GrainYield_Graminor_2019_all': 6,
 'Name_Graminor_2019_all': 7,
 'Pedigree_Graminor_2019_all': 8,
 'GrainYield_Graminor_2020_all': 6,
 'Name_Graminor_2020_all': 7,
 'Pedigree_Graminor_2020_all': 8,
 'GrainYield_Masbasis_2019_all': 6,
 'Name_Masbasis_2019_all': 7,
 'Line_Masbasis_2019_all': 8,
 'Days2Heading_Masbasis_2019_all': 9,
 'Days2Maturity_Masbasis_2019_all': 10,
 'GrainYield_Masbasis_2020_all_lodg': 6,
 'Name_Masbasis_2020_all_lodg': 7,
 'Line_Masbasis_2020_all_lodg': 8,
 'Maturity_Date_Masbasis_2020_all_lodg': 9,
 'Days2Heading_Masbasis_2020_all_lodg': 10,
 'Days2Maturity_Masbasis_2020_all_lodg': 11,
 'Lodging_Masbasis_2020_all_lodg': 12,
 'GrainYield_Robot_2020_all': 6,
 'Name_Robot_2020_all': 7,
 'CodeName_Robot_2020_all': 8,
 'Heading_Date_Robot_2020_all': 9,
 'Maturity_Date_Robot_2020_all': 10,
 'Days2Heading_Robot_2020_all': 11,
 'Days2Maturity_Robot_2020_all': 12,
 'GrainYield_Staur_2019_all': 6,
 'Name_Staur_2019_all': 7,
 'Line_Staur_2019_all': 8,
 'Days2Heading_Staur

# Finding dates between heading and maturity

In [10]:
yield_cols

['GrainYield',
 'Name',
 'CodeName',
 'Pedigree',
 'Line',
 'Heading_Date',
 'Maturity_Date',
 'Days2Heading',
 'Days2Maturity',
 'Lodging']

In [11]:
for df in all_df:
    temp_df = locals()[df]
    if 'Days2Maturity' in temp_df.columns:
        print(df)
all_df

Masbasis_2019_all
Masbasis_2020_all_lodg
Robot_2020_all
Staur_2019_all


['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

## Declaring the important dates for each field

In [137]:
# Dates listed in dict in order; sowing, heading, maturity
# The order of fields must be the same as in all_df list
# sowing_dict = {
#     'Graminor_2019': ['240419', 'XX', 'XX'],
#     'Graminor_2020': ['150420', 'XX', 'XX'],
#     'Masbasis_2019': ['190519', 'XX', 'XX'],
#     'Masbasis_2020': ['150520', 'XX', 'XX'],
#     'Robot_2020': ['200420', '170620', '310720'],
#     'Staur_2019': ['040619', 'XX', 'XX'],
#     'Staur_2020': ['210420', 'XX', 'XX'],
# }

sowing_dict = {
    'Graminor_2019': '240419',
    'Graminor_2020': '150420',
    'Masbasis_2019': '190519',
    'Masbasis_2020': '150520',
    'Robot_2020': '200420',
    'Staur_2019': '040619',
    'Staur_2020': '210420',
}


## Filtering the df based on imp_dates dict

In [124]:
# # Creating a list to add the names of new filtered df
# all_df_dates_filtered = {}

# for field, key in zip_longest(all_df, imp_dates):
#     # Checking if the field df and key in the dict are for the same field
#     print(field, key)
#     assert field.split('_')[0] == key.split('_')[0]
    
#     # Getting dates from imp_dates dict
#     sowing, maturity, heading = imp_dates[key]
   
#     sowing_date = datetime.datetime.strptime(sowing, '%d%m%y').date()
#     heading_date = datetime.datetime.strptime(maturity, '%d%m%y').date()
#     maturity_date =datetime.datetime.strptime(heading, '%d%m%y').date()
    
#     # Iterating through all base indices column names
#     for col in general_col_names[1:]:

#         cols_current = [x for x in locals()[field].columns if col+'_' in x]
#         dates = [x.split('_')[1] for x in cols_current]
#         date_fmt = [datetime.datetime.strptime(x, '%d%m%y').date() for x in dates]
#         # Listing the dates in between(and including) heading and maturity dates
#         in_between_dates = [x.strftime("%d%m%y") for x in date_fmt\
#                             if x >= heading_date and x <= maturity_date]
                        
#         dates_not_usable = [x.strftime("%d%m%y") for x in date_fmt\
#                              if not x.strftime("%d%m%y") in in_between_dates]

#     # Filter the datasets with date between heading and maturity
#     select_cols = [x for x in locals()[field].columns if x[-6:] not in dates_not_usable]
#     temp_df = locals()[field][select_cols]

#     # Adding the names of new df into all_df_dates_filtered list
#     filtered_df = key+'_dates_filtered'
#     all_df_dates_filtered[filtered_df] = imp_dates[key]
#     locals()[filtered_df] = temp_df
# all_df_dates_filtered

## Filtering df which have Days2Maturity and Days2Heading 

In [151]:
# If the dataset had Days 2 heading and days to maturity columns then create the
# following dictionary with the respective sowing dates of each field as value
all_df_dates_filtered = {}

for df in all_df:
    temp_df = locals()[df]
    field_temp = df.split('_')[0]+'_'+df.split('_')[1]
    if 'Days2Heading' in temp_df.columns and 'Days2Maturity' in temp_df.columns:
        print(df)
        all_df_dates_filtered[df] = sowing_dict[field_temp]
all_df_dates_filtered

Masbasis_2019_all
Masbasis_2020_all_lodg
Robot_2020_all
Staur_2019_all


{'Masbasis_2019_all': '190519',
 'Masbasis_2020_all_lodg': '150520',
 'Robot_2020_all': '200420',
 'Staur_2019_all': '040619'}

# Integration

In [152]:
from scipy.integrate import simps
from numpy import trapz

## Creating df with Plot_ID and Grain_Yield only
## Calculating AUC and creating new df with calculated values

In [153]:
cols = locals()[df].columns
non_indices_cols = [x for x in cols if x.split('_')[0] not in general_col_names[1:]]
non_indices_cols

['Plot_ID',
 'GrainYield',
 'Name',
 'Pedigree',
 'Lodging',
 'NDVI_200620',
 'MTCI_200620',
 'DVI_200620',
 'GDVI_200620',
 'MTCI_CI_200620',
 'EXG_200620',
 'EXGR_200620',
 'RDVI_200620',
 'TDVI_200620',
 'GNDVI_200620',
 'NDRE_200620',
 'SCCI_200620',
 'EVI_200620',
 'TVI_200620',
 'VARI_200620',
 'GARI_200620',
 'GCI_200620',
 'GLI_200620',
 'NLI_200620',
 'MNLI_200620',
 'SAVI_200620',
 'GSAVI_200620',
 'OSAVI_200620',
 'GOSAVI_200620',
 'MSAVI2_200620',
 'MSR_200620',
 'GRVI_200620',
 'WDRVI_200620',
 'SR_200620',
 'NDVI_250620',
 'MTCI_250620',
 'DVI_250620',
 'GDVI_250620',
 'MTCI_CI_250620',
 'EXG_250620',
 'EXGR_250620',
 'RDVI_250620',
 'TDVI_250620',
 'GNDVI_250620',
 'NDRE_250620',
 'SCCI_250620',
 'EVI_250620',
 'TVI_250620',
 'VARI_250620',
 'GARI_250620',
 'GCI_250620',
 'GLI_250620',
 'NLI_250620',
 'MNLI_250620',
 'SAVI_250620',
 'GSAVI_250620',
 'OSAVI_250620',
 'GOSAVI_250620',
 'MSAVI2_250620',
 'MSR_250620',
 'GRVI_250620',
 'WDRVI_250620',
 'SR_250620',
 'NDVI_09

In [None]:
simp_df_all = []
for df, sowing in all_df_dates_filtered.items():

    # Creating a list of columns which are not present in 
    # the general_col_names list (except the first item,i.e. Plot_ID)
    temp_df = locals()[df].copy()
    cols = temp_df.columns
    non_indices_cols = [x for x in cols if x.split('_')[0] not in general_col_names[1:]]

#     # ORR
#     non_indices_cols = id_cols_new+yield_cols_found
#     non_indices_cols

    df_auc = temp_df[non_indices_cols]
#     display(df_auc.head())

    # Calculating AUC and creating new df with calculated values
    for col_name in general_col_names[1:]:
        df_simp = []
        temp_cols = [x for x in cols if col_name+'_' in x]
        temp_dates = [datetime.datetime.strptime(date.split('_')[1], '%d%m%y').date() for date in temp_cols]

        # Calculating the days from sowing,i.e. age of the crop in days
        sowing_date = datetime.datetime.strptime(sowing, '%d%m%y').date() 
        

        for sample in range(temp_df.shape[0]):
            # Number of days since sowing for each entry
            days_sow = [(x-sowing_date).days for x in temp_dates]
            # The respective value of the index in question 
            temp_entries= [temp_df[x][sample] for x in temp_cols]

            #### DROPPING DATES OUTSIDE HEADING AND MATURITY DATES ####
            # Days to heading for current sample 
            DH = temp_df[Days2Heading][sample]
            # Days to maturity for current sample 
            DM = temp_df[Days2Maturity][sample]
            
            # Making sure that the maturity comes after heading
            assert DM > DH
            
            heading_date = sowing_date + datetime.timedelta(days=DH)
            maturity_date = sowing_date + datetime.timedelta(days=DM)
            
            # Replacing the respective values of items in temp_entries with np.nan which correspond 
            # to dates not in between heading and maturity for that specific sub-plot
            temp_entries = [y if heading_date <= x <= maturity_date else np.nan for x,y in zip(temp_dates, temp_entries)]
            
            # Dropping missing(nan) values from the entries
            temp_entries_dropna = [x for x in temp_entries if str(x) != 'nan']

            # Checking if the number of items in temp_entries and days_sow is the same
            # If not, i.e., there are missing values(nan) in temp_entries then drop the
            # respective entries from days_sow list
            if not len(temp_entries_dropna) == len(days_sow):
                # Dictionary comprehension
                # Creating dictionary(dict comprehension) where temp_entries are not nan
                dict_dropna = {i: [temp_entries[i], days_sow[i]] for i in range(len(temp_entries))\
                       if not str(temp_entries[i]) == 'nan' }
                
                # Checking if the previously created temp_entries_dropna is the same as the new that will
                # be created from dict_dropna (Unnecessary check but curious to check if any problems arise)
                assert temp_entries_dropna == [dict_dropna[i][0] for i in dict_dropna.keys()]
                
                # Creating new temp_entries and days_sow after dropping nan and respective entries in days_sow
                temp_entries_dropna = [dict_dropna[i][0] for i in dict_dropna.keys()]
                days_sow = [dict_dropna[i][1] for i in dict_dropna.keys()]

#             Checking if the lists have the same number of entries
            assert len(temp_entries_dropna) == len(days_sow)

            df_simp.append(simps(temp_entries_dropna, days_sow))

        # Insert the new column at the end, but before GrainYield
        df_auc.insert(len(df_auc.columns)-1, col_name, df_simp)

    # Adding the new name of the df to a list named simp_df_all
    simp_df = df[:-15]+'_Simps'
    simp_df_all.append(simp_df)
    locals()[simp_df] = df_auc.copy()
simp_df_all

In [42]:
simp_df_all = []
for df, imp_dates in all_df_dates_filtered.items():

    # Creating a list of columns which are not present in 
    # the general_col_names list (except the first item,i.e. Plot_ID)
    temp_df = locals()[df].copy()
    cols = temp_df.columns
    non_indices_cols = [x for x in cols if x.split('_')[0] not in general_col_names[1:]]

#     # ORR
#     non_indices_cols = id_cols_new+yield_cols_found
#     non_indices_cols

    df_auc = temp_df[non_indices_cols]
#     display(df_auc.head())

    # Calculating AUC and creating new df with calculated values
    for col_name in general_col_names[1:]:
        df_simp = []
        temp_cols = [x for x in cols if col_name+'_' in x]
        temp_dates = [datetime.datetime.strptime(date.split('_')[1], '%d%m%y').date() for date in temp_cols]

        # Calculating the days from sowing,i.e. age of the crop in days
        sowing_date = datetime.datetime.strptime(imp_dates[0], '%d%m%y').date() 
        

        for sample in range(temp_df.shape[0]):
            # Number of days since sowing for each entry
            days_sow = [(x-sowing_date).days for x in temp_dates]
            # The respective value of the index in question 
            temp_entries= [temp_df[x][sample] for x in temp_cols]
            # Dropping missing(nan) values from the entries
            temp_entries_dropna = [x for x in temp_entries if str(x) != 'nan']

            # Checking if the number of items in temp_entries and days_sow is the same
            # If not, i.e., there are missing values(nan) in temp_entries then drop the
            # respective entries from days_sow list
            if not len(temp_entries_dropna) == len(days_sow):
                # Dictionary comprehension
                # Creating dictionary where temp_entries are not nan
                dict_dropna = {i: [temp_entries[i], days_sow[i]] for i in range(len(temp_entries))\
                       if not str(temp_entries[i]) == 'nan' }
                
                # Creating new temp_entries and days_sow after dropping nan and respective entries in days_sow
                temp_entries_dropna = [dict_dropna[i][0] for i in dict_dropna.keys()]
                days_sow = [dict_dropna[i][1] for i in dict_dropna.keys()]

#             Checking if the lists have the same number of entries
            assert len(temp_entries_dropna) == len(days_sow)

            df_simp.append(simps(temp_entries_dropna, days_sow))

        # Insert the new column at the end, but before GrainYield
        df_auc.insert(len(df_auc.columns)-1, col_name, df_simp)

    # Adding the new name of the df to a list named simp_df_all
    simp_df = df[:-15]+'_Simps'
    simp_df_all.append(simp_df)
    locals()[simp_df] = df_auc.copy()
simp_df_all

[]

# Temp: Exporting data to be used for model

In [40]:
temp_data = export_path+'Temp_Data/'
os.makedirs(temp_data, exist_ok=True)
# for df in simp_df_all:
#     locals()[df].to_csv(temp_data+df+'.csv', index=False)

# Import Genomics Data

## Importing Genomics Data

In [None]:
## Importing Yield data with line information

In [64]:
# Vollebekk 2019: Graminor_2019_x_19TvPhenores_x_Vollebekk_res
# Masbasis 2020: Masbasis_x_20BMLGI1_2020_tm_x_data
# Robot 2020: Robot_x_ROBOT_2020_x_raw
# Masbasis 2019: Masbasis_2019_x_Field_data_2019

In [65]:
a_file = open(main_path+'yield_df.json', "r")
output_str = a_file.read()
# The file is imported as string

# Converting it to dictionary
output_dict = json.loads(output_str)
a_file.close()

pprint(output_dict)

{'Graminor 2019': ['Graminor_2019_x_19TvPhenores_x_Vollebekk_res',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Graminor_2019\\19TvPhenores.xlsx'],
 'Masbasis 2019': ['Masbasis_2019_x_Field_data_2019',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Masbasis_2019\\Field_data_2019.xlsx'],
 'Masbasis 2020': ['Masbasis_x_20BMLGI1_2020_tm_x_data',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Vollebekke-total_2020\\Masbasis\\20BMLGI1_2020_tm.xlsx'],
 'Robot 2020': ['Robot_x_ROBOT_2020_x_raw',
                '\\MegaSync\\NMBU\\Master '
                'Thesis\\Data\\Feb2021\\Vollebekke-total_2020\\Robot\\ROBOT_2020.xlsx'],
 'Staur 2019': ['Graminor_2019_x_19TvPhenores_x_Staur_res',
                '\\MegaSync\\NMBU\\Master '
                'Thesis\\Data\\Feb2021\\Graminor_2019\\19TvPhenores.xlsx']}


### Checking number of unique cultivars in the field

In [66]:
# plots_data = pd.read_excel(files_with_address[0],engine='openpyxl')
# # Pandas converts 'NA' string to NaN. Need to change those to 
# # some string to get a count as NaNs are not counted as unique values

# plots_data.Name.fillna('-', inplace=True)
# plots_data.CodeName.fillna('-', inplace=True)

# # Creating a new column as multiple plots were named 'NA' but the 
# # CodeName was different for each one of them
# plots_data['NameCode'] = plots_data.Name+plots_data.CodeName

# plots_data
# len(plots_data.NameCode.unique())
# plots_data.NameCode.value_counts()
# # plots_data.NameCode.value_counts().sum()
# # plots_data

# ToDo: Dropping NAN

## Finding NAN values
### ToDo: Test: Raise error if missing values found

In [25]:
# Finding number of missing values in each dataframe
df_with_nan = []
missing_values = False
for df in all_df:
    if locals()[df].isna().sum().sum() > 0:
        print(f'Total missing values in {df} are {locals()[df].isna().sum().sum()}')
        missing_values = True
        df_with_nan.append(df)
#     if len(df_with_nan) > 0:
#         raise ValueError
if not missing_values:
    print('No missing value found in any dataframe')

No missing value found in any dataframe


In [26]:
Graminor_2019_all.isnull().sum().sort_values()

NameError: name 'Graminor_2019_all' is not defined

In [27]:
df_with_nan

[]

In [28]:
# Finding which column has NAN values
for df in df_with_nan:
    print(f'{df}:\n {locals()[df].shape[1]-locals()[df].dropna(axis=1).shape[1]} columns or {locals()[df].shape[0]-locals()[df].dropna().shape[0]} rows to be dropped,')

## ToDo: Automate: Drop rows with missing values in df_with_nan

In [29]:
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} Before dropping')
# Graminor_eastwest_020719_NIR_half_missing.dropna(inplace=True)
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} After dropping')


NameError: name 'Graminor_eastwest_020719_NIR_half_missing' is not defined

## ORRR

## ToDo: Droppping df with Nan from the all_df_std

In [30]:
print(f'Number of items in all_df is {len(all_df)}')

Number of items in all_df is 0


In [31]:
# for df in df_with_nan:
#     all_df.remove(df)

###  ToDo: Update field_year_dict and sorted_field_year_dict after dropping the dataset

In [32]:
print(f'Number of items in all_df now is {len(all_df)}')

Number of items in all_df now is 0


# Data Trends

## Normal Distribution of data

ToDo:  
see the distribution of data if it is normal  
else make transpose to make it normal  
dist in Gausion function   
in each field  
what if the data is normal dist?  
the use some transpose to box pox   
try diff funct to see which one iis able to make data normal  
make heat map of whole if not normal  
see which parts are not normal and exculde them  
ls_means in R to make the normalisation/transpose  
pearson corr bw yield and indices for diff dates  


In [33]:
x_labels

NameError: name 'x_labels' is not defined

### Yeo-Johnson Transformation

In [34]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        

        for field_sample, dates in sorted_field_year_dict_yield.items():
            x_labels = []
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
                x_label = date.strftime('%d-%m-%y')+':'+str(len(locals()[field_df][col]))
                
                x_labels.append(x_label)
                x_labels= list(set(x_labels))
            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='yeo-johnson', standardize=False)

            temp_arr = pt.fit_transform(temp_df)
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]
            
            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=-35)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_yeo-johnson')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### Box-Cox Transformation

In [35]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        for field_sample, dates in sorted_field_year_dict_yield.items():
            
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
            x_labels = temp_df.columns.tolist()

            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='box-cox', standardize=False)

            # Taking absolute values of the dataframe(avoiding negative values)
            temp_arr = pt.fit_transform(temp_df.abs())
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]

            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=90)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_box-cox')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### ToDo: Identify Dates and index with problems

### Ecxclude the problematic data/dates
or
### Take average values where the problematic data is

Take average of data for date 20200708 and 20200624  
Masbasis  
Cleanup  
Remove dates which have drop  

## ToDo: Remove outliers

### Find AUC for all dates of one field
See if it covers tha gaps under the dates,i.e.

Since data points are different  
Flying time is different  
Cover the gaps between the dates  

Since the data collection is not uniform throughout the year so AUC will give a single value instead of multiple values for one field year which will be representative of all the dates 

#### Option 1: Use Scipy

In [36]:
import scipy
scipy.__version__

'1.6.1'

In [37]:
from scipy import integrate
from scipy.integrate import simps

In [38]:
from scipy.integrate import simpson

In [39]:
x = np.arange(0, 10)
y = np.arange(0, 10)


In [40]:
# integrate.simpson(y, x)
integrate.simps(y, x)

40.5

In [41]:
y = np.power(x, 3)
y

array([  0,   1,   8,  27,  64, 125, 216, 343, 512, 729], dtype=int32)

In [42]:
integrate.simpson(y, x)
# integrate.simps(y, x)


1642.5

In [43]:
integrate.quad(lambda x: x**3, 0, 9)[0]

1640.25

In [44]:
integrate.simpson(y, x, even='first')
# integrate.simps(y, x, even='first')

1644.5

#### Option 2

In [45]:
data
# plot: Plot ID
# x: Number of days after sowing or actual date
# y: Value of the index


NameError: name 'data' is not defined

In [46]:
# x: Days from sowing to data collection
# May 5 2019 Masbasis and Graminor
# Robot: 

data={'plot':['1','1','2','2','3','3'],'x':['5','6','7','8','9','10'],'y':['0.9','0.8','0.7','0.6','0.5','0.4'] }

ACC=[]
A=pd.DataFrame(data, columns=['plot','x','y'])
AA=0

for item in range(len(A)-1):
    if A['plot'][item]== A['plot'][item+1]:
        Ans=(float((A['y'][item]))+float((A['y'][item+1])))*((float((A['x'][item+1]))-float((A['x'][item]))))/2
        AA+=Ans
        print(AA)
        ACC.append(AA)

0.8500000000000001
1.5
1.95


### Alternative

In [47]:
df1=Data.set_index(['Plot'])
ACC=[]

for item in Numbers_final:
    df2=df1[df1.index==item]
    df2=df2.filter(['Blue', 'Green', 'Red', 'RedEdge', 'NIR','NDVI', 'MTCI', 'EVI', 'DVI', 'RVI', 'VARI', 'EXG', 'EXGR', 'GLI', 'GNDVI', 'GVI','Time','timepoint'], axis=1)
    df2=df2.sort_values(by='timepoint')
    df3=df2.reset_index()

AA=0
for j in range(0,3):
    Ans=(float((df3['GVI'][j]))+float((df3['GVI'][j+1])))*((float((df3['timepoint'][j+1]))-float((df3['timepoint'][j]))))/2
    AA+=Ans

    print(AA)
    ACC.append(AA)



DA=pd.DataFrame(ACC)
DD=pd.DataFrame(Numbers_final)
DDA=pd.concat([DD, DA], axis=1)
DDA.to_excel('Staur_Accumulative_GVI_2019.xlsx')

NameError: name 'Data' is not defined

### Time series data vs the AUC

# ToDo: Model Training


Make model for one year at a time and try to predict yield of another field  

TODO: Train on Masbasis 2019 an 2020  
Test on Staur  

Use data until august for yield prediction since it is most relavant  
Use all data for predicting date to maturity  

Data Collection:  
Data collection usually starts after heading  
2019 has the data before hading as well. To use that, dont use dates before heading  

NDVI is resistant to shadows  

DAT390 Report: Do the report with Robot Data only  

TODO: Use AUC for each index for prediction  

TODO:   
Time series data vs the AUC  