# Weather and Genomics Data

Note: Half datasets, with separate files for east and west subplots have been merged manually in excel

In [1]:
%%time

import os
import math
import datetime
import numpy as np
import pandas as pd
from copy import copy

# Dictionaries
import json
from pprint import pprint

# Iterate in loops
from itertools import zip_longest

# Simpsons integration
from numpy import trapz
from scipy.integrate import simps

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# To display df nicely in loops
from IPython.display import display 
# display(df1.head()) 
# display(df2.head())

# Display rows and columns Pandas
pd.options.display.max_columns = 100
pd.set_option('display.max_rows',100)

Wall time: 1.16 s


In [2]:
# Prints the current working directory
os.getcwd()
# os.listdir()

'C:\\Users\\fahad\\MegaSync\\NMBU\\GitHub\\vPheno'

## Finding Username folder to make general path for multi PC use

In [3]:
username = str(os.getcwd()).split('\\')[2]
user_path = r'C:/Users/'+username+'/'
username, user_path

('fahad', 'C:/Users/fahad/')

## Importing Data

In [4]:
main_path = r'./Data/'
path = r'./Data/renamed_merged/'
export_path = './Data/results/'
weather_data = user_path+r'\\MegaSync\NMBU\Master Thesis\Data\Weather\Weather_Data.csv'
genomics_data = user_path+r'\\MegaSync\NMBU\Master Thesis\Data\Genomics\\'

# Create export_path folder if not exists already
os.makedirs(path, exist_ok=True)
os.makedirs(export_path, exist_ok=True)

os.listdir(path)

['Graminor_2019_all.csv',
 'Graminor_2020_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all_lodg.csv',
 'Robot_2020_all.csv',
 'Staur_2019_all.csv',
 'Staur_2020_all_lodg.csv']

## Data Preparation
### Creating list of complete files

In [5]:
# Get the list of all files in directory tree at given path

files_with_address = []
files_list = []

for (dirpath, dirnames, filenames) in os.walk(path):
    files_with_address += [os.path.join(dirpath, file) for file in filenames]
    files_list.extend(filenames)
    
print(len(files_with_address), 'files found in the directory')
# files_with_address
files_list

7 files found in the directory


['Graminor_2019_all.csv',
 'Graminor_2020_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all_lodg.csv',
 'Robot_2020_all.csv',
 'Staur_2019_all.csv',
 'Staur_2020_all_lodg.csv']

## Data Checking/control

### Check for duplicate filenames

In [6]:
print('Total number of files are :', len(files_list))

print('Number of unique file names are:', len(set(files_list)))

print('There is/are', len(files_list) - len(set(files_list)),'duplicate file name/names.')
if len(files_list) - len(set(files_list)) > 0:
    raise NameError

Total number of files are : 7
Number of unique file names are: 7
There is/are 0 duplicate file name/names.


## Importing data files to Pandas

In [7]:
   
%%time

all_df = []
for data in files_with_address:
    file_name = os.path.splitext(os.path.basename(data))[0]

    # Replce all invalid characters in the name
    file_name = file_name.replace(" ", "_")
    file_name = file_name.replace("-", "_")
    file_name = file_name.replace(")", "")
    file_name = file_name.replace("(", "")
    df_name = file_name.replace(".", "")
    # Test: Check if the same date is already present in the current dict key
    if df_name in all_df:
        print(f'A file with the same name {df_name} has already been imported. \n Please check if there is duplication of data.')
        raise NameError
    all_df.append(df_name)

    locals()[df_name] = pd.read_csv(data, index_col=False)
    print(df_name, '=====', locals()[df_name].shape)
# all_df

Graminor_2019_all ===== (600, 378)
Graminor_2020_all ===== (400, 378)
Masbasis_2019_all ===== (528, 278)
Masbasis_2020_all_lodg ===== (659, 416)
Robot_2020_all ===== (96, 484)
Staur_2019_all ===== (1328, 176)
Staur_2020_all_lodg ===== (1504, 209)
Wall time: 398 ms


In [8]:
print(f'Total imported {len(all_df)}')
all_df

Total imported 7


['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

# Finding yield columns

In [9]:
# ToDo: Add check for duplicate columns in the df

general_col_names = ['Plot_ID', 'Blue', 'Green', 'Red', 'RedEdge', 'NIR']

base_indices = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR']

spectral_indices = ['NDVI', 'MTCI', 'DVI', 'GDVI', 'MTCI_CI', 'EXG', 'EXGR', 'RDVI',
                    'TDVI', 'GNDVI', 'NDRE', 'SCCI', 'EVI', 'TVI', 'VARI', 'GARI',
                    'GCI', 'GLI', 'NLI', 'MNLI', 'SAVI', 'GSAVI', 'OSAVI', 'GOSAVI',
                    'MSAVI2', 'MSR', 'GRVI', 'WDRVI', 'SR']
# list_agg_df
yield_cols = ['GrainYield', 'Name', 'CodeName', 'Pedigree', 'Line', 'Heading_Date',
              'Maturity_Date', 'Days2Heading', 'Days2Maturity', 'Lodging']

id_cols_new = ['Plot_ID']

# Counter for location of column in columns list

# Dict for saving the name and location of the yield column/s
loc_yield_cols = {}
for df in all_df:
    loc = 0
    for cols in locals()[df].columns.tolist():
        for y_col in yield_cols:
            if not cols.find(y_col):
                loc_yield_cols[cols+'_'+df] = loc
                print(f'\"{cols}\" column in {df} is the yield column\n as it contains the text \"{y_col}\". It is located at location {loc}')
        loc += 1

    yield_cols_found = list(loc_yield_cols.keys())
    target_cols=yield_cols_found[0]
loc_yield_cols

"GrainYield" column in Graminor_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Name" column in Graminor_2019_all is the yield column
 as it contains the text "Name". It is located at location 7
"Pedigree" column in Graminor_2019_all is the yield column
 as it contains the text "Pedigree". It is located at location 8
"GrainYield" column in Graminor_2020_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Name" column in Graminor_2020_all is the yield column
 as it contains the text "Name". It is located at location 7
"Pedigree" column in Graminor_2020_all is the yield column
 as it contains the text "Pedigree". It is located at location 8
"GrainYield" column in Masbasis_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"Name" column in Masbasis_2019_all is the yield column
 as it contains the text "Name". It is located at location 7
"Line" column in Mas

{'GrainYield_Graminor_2019_all': 6,
 'Name_Graminor_2019_all': 7,
 'Pedigree_Graminor_2019_all': 8,
 'GrainYield_Graminor_2020_all': 6,
 'Name_Graminor_2020_all': 7,
 'Pedigree_Graminor_2020_all': 8,
 'GrainYield_Masbasis_2019_all': 6,
 'Name_Masbasis_2019_all': 7,
 'Line_Masbasis_2019_all': 8,
 'Days2Heading_Masbasis_2019_all': 9,
 'Days2Maturity_Masbasis_2019_all': 10,
 'GrainYield_Masbasis_2020_all_lodg': 6,
 'Name_Masbasis_2020_all_lodg': 7,
 'Line_Masbasis_2020_all_lodg': 8,
 'Maturity_Date_Masbasis_2020_all_lodg': 9,
 'Days2Heading_Masbasis_2020_all_lodg': 10,
 'Days2Maturity_Masbasis_2020_all_lodg': 11,
 'Lodging_Masbasis_2020_all_lodg': 12,
 'GrainYield_Robot_2020_all': 6,
 'Name_Robot_2020_all': 7,
 'CodeName_Robot_2020_all': 8,
 'Heading_Date_Robot_2020_all': 9,
 'Maturity_Date_Robot_2020_all': 10,
 'Days2Heading_Robot_2020_all': 11,
 'Days2Maturity_Robot_2020_all': 12,
 'GrainYield_Staur_2019_all': 6,
 'Name_Staur_2019_all': 7,
 'Line_Staur_2019_all': 8,
 'Days2Heading_Staur

# Finding dates between heading and maturity

In [10]:
yield_cols

['GrainYield',
 'Name',
 'CodeName',
 'Pedigree',
 'Line',
 'Heading_Date',
 'Maturity_Date',
 'Days2Heading',
 'Days2Maturity',
 'Lodging']

In [11]:
for df in all_df:
    temp_df = locals()[df].copy()
    if 'Days2Maturity' in temp_df.columns:
        print(df)
all_df

Masbasis_2019_all
Masbasis_2020_all_lodg
Robot_2020_all
Staur_2019_all


['Graminor_2019_all',
 'Graminor_2020_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all_lodg',
 'Robot_2020_all',
 'Staur_2019_all',
 'Staur_2020_all_lodg']

## Declaring the important dates for each field

In [12]:
# Dates listed in dict in order; sowing, heading, maturity
# The order of fields must be the same as in all_df list
# sowing_dict = {
#     'Graminor_2019': ['240419', 'XX', 'XX'],
#     'Graminor_2020': ['150420', 'XX', 'XX'],
#     'Masbasis_2019': ['190519', 'XX', 'XX'],
#     'Masbasis_2020': ['150520', 'XX', 'XX'],
#     'Robot_2020': ['200420', '170620', '310720'],
#     'Staur_2019': ['040619', 'XX', 'XX'],
#     'Staur_2020': ['210420', 'XX', 'XX'],
# }

sowing_dict = {
    'Graminor_2019': '240419',
    'Graminor_2020': '150420',
    'Masbasis_2019': '190519',
    'Masbasis_2020': '150520',
    'Robot_2020': '200420',
    'Staur_2019': '040619',
    'Staur_2020': '210420',
}


## Filtering the df based on imp_dates dict

In [13]:
# # Creating a list to add the names of new filtered df
# all_df_dates_filtered = {}

# for field, key in zip_longest(all_df, imp_dates):
#     # Checking if the field df and key in the dict are for the same field
#     print(field, key)
#     assert field.split('_')[0] == key.split('_')[0]
    
#     # Getting dates from imp_dates dict
#     sowing, maturity, heading = imp_dates[key]
   
#     sowing_date = datetime.datetime.strptime(sowing, '%d%m%y').date()
#     heading_date = datetime.datetime.strptime(maturity, '%d%m%y').date()
#     maturity_date =datetime.datetime.strptime(heading, '%d%m%y').date()
    
#     # Iterating through all base indices column names
#     for col in general_col_names[1:]:

#         cols_current = [x for x in locals()[field].columns if col+'_' in x]
#         dates = [x.split('_')[1] for x in cols_current]
#         date_fmt = [datetime.datetime.strptime(x, '%d%m%y').date() for x in dates]
#         # Listing the dates in between(and including) heading and maturity dates
#         in_between_dates = [x.strftime("%d%m%y") for x in date_fmt\
#                             if x >= heading_date and x <= maturity_date]
                        
#         dates_not_usable = [x.strftime("%d%m%y") for x in date_fmt\
#                              if not x.strftime("%d%m%y") in in_between_dates]

#     # Filter the datasets with date between heading and maturity
#     select_cols = [x for x in locals()[field].columns if x[-6:] not in dates_not_usable]
#     temp_df = locals()[field][select_cols].copy()

#     # Adding the names of new df into all_df_dates_filtered list
#     filtered_df = key+'_dates_filtered'
#     all_df_dates_filtered[filtered_df] = imp_dates[key]
#     locals()[filtered_df] = temp_df
# all_df_dates_filtered

## Filtering df which have Days2Maturity and Days2Heading 

In [14]:
# If the dataset had Days 2 heading and days to maturity columns then create the
# following dictionary with the respective sowing dates of each field as value
all_df_sowing = {}

for df in all_df:
    temp_df = locals()[df].copy()
    field_temp = df.split('_')[0]+'_'+df.split('_')[1]
    if 'Days2Heading' in temp_df.columns and 'Days2Maturity' in temp_df.columns:
        print(df)
        all_df_sowing[df] = sowing_dict[field_temp]
all_df_sowing

Masbasis_2019_all
Masbasis_2020_all_lodg
Robot_2020_all
Staur_2019_all


{'Masbasis_2019_all': '190519',
 'Masbasis_2020_all_lodg': '150520',
 'Robot_2020_all': '200420',
 'Staur_2019_all': '040619'}

# Integration

In [15]:
from scipy.integrate import simps
from numpy import trapz

## Creating df with Plot_ID and Grain_Yield only
## Calculating AUC and creating new df with calculated values

In [16]:
simp_df_all = []
samples_record_simps = {}
for df, sowing in all_df_sowing.items():

    temp_df = locals()[df].copy()
    cols = temp_df.columns
    
    # Creating a list of columns which other than the indices (ID and yield columns)
    # Making a temp list of yield columns since all entries from yield cols are not present in every df
    temp_yield_cols = [x for x in temp_df.columns if x in yield_cols]
    non_indices_cols = id_cols_new+temp_yield_cols
    print(non_indices_cols)
    df_auc = temp_df[non_indices_cols]
    display(df_auc.head())

    # Calculating AUC and creating new df with calculated values
    for col_name in base_indices+spectral_indices:
        df_simp = []
        # Making temp_cols list avoids problems finding and differentiating 'OSAVI' and 'GOSAVI'
        temp_cols = [x for x in cols if col_name.split('_') == x.split('_')[:-1]]
        temp_dates = [datetime.datetime.strptime(date.split('_')[-1], '%d%m%y').date() for date in temp_cols]

        # Calculating the days from sowing,i.e. age of the crop in days
        sowing_date = datetime.datetime.strptime(sowing, '%d%m%y').date() 
        
        temp_samples = []
        for sample in range(temp_df.shape[0]):
            # Number of days since sowing for each entry
            days_sow = [(x-sowing_date).days for x in temp_dates]
            # The respective value of the index in question 
            temp_entries= [temp_df[x][sample] for x in temp_cols]

            #### DROPPING DATES OUTSIDE HEADING AND MATURITY DATES ####
            # Days to heading for current sample 
            DH = temp_df.Days2Heading[sample]
            # Days to maturity for current sample 
            DM = temp_df.Days2Maturity[sample]
            
            # Skip the entry if wither DH or DM is Nan
            if str(DH)=='nan' or str(DM)=='nan':
#                 print(DH, DM, 'NAN')
                df_simp.append(np.nan)
                continue

            DH = int(DH)
            DM = int(DM)
            # Making sure that the maturity comes after heading
            if DM < DH:
                print(DM, DH)
            assert DM > DH
#             print(DM, DH)
            heading_date = sowing_date + datetime.timedelta(days=DH)
            maturity_date = sowing_date + datetime.timedelta(days=DM)
            
            # Replacing the respective values of items in temp_entries with np.nan which correspond 
            # to dates not in between heading and maturity for that specific sub-plot
            temp_entries = [y if heading_date <= x <= maturity_date else np.nan for x,y in zip(temp_dates, temp_entries)]
            
            # Dropping missing(nan) values from the entries
            temp_entries_dropna = [x for x in temp_entries if str(x) != 'nan']

            # Checking if the number of items in temp_entries and days_sow is the same
            # If not, i.e., there are missing values(nan) in temp_entries then drop the
            # respective entries from days_sow list
            if not len(temp_entries_dropna) == len(days_sow):
                # Dictionary comprehension
                # Creating dictionary(dict comprehension) where temp_entries are not nan
                dict_dropna = {i: [temp_entries[i], days_sow[i]] for i in range(len(temp_entries))\
                       if not str(temp_entries[i]) == 'nan' }
                
                # Checking if the previously created temp_entries_dropna is the same as the new that will
                # be created from dict_dropna (Unnecessary check but curious to check if any problems arise)
                assert temp_entries_dropna == [dict_dropna[i][0] for i in dict_dropna.keys()]
                
                # Creating new temp_entries and days_sow after dropping nan and respective entries in days_sow
                temp_entries_dropna = [dict_dropna[i][0] for i in dict_dropna.keys()]
                days_sow = [dict_dropna[i][1] for i in dict_dropna.keys()]

#             Checking if the lists have the same number of entries
            assert len(temp_entries_dropna) == len(days_sow)

            df_simp.append(simps(temp_entries_dropna, days_sow))
            temp_samples.append([simps(temp_entries_dropna, days_sow), temp_entries_dropna, days_sow])
        samples_record_simps[df+'_'+col_name] = temp_samples
        
        # Insert the new column at the end, but before GrainYield
        df_auc.insert(len(df_auc.columns)-1, col_name, df_simp)

    # Adding the new name of the df to a list named simp_df_all
    simp_df = df.split('_')[0]+'_'+df.split('_')[1]+'_Simps'
    simp_df_all.append(simp_df)
    locals()[simp_df] = df_auc.copy()
simp_df_all

['Plot_ID', 'GrainYield', 'Name', 'Line', 'Days2Heading', 'Days2Maturity']


Unnamed: 0,Plot_ID,GrainYield,Name,Line,Days2Heading,Days2Maturity
0,1101,522.666667,GN12687,1574,66,107
1,1102,388.0,Avocet YrA,28,69,110
2,1103,541.333333,GN08557,1313,70,108
3,1104,572.0,GN08541,1311,69,109
4,1105,542.666667,SW44431,1324,67,106


['Plot_ID', 'GrainYield', 'Name', 'Line', 'Maturity_Date', 'Days2Heading', 'Days2Maturity', 'Lodging']


Unnamed: 0,Plot_ID,GrainYield,Name,Line,Maturity_Date,Days2Heading,Days2Maturity,Lodging
0,1101,,MS 273-150,26.0,,67,,1.0
1,1102,,Sabin,1322.0,,65,,1.0
2,1103,,T2038,25.0,,65,,1.0
3,1104,,Bastian,,,65,,1.0
4,1105,713.333333,T9040,6.0,2020-08-11,66,88.0,0.0


['Plot_ID', 'GrainYield', 'Name', 'CodeName', 'Heading_Date', 'Maturity_Date', 'Days2Heading', 'Days2Maturity']


Unnamed: 0,Plot_ID,GrainYield,Name,CodeName,Heading_Date,Maturity_Date,Days2Heading,Days2Maturity
0,1101,453.658537,Avle,,2020-06-21,2020-08-07,62,109
1,1102,439.02439,,GN10637,2020-06-21,2020-08-11,62,113
2,1103,409.756098,Runar,,2020-06-19,2020-08-04,60,106
3,1104,474.796748,Betong,GN13618,2020-06-20,2020-08-08,61,110
4,1105,411.382114,Reno,,2020-06-20,2020-08-04,61,106


['Plot_ID', 'GrainYield', 'Name', 'Line', 'Days2Heading', 'Days2Maturity']


Unnamed: 0,Plot_ID,GrainYield,Name,Line,Days2Heading,Days2Maturity
0,101,357.239871,512-21,76.0,45.0,100.0
1,102,634.385088,GN11634,1515.0,51.0,114.0
2,103,730.274361,SW51114 (Amulett),60.0,50.0,114.0
3,104,217.024221,Sumai 3 (18.),71.0,60.0,120.0
4,105,598.191762,GN14516,1625.0,49.0,112.0


['Masbasis_2019_Simps',
 'Masbasis_2020_Simps',
 'Robot_2020_Simps',
 'Staur_2019_Simps']

In [17]:
simp_df_all_dropna = []
for df in simp_df_all:
    temp_df = locals()[df].copy()
    rows = temp_df.shape[0]
#     print(temp_df.shape)
    temp_df.dropna(subset=['Blue'],inplace = True)
    print(df, 'Dropped entried', rows- temp_df.shape[0],':', rows, temp_df.shape[0])
    new_df = df+'_dropna'
    locals()[new_df] = temp_df.copy()
    simp_df_all_dropna.append(new_df)
simp_df_all_dropna

Masbasis_2019_Simps Dropped entried 0 : 528 528
Masbasis_2020_Simps Dropped entried 112 : 659 547
Robot_2020_Simps Dropped entried 0 : 96 96
Staur_2019_Simps Dropped entried 1166 : 1328 162


['Masbasis_2019_Simps_dropna',
 'Masbasis_2020_Simps_dropna',
 'Robot_2020_Simps_dropna',
 'Staur_2019_Simps_dropna']

In [18]:
Staur_2019_Simps_dropna

Unnamed: 0,Plot_ID,GrainYield,Name,Line,Days2Heading,Blue,Green,Red,RedEdge,NIR,NDVI,MTCI,DVI,GDVI,MTCI_CI,EXG,EXGR,RDVI,TDVI,GNDVI,NDRE,SCCI,EVI,TVI,VARI,GARI,GCI,GLI,NLI,MNLI,SAVI,GSAVI,OSAVI,GOSAVI,MSAVI2,MSR,GRVI,WDRVI,SR,Days2Maturity
0,101,357.239871,512-21,76.0,45.0,0.891441,2.496192,1.575735,5.080712,18.959833,31.524212,155.743364,17.384098,16.463641,155.743364,2.525209,2.815372,23.393197,29.084631,28.485757,21.693060,25.228202,-49.498249,1079.864163,13.474818,19.525651,264.543523,0.821819,27.224074,15.248713,24.765070,22.886253,24.410953,22.294432,64.510195,558.125548,301.543523,6.089392,568.335371,100.0
1,102,634.385088,GN11634,1515.0,51.0,0.454023,1.307849,0.751860,2.720009,10.658808,20.076644,101.513738,9.906949,9.350960,101.513738,1.409815,1.665061,14.088110,17.181887,18.065563,13.911963,15.852559,-28.347421,616.656489,8.919082,10.760771,175.623412,0.783027,17.248032,8.431690,14.926473,13.755619,15.137620,13.780119,38.467212,373.172348,198.623412,4.947035,379.087118,114.0
2,103,730.274361,SW51114 (Amulett),60.0,50.0,0.806496,2.120763,1.268243,4.318234,19.073734,32.396645,186.533126,17.805491,16.952971,186.533126,2.166787,2.512010,23.986655,29.755699,29.566846,23.426494,26.625393,-54.186812,1102.430259,14.024517,19.105437,308.708247,0.625270,28.516598,16.033038,25.380869,23.635102,25.047162,23.079023,65.449259,640.120100,345.708247,8.724839,649.475629,114.0
3,104,217.024221,Sumai 3 (18.),71.0,60.0,0.490968,1.467358,1.088434,3.058573,9.778403,18.514131,80.384626,8.689969,8.311045,80.384626,1.355312,1.298862,12.678980,15.299158,17.015703,12.176331,15.001603,-22.261099,536.555038,5.834481,11.367351,134.614566,-0.087690,13.890034,6.375298,13.435915,12.608908,13.799072,12.814489,35.484995,249.294871,157.614566,0.317268,256.718244,120.0
4,105,598.191762,GN14516,1625.0,49.0,1.153968,3.003463,1.714870,5.741359,20.686545,31.415006,147.870205,18.971675,17.683082,147.870205,3.138088,3.740733,24.410540,30.727245,27.689786,21.151252,24.691254,-52.009138,1189.844229,15.002430,20.082992,239.521081,0.796924,27.650799,17.250383,25.773786,23.290973,24.831837,22.139405,67.170173,522.156729,276.521081,5.190130,532.542099,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,326,443.478067,GN12650,1621.0,49.0,0.729238,2.072085,1.231619,4.296052,18.341864,32.117191,170.968478,17.110245,16.269779,170.968478,2.183312,2.531130,23.359551,28.707676,29.279533,22.691420,25.976173,-54.052238,1060.233322,13.570207,18.838420,297.021355,1.024821,27.278347,15.002476,24.677466,22.951156,24.576354,22.619913,63.700406,629.759773,334.021355,8.002199,639.369282,97.0
158,327,521.343900,SW51114 (Amulett),60.0,48.0,0.774273,2.187079,1.479116,4.404733,15.878395,30.352915,142.732910,14.399279,13.691316,142.732910,2.120769,2.237086,20.830763,25.060464,27.795799,20.563757,24.949805,-39.428246,892.275276,10.017306,17.615663,231.744897,-0.156361,22.540597,11.043946,22.007716,20.522764,22.573069,20.852797,57.687127,431.439516,268.744897,1.751372,442.881540,100.0
159,328,477.973623,Paros,10.0,48.0,0.725017,2.022605,1.397897,4.272810,16.483497,31.076720,162.185444,15.085600,14.460892,162.185444,1.922296,1.987845,21.595316,26.153974,28.704206,21.565294,25.382842,-44.188648,930.124355,11.077394,18.024138,281.652495,0.109567,24.607294,12.136921,22.843019,21.499109,23.280119,21.702623,59.621812,544.188805,318.652495,5.008174,554.799046,96.0
160,329,464.364326,GN13626,1615.0,47.0,0.574082,1.736141,0.897789,3.791359,18.034478,33.431710,183.522653,17.136689,16.298337,183.522653,2.000411,2.479647,23.893056,29.289601,30.453397,24.075199,26.611263,-56.663329,1061.735412,15.418630,17.916079,350.552765,2.055216,29.858499,15.488562,25.295600,23.531304,25.413143,23.384359,64.640221,771.893337,387.552765,12.502904,780.156593,92.0


# Temp: Exporting data to be used for model

In [19]:
temp_data = export_path+'Temp_Data/'
os.makedirs(temp_data, exist_ok=True)
for df in simp_df_all:
    locals()[df].to_csv(temp_data+df+'.csv', index=False)

# Weather Data

In [20]:
weather = pd.read_csv(weather_data)

# Converting date time to python datetime
weather['Time measured'] = pd.to_datetime(weather['Time measured'], infer_datetime_format=True)
# weather['Time measured'] = weather['Time measured'].dt.normalize()

# Removing timezone info from datetime sice other date data is without timezone info
weather['Time measured'] = pd.Series(x.replace(tzinfo=None) for x in weather['Time measured'])

weather.columns

Index(['Time measured', 'Middeltemperatur i 2m høyde (TM)',
       'Maksimum lufttemperatur i 2m høyde (TX)',
       'Minimums lufttemperatur i 2m høyde (TN)', 'Nedbør (RR)',
       'Relativ luftfuktighet i 2m', 'Relativ luftfuktighet i 2m.1',
       ' siste minuttverdi (UU)', 'Bladfuktighet i 2m høyde (BT)',
       ' 10 min glidende middel (FF2)', ' 5 sek middel', ' vindkast (FG2)',
       'Vindhastighet i 2m'],
      dtype='object')

In [22]:
# Translated the column heading using google translate

weather.columns = ['Time measured', 'Average temperature at 2m altitude (TM)',
 'Maximum air temperature at 2m altitude (TX)',
 'Minimum air temperature at 2m altitude (TN)', 'Precipitation (RR)',
 'Relative humidity in 2m', 'Relative humidity in 2m.1',
 'last minute value (UU)', 'Leaf moisture at 2m height (BT)',
 '10 min lubricant (FF2)', '5 sec medium', 'gusts of wind (FG2)',
 'Wind speed in 2m']


In [23]:
weather['Time measured']

0     2019-04-01
1     2019-04-02
2     2019-04-03
3     2019-04-04
4     2019-04-05
         ...    
540   2020-09-26
541   2020-09-27
542   2020-09-28
543   2020-09-29
544   2020-09-30
Name: Time measured, Length: 545, dtype: datetime64[ns]

In [24]:
max_min_dates = {}
for df in all_df:
    temp_df = locals()[df].copy()
    dates = [x.split('_')[1] for x in temp_df.columns if 'Blue' in x]
    df_name_temp = df.split('_')[0]+'_'+df.split('_')[1]
    sowing_date_temp = datetime.datetime.strptime(sowing_dict[df_name_temp], '%d%m%y')
    min_date_temp = min([datetime.datetime.strptime(x, '%d%m%y') for x in dates ])
    max_date_temp = max([datetime.datetime.strptime(x, '%d%m%y') for x in dates ])
    max_min_dates[df] = [sowing_date_temp, min_date_temp, max_date_temp]
max_min_dates

{'Graminor_2019_all': [datetime.datetime(2019, 4, 24, 0, 0),
  datetime.datetime(2019, 6, 6, 0, 0),
  datetime.datetime(2019, 8, 15, 0, 0)],
 'Graminor_2020_all': [datetime.datetime(2020, 4, 15, 0, 0),
  datetime.datetime(2020, 6, 18, 0, 0),
  datetime.datetime(2020, 8, 14, 0, 0)],
 'Masbasis_2019_all': [datetime.datetime(2019, 5, 19, 0, 0),
  datetime.datetime(2019, 6, 6, 0, 0),
  datetime.datetime(2019, 8, 7, 0, 0)],
 'Masbasis_2020_all_lodg': [datetime.datetime(2020, 5, 15, 0, 0),
  datetime.datetime(2020, 6, 18, 0, 0),
  datetime.datetime(2020, 8, 14, 0, 0)],
 'Robot_2020_all': [datetime.datetime(2020, 4, 20, 0, 0),
  datetime.datetime(2020, 6, 18, 0, 0),
  datetime.datetime(2020, 8, 12, 0, 0)],
 'Staur_2019_all': [datetime.datetime(2019, 6, 4, 0, 0),
  datetime.datetime(2019, 7, 24, 0, 0),
  datetime.datetime(2019, 8, 30, 0, 0)],
 'Staur_2020_all_lodg': [datetime.datetime(2020, 4, 21, 0, 0),
  datetime.datetime(2020, 6, 20, 0, 0),
  datetime.datetime(2020, 7, 31, 0, 0)]}

In [25]:
# Weather data for days from sowing date
days_delta = 15

weather_dfs = []
weathers_processed_df = []
for df, dates in max_min_dates.items():
    df_weather_temp = pd.DataFrame()

    sowing_date_temp = dates[0]
    min_date_temp = dates[1]
    max_date_temp = dates[2]
    
    # Filtering the weather date from sowing_date to max_date the data is available for
    temp_weather = weather.loc[(weather['Time measured'] >= sowing_date_temp) &\
                          (weather['Time measured'] <= sowing_date_temp + datetime.timedelta(days=days_delta))]

#     print(df, sowing_date_temp.date(), min_date_temp.date(), max_date_temp.date())
#     print(temp_weather.shape)

#     # See info to find hoe many missing values and in which column
#     display(temp_weather.info())

    # Filling themissing values with the average of the column
    # Applying Only on columns with NaN values
    for i in temp_weather.columns[temp_weather.isnull().any(axis=0)]:
        temp_weather[i].fillna(temp_weather[i].mean(),inplace=True)
    # Drop the time measures column
    temp_weather.drop(['Time measured'], axis=1, inplace=True)
    
    df_weat_temp = df.split('_')[0]+'_'+df.split('_')[1]+'_weather_all'
    locals()[df_name_temp] = temp_weather.copy()
    weather_dfs.append(df_name_temp)
    
    
    
    # Aggregating the weather data using several statistical methods
    mean_df = temp_weather.mean().to_frame().transpose().add_prefix('MEAN ')
    median_df = temp_weather.median().to_frame().transpose().add_prefix('MEDIAN ')
    # Different for mode since mode returns a df, instead of series object
    mode_df = temp_weather.mode().transpose().iloc[:,0].to_frame().transpose().add_prefix('MODE ')
    sum_df = temp_weather.sum().to_frame().transpose().add_prefix('SUM ')
    min_df = temp_weather.min().to_frame().transpose().add_prefix('MIN ')
    max_df = temp_weather.max().to_frame().transpose().add_prefix('MAX ')
    std_df = temp_weather.std().to_frame().transpose().add_prefix('STD_DEV ')
    # Reset index in quantile since it takes quantile of index as well
    quantile_25 = temp_weather.quantile(q=0.25).to_frame().transpose().add_prefix('QUANTILE_25 ').reset_index(drop=True)
    quantile_50 = temp_weather.quantile(q=0.5).to_frame().transpose().add_prefix('QUANTILE_50 ').reset_index(drop=True)
    quantile_75 = temp_weather.quantile(q=0.75).to_frame().transpose().add_prefix('QUANTILE_75 ').reset_index(drop=True)

    single_row_df = pd.concat([mean_df, median_df, mode_df, sum_df, min_df, max_df, std_df, quantile_25, quantile_50, quantile_75], axis=1)
        
#     display(single_row_df)

    df_processed_temp = df.split('_')[0]+'_'+df.split('_')[1]+'_weather_agg'
    locals()[df_processed_temp] = single_row_df.copy()
    weathers_processed_df.append(df_processed_temp)
weathers_processed_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


['Graminor_2019_weather_agg',
 'Graminor_2020_weather_agg',
 'Masbasis_2019_weather_agg',
 'Masbasis_2020_weather_agg',
 'Robot_2020_weather_agg',
 'Staur_2019_weather_agg',
 'Staur_2020_weather_agg']

## Adding weather data to the simps integrated df

In [26]:
df_to_export = []
for df in simp_df_all_dropna:
    temp_df = locals()[df].copy()
    
    field_name = df.split('_')[0]+'_'+df.split('_')[1]
    
    single_row_name = field_name+'_weather_agg'
    single_row_df = locals()[single_row_name]
    # Replicating the single_row data multiple times to make the df equal to the number of rows in the original df
    rows_df = temp_df.shape[0] 
    new_df = pd.DataFrame(np.repeat(single_row_df.values, rows_df, axis=0), columns=single_row_df.columns)
    
    pd.concat([temp_df, new_df], axis=1)
    merged_df = pd.concat([temp_df, new_df], axis=1)
    
    locals()[field_name] = merged_df.copy()
    df_to_export.append(field_name)

In [27]:
Masbasis_2019

Unnamed: 0,Plot_ID,GrainYield,Name,Line,Days2Heading,Blue,Green,Red,RedEdge,NIR,NDVI,MTCI,DVI,GDVI,MTCI_CI,EXG,EXGR,RDVI,TDVI,GNDVI,NDRE,SCCI,EVI,TVI,VARI,GARI,GCI,GLI,NLI,MNLI,SAVI,GSAVI,OSAVI,GOSAVI,MSAVI2,MSR,GRVI,WDRVI,SR,Days2Maturity,MEAN Average temperature at 2m altitude (TM),MEAN Maximum air temperature at 2m altitude (TX),MEAN Minimum air temperature at 2m altitude (TN),MEAN Precipitation (RR),MEAN Relative humidity in 2m,MEAN Relative humidity in 2m.1,MEAN last minute value (UU),MEAN Leaf moisture at 2m height (BT),MEAN 10 min lubricant (FF2),MEAN 5 sec medium,...,MAX gusts of wind (FG2),MAX Wind speed in 2m,STD_DEV Average temperature at 2m altitude (TM),STD_DEV Maximum air temperature at 2m altitude (TX),STD_DEV Minimum air temperature at 2m altitude (TN),STD_DEV Precipitation (RR),STD_DEV Relative humidity in 2m,STD_DEV Relative humidity in 2m.1,STD_DEV last minute value (UU),STD_DEV Leaf moisture at 2m height (BT),STD_DEV 10 min lubricant (FF2),STD_DEV 5 sec medium,STD_DEV gusts of wind (FG2),STD_DEV Wind speed in 2m,QUANTILE_25 Average temperature at 2m altitude (TM),QUANTILE_25 Maximum air temperature at 2m altitude (TX),QUANTILE_25 Minimum air temperature at 2m altitude (TN),QUANTILE_25 Precipitation (RR),QUANTILE_25 Relative humidity in 2m,QUANTILE_25 Relative humidity in 2m.1,QUANTILE_25 last minute value (UU),QUANTILE_25 Leaf moisture at 2m height (BT),QUANTILE_25 10 min lubricant (FF2),QUANTILE_25 5 sec medium,QUANTILE_25 gusts of wind (FG2),QUANTILE_25 Wind speed in 2m,QUANTILE_50 Average temperature at 2m altitude (TM),QUANTILE_50 Maximum air temperature at 2m altitude (TX),QUANTILE_50 Minimum air temperature at 2m altitude (TN),QUANTILE_50 Precipitation (RR),QUANTILE_50 Relative humidity in 2m,QUANTILE_50 Relative humidity in 2m.1,QUANTILE_50 last minute value (UU),QUANTILE_50 Leaf moisture at 2m height (BT),QUANTILE_50 10 min lubricant (FF2),QUANTILE_50 5 sec medium,QUANTILE_50 gusts of wind (FG2),QUANTILE_50 Wind speed in 2m,QUANTILE_75 Average temperature at 2m altitude (TM),QUANTILE_75 Maximum air temperature at 2m altitude (TX),QUANTILE_75 Minimum air temperature at 2m altitude (TN),QUANTILE_75 Precipitation (RR),QUANTILE_75 Relative humidity in 2m,QUANTILE_75 Relative humidity in 2m.1,QUANTILE_75 last minute value (UU),QUANTILE_75 Leaf moisture at 2m height (BT),QUANTILE_75 10 min lubricant (FF2),QUANTILE_75 5 sec medium,QUANTILE_75 gusts of wind (FG2),QUANTILE_75 Wind speed in 2m
0,1101,522.666667,GN12687,1574,66,0.216959,0.564718,0.689903,1.426934,2.759400,5.447633,16.383649,2.069497,2.194682,16.383649,0.222573,-0.178574,3.356982,3.801615,5.935990,2.864099,4.787632,-4.079882,119.162391,-0.667193,5.196798,35.399994,-1.366055,1.163335,0.374387,3.526603,3.784766,3.831908,4.142715,10.638450,37.723860,44.399994,-3.460391,42.165056,107,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
1,1102,388.000000,Avocet YrA,28,69,0.182509,0.511479,0.487333,1.248133,2.602399,6.192260,15.943737,2.115066,2.090920,15.943737,0.353116,0.182328,3.618459,3.978643,6.041974,3.171946,4.607380,-4.574699,127.869795,0.640767,4.110108,37.432677,-0.732920,2.119392,0.636821,3.768821,3.707001,4.215864,4.131700,10.720113,51.680407,46.432677,-2.368768,55.520962,110,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
2,1103,541.333333,GN08557,1313,70,0.189568,0.487165,0.576326,1.270945,2.649076,5.807165,17.997416,2.072750,2.161911,17.997416,0.208435,-0.111258,3.469245,3.862061,6.194040,3.154063,4.941695,-4.356682,120.798510,-0.400147,4.756754,40.076434,-1.290414,1.581081,0.491666,3.627881,3.817525,4.009409,4.245161,10.633689,45.700682,49.076434,-2.873821,49.846219,108,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
3,1104,572.000000,GN08541,1311,69,0.229794,0.575855,0.624427,1.414102,2.908922,5.872472,16.938067,2.284495,2.333067,16.938067,0.297490,-0.000853,3.661671,4.185415,6.023914,3.123900,4.794646,-4.647912,135.126811,0.046213,4.741883,36.958516,-1.100499,2.064596,0.726148,3.852318,3.944067,4.159067,4.262833,11.160792,45.941043,45.958516,-2.826057,50.037770,109,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
4,1105,542.666667,SW44431,1324,67,0.262217,0.643494,0.820634,1.565365,2.852558,5.076302,15.816345,2.031924,2.209064,15.816345,0.204137,-0.301256,3.209028,3.682072,5.690066,2.636420,4.760904,-3.699399,114.829872,-0.880642,5.586722,31.190989,-1.507757,0.746740,0.214734,3.380044,3.730383,3.621856,4.029280,10.578458,31.725288,40.190989,-3.963863,36.464299,106,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,1862,532.000000,GN12687,1574,68,0.165040,0.488308,0.434279,1.254630,2.809118,6.587422,16.932014,2.374839,2.320809,16.932014,0.377298,0.257615,3.955254,4.434978,6.322413,3.433760,4.678603,-5.488422,144.651485,1.057302,4.057210,43.279402,-0.531704,3.130523,1.040400,4.139693,4.013339,4.562506,4.401701,11.389427,65.054484,52.279402,-1.556144,68.556117,109,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
524,1863,558.666667,GN13616,1600,68,0.198336,0.560793,0.507790,1.396119,3.039426,6.443868,16.548794,2.531636,2.478633,16.548794,0.415460,0.265347,4.038766,4.643327,6.190259,3.337846,4.646435,-5.623266,154.018267,1.027413,4.335603,40.324361,-0.595297,3.190622,1.176099,4.252575,4.128783,4.578599,4.422551,11.771370,60.819620,49.324361,-1.824850,64.441742,108,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
525,1864,362.666667,SHA3/CBRD,29,70,0.175931,0.602858,0.516092,1.485383,3.022629,6.428098,14.194130,2.506537,2.419770,14.194130,0.513693,0.394023,4.012935,4.603960,6.007367,3.088743,4.290959,-5.702066,153.862856,1.501946,4.432513,37.235161,-0.234030,3.209902,1.141322,4.222584,4.020439,4.555877,4.299516,11.732094,65.704849,46.235161,-1.673069,69.321626,109,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2
526,1865,573.333333,SW51127,1325,68,0.190481,0.535340,0.402044,1.346569,3.108257,6.961186,17.114408,2.706213,2.572917,17.114408,0.478155,0.450634,4.339698,4.985903,6.363094,3.588995,4.600288,-6.369537,167.704656,1.945754,3.811427,44.944455,-0.239413,4.210974,1.525407,4.566103,4.267188,4.930346,4.558747,12.220013,76.767109,53.944455,-0.819189,79.952491,110,11.95,17.4625,6.7,5.107692,77.78125,96.7125,433.5625,2.2,15.8375,12.65,...,13.2,13.0,2.286482,3.12642,3.070722,6.328608,11.602254,5.999764,291.08028,0.843801,7.381407,0.825429,0.620618,0.518973,10.375,15.325,4.575,0.15,72.3,96.675,132.0,1.6,10.175,12.3,11.3,11.7,11.4,17.35,7.05,3.053846,80.5,99.25,504.5,2.2,15.25,12.6,11.8,12.0,12.7,19.1,8.525,9.4,86.2,99.9,673.0,2.675,22.275,13.0,11.925,12.2


# Temp: Exporting data for modeling

In [28]:
os.makedirs(temp_data, exist_ok=True)
for df in df_to_export:
    locals()[df].to_csv(export_path+df+'.csv', index=False)

# Import Genomics Data

In [31]:
genomics_data

'C:/Users/fahad/\\\\MegaSync\\NMBU\\Master Thesis\\Data\\Genomics\\\\'

## Importing Genomics Data

In [22]:
# If the dataset had Days 2 heading and days to maturity columns then create the
# following dictionary with the respective sowing dates of each field as value

for df in all_df:
    temp_df = locals()[df].copy()
    field_temp = df.split('_')[0]+'_'+df.split('_')[1]
    if 'Name' in temp_df.columns:
        print(df)
#         all_df_dates_filtered[df] = sowing_dict[field_temp]
# all_df_dates_filtered

Graminor_2019_all
Graminor_2020_all
Masbasis_2019_all
Masbasis_2020_all_lodg
Robot_2020_all
Staur_2019_all
Staur_2020_all_lodg


In [None]:
# Importing Yield data with line information

In [64]:
# Vollebekk 2019: Graminor_2019_x_19TvPhenores_x_Vollebekk_res
# Masbasis 2020: Masbasis_x_20BMLGI1_2020_tm_x_data
# Robot 2020: Robot_x_ROBOT_2020_x_raw
# Masbasis 2019: Masbasis_2019_x_Field_data_2019

In [65]:
a_file = open(main_path+'yield_df.json', "r")
output_str = a_file.read()
# The file is imported as string

# Converting it to dictionary
output_dict = json.loads(output_str)
a_file.close()

pprint(output_dict)

{'Graminor 2019': ['Graminor_2019_x_19TvPhenores_x_Vollebekk_res',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Graminor_2019\\19TvPhenores.xlsx'],
 'Masbasis 2019': ['Masbasis_2019_x_Field_data_2019',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Masbasis_2019\\Field_data_2019.xlsx'],
 'Masbasis 2020': ['Masbasis_x_20BMLGI1_2020_tm_x_data',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Vollebekke-total_2020\\Masbasis\\20BMLGI1_2020_tm.xlsx'],
 'Robot 2020': ['Robot_x_ROBOT_2020_x_raw',
                '\\MegaSync\\NMBU\\Master '
                'Thesis\\Data\\Feb2021\\Vollebekke-total_2020\\Robot\\ROBOT_2020.xlsx'],
 'Staur 2019': ['Graminor_2019_x_19TvPhenores_x_Staur_res',
                '\\MegaSync\\NMBU\\Master '
                'Thesis\\Data\\Feb2021\\Graminor_2019\\19TvPhenores.xlsx']}


### Checking number of unique cultivars in the field

In [66]:
# plots_data = pd.read_excel(files_with_address[0],engine='openpyxl')
# # Pandas converts 'NA' string to NaN. Need to change those to 
# # some string to get a count as NaNs are not counted as unique values

# plots_data.Name.fillna('-', inplace=True)
# plots_data.CodeName.fillna('-', inplace=True)

# # Creating a new column as multiple plots were named 'NA' but the 
# # CodeName was different for each one of them
# plots_data['NameCode'] = plots_data.Name+plots_data.CodeName

# plots_data
# len(plots_data.NameCode.unique())
# plots_data.NameCode.value_counts()
# # plots_data.NameCode.value_counts().sum()
# # plots_data

# ToDo: Dropping NAN

## Finding NAN values
### ToDo: Test: Raise error if missing values found

In [25]:
# Finding number of missing values in each dataframe
df_with_nan = []
missing_values = False
for df in all_df:
    if locals()[df].isna().sum().sum() > 0:
        print(f'Total missing values in {df} are {locals()[df].isna().sum().sum()}')
        missing_values = True
        df_with_nan.append(df)
#     if len(df_with_nan) > 0:
#         raise ValueError
if not missing_values:
    print('No missing value found in any dataframe')

No missing value found in any dataframe


In [26]:
Graminor_2019_all.isnull().sum().sort_values()

NameError: name 'Graminor_2019_all' is not defined

In [27]:
df_with_nan

[]

In [28]:
# Finding which column has NAN values
for df in df_with_nan:
    print(f'{df}:\n {locals()[df].shape[1]-locals()[df].dropna(axis=1).shape[1]} columns or {locals()[df].shape[0]-locals()[df].dropna().shape[0]} rows to be dropped,')

## ToDo: Automate: Drop rows with missing values in df_with_nan

In [29]:
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} Before dropping')
# Graminor_eastwest_020719_NIR_half_missing.dropna(inplace=True)
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} After dropping')


NameError: name 'Graminor_eastwest_020719_NIR_half_missing' is not defined

## ORRR

## ToDo: Droppping df with Nan from the all_df_std

In [30]:
print(f'Number of items in all_df is {len(all_df)}')

Number of items in all_df is 0


In [31]:
# for df in df_with_nan:
#     all_df.remove(df)

###  ToDo: Update field_year_dict and sorted_field_year_dict after dropping the dataset

In [32]:
print(f'Number of items in all_df now is {len(all_df)}')

Number of items in all_df now is 0


# Data Trends

## Normal Distribution of data

ToDo:  
see the distribution of data if it is normal  
else make transpose to make it normal  
dist in Gausion function   
in each field  
what if the data is normal dist?  
the use some transpose to box pox   
try diff funct to see which one iis able to make data normal  
make heat map of whole if not normal  
see which parts are not normal and exculde them  
ls_means in R to make the normalisation/transpose  
pearson corr bw yield and indices for diff dates  


In [33]:
x_labels

NameError: name 'x_labels' is not defined

### Yeo-Johnson Transformation

In [34]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        

        for field_sample, dates in sorted_field_year_dict_yield.items():
            x_labels = []
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
                x_label = date.strftime('%d-%m-%y')+':'+str(len(locals()[field_df][col]))
                
                x_labels.append(x_label)
                x_labels= list(set(x_labels))
            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='yeo-johnson', standardize=False)

            temp_arr = pt.fit_transform(temp_df)
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]
            
            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=-35)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_yeo-johnson')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### Box-Cox Transformation

In [35]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        for field_sample, dates in sorted_field_year_dict_yield.items():
            
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
            x_labels = temp_df.columns.tolist()

            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='box-cox', standardize=False)

            # Taking absolute values of the dataframe(avoiding negative values)
            temp_arr = pt.fit_transform(temp_df.abs())
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]

            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=90)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_box-cox')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### ToDo: Identify Dates and index with problems

### Ecxclude the problematic data/dates
or
### Take average values where the problematic data is

Take average of data for date 20200708 and 20200624  
Masbasis  
Cleanup  
Remove dates which have drop  

## ToDo: Remove outliers

### Find AUC for all dates of one field
See if it covers tha gaps under the dates,i.e.

Since data points are different  
Flying time is different  
Cover the gaps between the dates  

Since the data collection is not uniform throughout the year so AUC will give a single value instead of multiple values for one field year which will be representative of all the dates 

#### Option 1: Use Scipy

In [36]:
import scipy
scipy.__version__

'1.6.1'

In [37]:
from scipy import integrate
from scipy.integrate import simps

In [38]:
from scipy.integrate import simpson

In [39]:
x = np.arange(0, 10)
y = np.arange(0, 10)


In [40]:
# integrate.simpson(y, x)
integrate.simps(y, x)

40.5

In [41]:
y = np.power(x, 3)
y

array([  0,   1,   8,  27,  64, 125, 216, 343, 512, 729], dtype=int32)

In [42]:
integrate.simpson(y, x)
# integrate.simps(y, x)


1642.5

In [43]:
integrate.quad(lambda x: x**3, 0, 9)[0]

1640.25

In [44]:
integrate.simpson(y, x, even='first')
# integrate.simps(y, x, even='first')

1644.5

#### Option 2

In [45]:
data
# plot: Plot ID
# x: Number of days after sowing or actual date
# y: Value of the index


NameError: name 'data' is not defined

In [46]:
# x: Days from sowing to data collection
# May 5 2019 Masbasis and Graminor
# Robot: 

data={'plot':['1','1','2','2','3','3'],'x':['5','6','7','8','9','10'],'y':['0.9','0.8','0.7','0.6','0.5','0.4'] }

ACC=[]
A=pd.DataFrame(data, columns=['plot','x','y'])
AA=0

for item in range(len(A)-1):
    if A['plot'][item]== A['plot'][item+1]:
        Ans=(float((A['y'][item]))+float((A['y'][item+1])))*((float((A['x'][item+1]))-float((A['x'][item]))))/2
        AA+=Ans
        print(AA)
        ACC.append(AA)

0.8500000000000001
1.5
1.95


### Alternative

In [47]:
df1=Data.set_index(['Plot'])
ACC=[]

for item in Numbers_final:
    df2=df1[df1.index==item]
    df2=df2.filter(['Blue', 'Green', 'Red', 'RedEdge', 'NIR','NDVI', 'MTCI', 'EVI', 'DVI', 'RVI', 'VARI', 'EXG', 'EXGR', 'GLI', 'GNDVI', 'GVI','Time','timepoint'], axis=1)
    df2=df2.sort_values(by='timepoint')
    df3=df2.reset_index()

AA=0
for j in range(0,3):
    Ans=(float((df3['GVI'][j]))+float((df3['GVI'][j+1])))*((float((df3['timepoint'][j+1]))-float((df3['timepoint'][j]))))/2
    AA+=Ans

    print(AA)
    ACC.append(AA)



DA=pd.DataFrame(ACC)
DD=pd.DataFrame(Numbers_final)
DDA=pd.concat([DD, DA], axis=1)
DDA.to_excel('Staur_Accumulative_GVI_2019.xlsx')

NameError: name 'Data' is not defined

### Time series data vs the AUC

# ToDo: Model Training


Make model for one year at a time and try to predict yield of another field  

TODO: Train on Masbasis 2019 an 2020  
Test on Staur  

Use data until august for yield prediction since it is most relavant  
Use all data for predicting date to maturity  

Data Collection:  
Data collection usually starts after heading  
2019 has the data before hading as well. To use that, dont use dates before heading  

NDVI is resistant to shadows  

DAT390 Report: Do the report with Robot Data only  

TODO: Use AUC for each index for prediction  

TODO:   
Time series data vs the AUC  