# 2. Renaming and merging

Note: Half datasets, with separate files for east and west subplots have been merged manually in excel

In [32]:
%%time

import pandas as pd
import numpy as np
import math
import os
import datetime
from copy import copy
from pprint import pprint
import json

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# To display df nicely in loops
from IPython.display import display 
# display(df1.head()) 
# display(df2.head())

# Display rows and columns Pandas
pd.options.display.max_columns = 100
pd.set_option('display.max_rows',100)

Wall time: 0 ns


In [33]:
# Prints the current working directory
os.getcwd()
# os.listdir()

'C:\\Users\\fahad\\MegaSync\\NMBU\\GitHub\\vPheno'

## Finding Username folder to make general path for multi PC use

In [34]:
username = str(os.getcwd()).split('\\')[2]
user_path = r'C:/Users/'+username+'/'
username, user_path

('fahad', 'C:/Users/fahad/')

## Importing Data

In [35]:
main_path = r'./Data/'
path = r'./Data/renamed_merged/'
export_path = './Data/results/'

# Create export_path folder if not exists already
os.makedirs(path, exist_ok=True)
os.makedirs(export_path, exist_ok=True)

os.listdir(path)

['Graminor_2019_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all.csv',
 'Robot_2020_all.csv']

## Data Preparation
### Creating list of complete files

In [36]:
# Get the list of all files in directory tree at given path

files_with_address = []
files_list = []

for (dirpath, dirnames, filenames) in os.walk(path):
    files_with_address += [os.path.join(dirpath, file) for file in filenames]
    files_list.extend(filenames)
    
print(len(files_with_address), 'files found in the directory')
# files_with_address
files_list

4 files found in the directory


['Graminor_2019_all.csv',
 'Masbasis_2019_all.csv',
 'Masbasis_2020_all.csv',
 'Robot_2020_all.csv']

## Data Checking/control

### Check for duplicate filenames

In [37]:
print('Total number of files are :', len(files_list))

print('Number of unique file names are:', len(set(files_list)))

print('There is/are', len(files_list) - len(set(files_list)),'duplicate file name/names.')
if len(files_list) - len(set(files_list)) > 0:
    raise NameError

Total number of files are : 4
Number of unique file names are: 4
There is/are 0 duplicate file name/names.


## Importing Genomics Data

### Checking number of unique cultivars in the field

In [38]:
# Vollebekk 2019: Graminor_2019_x_19TvPhenores_x_Vollebekk_res
# Masbasis 2020: Masbasis_x_20BMLGI1_2020_tm_x_data
# Robot 2020: Robot_x_ROBOT_2020_x_raw
# Masbasis 2019: Masbasis_2019_x_Field_data_2019

In [39]:
a_file = open(main_path+'yield_df.json', "r")
output_str = a_file.read()
# The file is imported as string

# Converting it to dictionary
output_dict = json.loads(output_str)
a_file.close()

pprint(output_dict)

{'Graminor 2019': ['Graminor_2019_x_19TvPhenores_x_Vollebekk_res',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Graminor_2019\\19TvPhenores.xlsx'],
 'Masbasis 2019': ['Masbasis_2019_x_Field_data_2019',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Masbasis_2019\\Field_data_2019.xlsx'],
 'Masbasis 2020': ['Masbasis_x_20BMLGI1_2020_tm_x_data',
                   '\\MegaSync\\NMBU\\Master '
                   'Thesis\\Data\\Feb2021\\Vollebekke-total_2020\\Masbasis\\20BMLGI1_2020_tm.xlsx'],
 'Robot 2020': ['Robot_x_ROBOT_2020_x_raw',
                '\\MegaSync\\NMBU\\Master '
                'Thesis\\Data\\Feb2021\\Vollebekke-total_2020\\Robot\\ROBOT_2020.xlsx'],
 'Staur 2019': ['Graminor_2019_x_19TvPhenores_x_Staur_res',
                '\\MegaSync\\NMBU\\Master '
                'Thesis\\Data\\Feb2021\\Graminor_2019\\19TvPhenores.xlsx']}


In [40]:
# plots_data = pd.read_excel(files_with_address[0],engine='openpyxl')
# # Pandas converts 'NA' string to NaN. Need to change those to 
# # some string to get a count as NaNs are not counted as unique values

# plots_data.Name.fillna('-', inplace=True)
# plots_data.CodeName.fillna('-', inplace=True)

# # Creating a new column as multiple plots were named 'NA' but the 
# # CodeName was different for each one of them
# plots_data['NameCode'] = plots_data.Name+plots_data.CodeName

# plots_data
# len(plots_data.NameCode.unique())
# plots_data.NameCode.value_counts()
# # plots_data.NameCode.value_counts().sum()
# # plots_data

## Importing data files to Pandas

In [41]:
   
%%time

all_df = []
for data in files_with_address:
    file_name = os.path.splitext(os.path.basename(data))[0]

    # Replce all invalid characters in the name
    file_name = file_name.replace(" ", "_")
    file_name = file_name.replace("-", "_")
    file_name = file_name.replace(")", "")
    file_name = file_name.replace("(", "")
    df_name = file_name.replace(".", "")
    # Test: Check if the same date is already present in the current dict key
    if df_name in all_df:
        print(f'A file with the same name {df_name} has already been imported. \n Please check if there is duplication of data.')
        raise NameError
    all_df.append(df_name)

    locals()[df_name] = pd.read_csv(data, index_col=False)
    print(df_name, '=====', locals()[df_name].shape)
# all_df

Graminor_2019_all ===== (601, 376)
Masbasis_2019_all ===== (528, 274)
Masbasis_2020_all ===== (688, 410)
Robot_2020_all ===== (96, 478)
Wall time: 356 ms


In [42]:
print(f'Total imported {len(all_df)}')
all_df

Total imported 4


['Graminor_2019_all',
 'Masbasis_2019_all',
 'Masbasis_2020_all',
 'Robot_2020_all']

# Simpsons Integration

## Listing down the important dates for each field

In [43]:
# Sowing date of the field
sowing = '200420'
maturity = '310720'
heading = '170620'

sowing_date = datetime.datetime.strptime(sowing, '%d%m%y').date()
maturity_date = datetime.datetime.strptime(maturity, '%d%m%y').date()
heading_date =datetime.datetime.strptime(heading, '%d%m%y').date()

In [44]:
# ToDo: Add check for duplicate columns in the df

general_col_names = ['Plot_ID', 'Blue', 'Green', 'Red', 'RedEdge', 'NIR']

# list_agg_df
yield_cols = ['GrainYield', 'Days2Heading', 'Days2Maturity']
id_cols_new = ['Plot_ID']

# Counter for location of column in columns list

# Dict for saving the name and location of the yield column/s
loc_yield_cols = {}
for df in all_df:
    loc = 0
    for cols in locals()[df].columns.tolist():
        for y_col in yield_cols:
            if not cols.find(y_col):
                loc_yield_cols[cols] = loc
                print(f'\"{cols}\" column in {df} is the yield column\n as it contains the text \"{y_col}\". It is located at location {loc}')
        loc += 1

    yield_cols_found = list(loc_yield_cols.keys())
    target_cols=yield_cols_found[0]
loc_yield_cols

"GrainYield" column in Graminor_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"GrainYield" column in Masbasis_2019_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"GrainYield" column in Masbasis_2020_all is the yield column
 as it contains the text "GrainYield". It is located at location 6
"GrainYield" column in Robot_2020_all is the yield column
 as it contains the text "GrainYield". It is located at location 6


{'GrainYield': 6}

In [50]:
id_cols_new+yield_cols_found

['Plot_ID', 'GrainYield']

In [47]:
df_aoc = df[id_cols_new+yield_cols_found]
df_aoc

TypeError: string indices must be integers

In [46]:
df_aoc = df[id_cols_new+yield_cols_found]
# Renaming Grain_Yield Column #custom
df_aoc.columns = [df_aoc.columns[0]]+['GrainYield']
df_aoc.head(2)

TypeError: string indices must be integers

In [45]:
Graminor_2019_all

Unnamed: 0,Plot_ID,Blue_060619,Green_060619,Red_060619,RedEdge_060619,NIR_060619,GrainYield,NDVI_060619,MTCI_060619,DVI_060619,GDVI_060619,MTCI_CI_060619,EXG_060619,EXGR_060619,RDVI_060619,TDVI_060619,GNDVI_060619,NDRE_060619,SCCI_060619,EVI_060619,TVI_060619,VARI_060619,GARI_060619,GCI_060619,GLI_060619,NLI_060619,MNLI_060619,SAVI_060619,GSAVI_060619,OSAVI_060619,GOSAVI_060619,MSAVI2_060619,MSR_060619,GRVI_060619,WDRVI_060619,SR_060619,Blue_110619,Green_110619,Red_110619,RedEdge_110619,NIR_110619,NDVI_110619,MTCI_110619,DVI_110619,GDVI_110619,MTCI_CI_110619,EXG_110619,EXGR_110619,RDVI_110619,TDVI_110619,...,TVI_070819,VARI_070819,GARI_070819,GCI_070819,GLI_070819,NLI_070819,MNLI_070819,SAVI_070819,GSAVI_070819,OSAVI_070819,GOSAVI_070819,MSAVI2_070819,MSR_070819,GRVI_070819,WDRVI_070819,SR_070819,Blue_150819,Green_150819,Red_150819,RedEdge_150819,NIR_150819,NDVI_150819,MTCI_150819,DVI_150819,GDVI_150819,MTCI_CI_150819,EXG_150819,EXGR_150819,RDVI_150819,TDVI_150819,GNDVI_150819,NDRE_150819,SCCI_150819,EVI_150819,TVI_150819,VARI_150819,GARI_150819,GCI_150819,GLI_150819,NLI_150819,MNLI_150819,SAVI_150819,GSAVI_150819,OSAVI_150819,GOSAVI_150819,MSAVI2_150819,MSR_150819,GRVI_150819,WDRVI_150819,SR_150819
0,101,0.062427,0.129283,0.129278,0.185566,0.407937,499.624440,0.518710,3.950610,0.278659,0.278654,3.950610,0.066861,0.015155,0.380188,0.468589,0.518696,0.374675,0.722322,-0.379451,16.719722,0.000025,0.603848,2.155378,-0.138632,0.125585,0.070004,0.402991,0.402982,0.399674,0.399664,1.299123,2.592551,3.155378,-0.520277,3.155496,0.079597,0.157513,0.168775,0.220306,0.404954,0.411656,3.583244,0.236179,0.247441,3.583244,0.066653,-0.012119,0.311809,0.388215,...,12.397675,-0.448297,0.347059,9.475924,-0.379698,0.230615,0.062710,0.407691,0.450596,0.471129,0.529443,1.144776,5.724268,10.475924,-0.240062,6.128223,0.028785,0.058441,0.117516,0.140037,0.219810,0.303250,3.542136,0.102294,0.161369,3.542136,-0.029419,-0.135500,0.176127,0.188044,0.579940,0.221686,0.731033,-0.150329,3.774674,-0.401399,0.854320,2.761220,-0.333835,-0.417284,-0.155894,0.183252,0.311022,0.205689,0.368211,0.879676,1.139292,3.761220,-0.684853,1.870472
1,102,0.061705,0.129847,0.127858,0.184991,0.404994,515.532751,0.520099,3.850745,0.277136,0.275147,3.850745,0.070131,0.020976,0.379655,0.467148,0.514447,0.372896,0.716972,-0.379650,16.707685,0.010147,0.597877,2.119012,-0.132922,0.123894,0.068499,0.402481,0.398825,0.399993,0.395986,1.295456,2.605650,3.119012,-0.518888,3.167525,0.075853,0.152404,0.163355,0.220335,0.405466,0.425636,3.249073,0.242111,0.253062,3.249073,0.065600,-0.010693,0.321016,0.399166,...,11.737341,-0.482118,0.363954,9.827102,-0.392431,0.193261,0.050652,0.394101,0.441762,0.459487,0.524948,1.118907,5.473275,10.827102,-0.259012,5.885476,0.025709,0.051825,0.102399,0.122849,0.201019,0.325028,3.822563,0.098619,0.149194,3.822563,-0.024459,-0.115994,0.179037,0.184507,0.590065,0.241364,0.742593,-0.153498,3.894176,-0.393533,0.795194,2.878825,-0.329154,-0.434086,-0.144656,0.184125,0.297261,0.212809,0.361382,0.859646,1.249363,3.878825,-0.671809,1.963087
2,103,0.063729,0.132427,0.132313,0.194414,0.430182,529.501025,0.529549,3.796499,0.297869,0.297755,3.796499,0.068813,0.016002,0.397160,0.494205,0.529238,0.377472,0.712817,-0.404347,17.876711,0.000569,0.621582,2.248436,-0.138024,0.166189,0.096792,0.420523,0.420316,0.412278,0.412055,1.341230,2.696649,3.248436,-0.509292,3.251243,0.075147,0.149900,0.164103,0.221534,0.412597,0.430889,3.326803,0.248494,0.262697,3.326803,0.060550,-0.019294,0.327221,0.408071,...,12.233252,-0.350342,0.323931,9.840122,-0.338834,0.269818,0.065369,0.406650,0.437229,0.479428,0.521912,1.128248,6.655479,10.840122,-0.174221,7.032567,0.024099,0.047830,0.084905,0.113178,0.184771,0.370319,2.532189,0.099866,0.136941,2.532189,-0.013344,-0.084380,0.192308,0.190392,0.588738,0.240286,0.648862,-0.165847,4.508981,-0.341276,0.684723,2.863084,-0.298898,-0.426430,-0.123007,0.194627,0.280387,0.232422,0.348805,0.850718,1.498339,3.863084,-0.642547,2.176214
3,104,0.062525,0.127990,0.130862,0.186196,0.397415,544.503985,0.504571,3.817141,0.266553,0.269425,3.817141,0.062594,0.007378,0.366736,0.450186,0.512794,0.361917,0.717277,-0.358909,15.878329,-0.014626,0.606708,2.105043,-0.145530,0.093756,0.051490,0.388835,0.394124,0.387276,0.393088,1.273057,2.463077,3.105043,-0.534106,3.036908,0.073349,0.151789,0.159115,0.224403,0.416914,0.447545,2.948651,0.257799,0.265125,2.948651,0.071114,0.000143,0.339671,0.423709,...,10.578978,-0.517790,0.359574,10.759718,-0.408050,0.147209,0.033778,0.370224,0.418171,0.443765,0.511705,1.066306,5.499631,11.759718,-0.256996,5.910944,0.022502,0.044899,0.085384,0.108823,0.182668,0.362930,3.150474,0.097284,0.137769,3.150474,-0.018088,-0.092726,0.187902,0.185513,0.605400,0.253334,0.698026,-0.162313,4.217642,-0.375622,0.714421,3.068425,-0.318624,-0.438027,-0.126100,0.189995,0.284034,0.227272,0.355472,0.844311,1.455684,4.068425,-0.647532,2.139371
4,105,0.056797,0.127818,0.129993,0.191037,0.466161,529.501025,0.563895,4.506941,0.336168,0.338343,4.506941,0.068845,0.014673,0.435389,0.547809,0.569622,0.418631,0.742392,-0.483063,20.083072,-0.010822,0.672267,2.647081,-0.133293,0.251406,0.154573,0.460020,0.463917,0.444576,0.448744,1.421312,3.057981,3.647081,-0.472098,3.586051,0.067687,0.141492,0.153007,0.224422,0.422184,0.467978,2.769218,0.269177,0.280692,2.769218,0.062289,-0.010430,0.354921,0.442857,...,10.081904,-0.532498,0.353657,10.949825,-0.414830,0.121620,0.026395,0.358928,0.406717,0.435354,0.503935,1.043241,5.449990,11.949825,-0.260797,5.862981,0.022015,0.043172,0.082700,0.103487,0.172024,0.350668,3.297066,0.089323,0.128851,3.297066,-0.018371,-0.090978,0.176983,0.171229,0.598762,0.248761,0.709392,-0.149943,3.778293,-0.380597,0.711413,2.984571,-0.322113,-0.472945,-0.130105,0.177529,0.270243,0.215380,0.343424,0.821583,1.386726,3.984571,-0.655617,2.080087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596,872,0.031274,0.095103,0.045338,0.175002,0.519788,469.682277,0.839548,2.659075,0.474451,0.424686,2.659075,0.113593,0.145222,0.631129,0.788071,0.690669,0.496245,0.591086,-1.201996,30.457618,0.455862,0.487476,4.465558,0.069299,0.712613,0.413557,0.668161,0.571382,0.654300,0.548059,1.737803,11.169425,5.465558,0.068240,11.464762,0.030750,0.107035,0.038412,0.199857,0.597318,0.879156,2.461898,0.558906,0.490284,2.461898,0.144907,0.198165,0.700975,0.886074,...,13.769518,-0.130674,0.286580,9.704081,-0.261456,0.410410,0.099399,0.441445,0.451436,0.517174,0.530948,1.183358,8.850674,10.704081,-0.042714,9.180710,0.027015,0.064249,0.042983,0.150733,0.305187,0.753093,1.433444,0.262204,0.240938,1.433444,0.058501,0.062575,0.444369,0.493129,0.652176,0.338774,0.449844,-0.567385,16.582922,0.265110,0.341564,3.750038,-0.028958,0.368466,0.118271,0.463712,0.415679,0.515977,0.455083,1.258355,6.724913,4.750038,-0.169577,7.100201
597,873,0.026577,0.091572,0.036885,0.180711,0.568449,594.682657,0.878133,2.695898,0.531564,0.476878,2.695898,0.119680,0.159613,0.683216,0.859791,0.722519,0.517565,0.589392,-1.559402,34.081314,0.536777,0.520950,5.207708,0.113983,0.795094,0.499261,0.721362,0.616641,0.694551,0.581544,1.856800,15.156602,6.207708,0.212950,15.411332,0.032683,0.111978,0.040707,0.201914,0.612264,0.875316,2.545487,0.571557,0.500286,2.545487,0.150567,0.205555,0.707314,0.895991,...,11.892889,-0.271709,0.293344,10.335547,-0.306838,0.305953,0.065132,0.401432,0.422137,0.483713,0.513174,1.105566,7.745936,11.335547,-0.105134,8.097357,0.027351,0.064749,0.045209,0.148332,0.298534,0.736961,1.456531,0.253325,0.233785,1.456531,0.056937,0.058393,0.432077,0.477102,0.643536,0.336123,0.456094,-0.537684,15.981078,0.236539,0.347710,3.610663,-0.038659,0.326905,0.103842,0.450359,0.406215,0.502885,0.446767,1.235119,6.214283,4.610663,-0.204570,6.603431
598,874,0.028344,0.095014,0.039843,0.181375,0.544136,528.060114,0.863547,2.563096,0.504293,0.449121,2.563096,0.121842,0.161077,0.659910,0.827352,0.702685,0.500007,0.579016,-1.389237,32.464444,0.517978,0.499254,4.726871,0.103897,0.762789,0.459803,0.697836,0.591390,0.677833,0.561999,1.802556,13.386467,5.726871,0.154587,13.657063,0.031505,0.109895,0.039021,0.201211,0.618383,0.881289,2.572103,0.579362,0.508488,2.572103,0.149264,0.204530,0.714553,0.905343,...,13.046540,-0.091439,0.259371,11.434364,-0.247496,0.457281,0.093933,0.430935,0.436508,0.518777,0.526710,1.148552,10.868752,12.434364,0.055177,11.167987,0.028029,0.063449,0.046859,0.139275,0.267183,0.701574,1.384058,0.220324,0.203734,1.384058,0.052010,0.049857,0.393158,0.420312,0.616194,0.314690,0.448549,-0.449938,13.883026,0.201631,0.332564,3.210971,-0.056688,0.207428,0.059509,0.405981,0.367913,0.464777,0.415247,1.149759,5.283039,4.210971,-0.273737,5.701826
599,875,0.031130,0.109043,0.040605,0.211467,0.630192,489.629146,0.878935,2.450667,0.589587,0.521149,2.450667,0.146350,0.198546,0.719867,0.913264,0.704985,0.497500,0.566026,-1.740399,38.112697,0.577447,0.559046,4.779312,0.128727,0.814482,0.570308,0.755366,0.630812,0.709664,0.579547,1.947078,15.266213,5.779312,0.216302,15.520049,0.076674,0.159097,0.151785,0.204902,0.391301,0.441028,3.509231,0.239516,0.232204,3.509231,0.089734,0.036332,0.325013,0.400456,...,11.895290,-0.198792,0.269476,10.677534,-0.281022,0.350110,0.069277,0.402945,0.416455,0.490766,0.510215,1.100536,8.842800,11.677534,-0.043135,9.172976,0.024904,0.060438,0.040716,0.136301,0.288011,0.752280,1.587168,0.247294,0.227572,1.587168,0.055256,0.058692,0.431317,0.469710,0.653100,0.357543,0.475279,-0.540856,15.626556,0.258652,0.331594,3.765354,-0.027786,0.341516,0.101578,0.447604,0.402332,0.505997,0.447581,1.220533,6.697631,4.765354,-0.171398,7.073623


# Import Genomics Data

# ToDo: Dropping NAN

## Finding NAN values
### ToDo: Test: Raise error if missing values found

In [25]:
# Finding number of missing values in each dataframe
df_with_nan = []
missing_values = False
for df in all_df:
    if locals()[df].isna().sum().sum() > 0:
        print(f'Total missing values in {df} are {locals()[df].isna().sum().sum()}')
        missing_values = True
        df_with_nan.append(df)
#     if len(df_with_nan) > 0:
#         raise ValueError
if not missing_values:
    print('No missing value found in any dataframe')

No missing value found in any dataframe


In [26]:
Graminor_2019_all.isnull().sum().sort_values()

NameError: name 'Graminor_2019_all' is not defined

In [27]:
df_with_nan

[]

In [28]:
# Finding which column has NAN values
for df in df_with_nan:
    print(f'{df}:\n {locals()[df].shape[1]-locals()[df].dropna(axis=1).shape[1]} columns or {locals()[df].shape[0]-locals()[df].dropna().shape[0]} rows to be dropped,')

## ToDo: Automate: Drop rows with missing values in df_with_nan

In [29]:
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} Before dropping')
# Graminor_eastwest_020719_NIR_half_missing.dropna(inplace=True)
print(f'{Graminor_eastwest_020719_NIR_half_missing.shape} After dropping')


NameError: name 'Graminor_eastwest_020719_NIR_half_missing' is not defined

## ORRR

## ToDo: Droppping df with Nan from the all_df_std

In [30]:
print(f'Number of items in all_df is {len(all_df)}')

Number of items in all_df is 0


In [31]:
# for df in df_with_nan:
#     all_df.remove(df)

###  ToDo: Update field_year_dict and sorted_field_year_dict after dropping the dataset

In [32]:
print(f'Number of items in all_df now is {len(all_df)}')

Number of items in all_df now is 0


# Data Trends

## Normal Distribution of data

ToDo:  
see the distribution of data if it is normal  
else make transpose to make it normal  
dist in Gausion function   
in each field  
what if the data is normal dist?  
the use some transpose to box pox   
try diff funct to see which one iis able to make data normal  
make heat map of whole if not normal  
see which parts are not normal and exculde them  
ls_means in R to make the normalisation/transpose  
pearson corr bw yield and indices for diff dates  


In [33]:
x_labels

NameError: name 'x_labels' is not defined

### Yeo-Johnson Transformation

In [34]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        

        for field_sample, dates in sorted_field_year_dict_yield.items():
            x_labels = []
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
                x_label = date.strftime('%d-%m-%y')+':'+str(len(locals()[field_df][col]))
                
                x_labels.append(x_label)
                x_labels= list(set(x_labels))
            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='yeo-johnson', standardize=False)

            temp_arr = pt.fit_transform(temp_df)
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]
            
            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=-35)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_yeo-johnson')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### Box-Cox Transformation

In [35]:
col_for_plotting = ['Blue', 'Green', 'Red', 'RedEdge', 'NIR', 'NDVI', 'MTCI', 'EVI']

from sklearn.preprocessing import PowerTransformer, normalize, StandardScaler
data_agg_list = ['_median_indices']

# col_for_plotting = ['Blue']
# col_for_plotting = ['Green']
# col_for_plotting = ['Red']

for d_type in data_agg_list:
    for col in col_for_plotting:
        fields = len(field_year_dict_yield.keys())
        rows = math.ceil(fields/2)
        
        fig, ax = plt.subplots(rows,2, figsize=(15,10))
        plots = ax.flatten()
        n = 0
        # TODO: Fix the x ticks
        for field_sample, dates in sorted_field_year_dict_yield.items():
            
            # Adding required data to a temp dataframe
            temp_df = pd.DataFrame()
            for date in dates:
                date_str = date.strftime('%d%m%y')
                field_df = field_sample[:-5]+'_'+date_str+d_type
                temp_df[date] = locals()[field_df][col]
            x_labels = temp_df.columns.tolist()

            # Transform the df
#             pt = PowerTransformer(method='box-cox', standardize=False)
            pt = PowerTransformer(method='box-cox', standardize=False)

            # Taking absolute values of the dataframe(avoiding negative values)
            temp_arr = pt.fit_transform(temp_df.abs())
            temp_df = pd.DataFrame(temp_arr)
            
            # Adding field plot to the subplots
            num_of_fields = len(field_year_dict_yield.keys())
            
            text = "Grain Yield"
            ax_n = plots[n]

            temp_df.boxplot(ax=ax_n)
            ax_n.set_xticklabels(x_labels, rotation=90)
            ax_n.set_title(field_sample+'_'+col+d_type[:-5]+'_box-cox')
            
#             # Printing the grain yield in plot of the fiels_sample for reference
#             ax_n.text(0.85, 1.05, text, ha='center', va='top', weight='bold', color='blue', transform=ax_n.transAxes)
            n+=1
        plt.tight_layout()



NameError: name 'field_year_dict_yield' is not defined

### ToDo: Identify Dates and index with problems

### Ecxclude the problematic data/dates
or
### Take average values where the problematic data is

Take average of data for date 20200708 and 20200624  
Masbasis  
Cleanup  
Remove dates which have drop  

## ToDo: Remove outliers

### Find AUC for all dates of one field
See if it covers tha gaps under the dates,i.e.

Since data points are different  
Flying time is different  
Cover the gaps between the dates  

Since the data collection is not uniform throughout the year so AUC will give a single value instead of multiple values for one field year which will be representative of all the dates 

#### Option 1: Use Scipy

In [36]:
import scipy
scipy.__version__

'1.6.1'

In [37]:
from scipy import integrate
from scipy.integrate import simps

In [38]:
from scipy.integrate import simpson

In [39]:
x = np.arange(0, 10)
y = np.arange(0, 10)


In [40]:
# integrate.simpson(y, x)
integrate.simps(y, x)

40.5

In [41]:
y = np.power(x, 3)
y

array([  0,   1,   8,  27,  64, 125, 216, 343, 512, 729], dtype=int32)

In [42]:
integrate.simpson(y, x)
# integrate.simps(y, x)


1642.5

In [43]:
integrate.quad(lambda x: x**3, 0, 9)[0]

1640.25

In [44]:
integrate.simpson(y, x, even='first')
# integrate.simps(y, x, even='first')

1644.5

#### Option 2

In [45]:
data
# plot: Plot ID
# x: Number of days after sowing or actual date
# y: Value of the index


NameError: name 'data' is not defined

In [46]:
# x: Days from sowing to data collection
# May 5 2019 Masbasis and Graminor
# Robot: 

data={'plot':['1','1','2','2','3','3'],'x':['5','6','7','8','9','10'],'y':['0.9','0.8','0.7','0.6','0.5','0.4'] }

ACC=[]
A=pd.DataFrame(data, columns=['plot','x','y'])
AA=0

for item in range(len(A)-1):
    if A['plot'][item]== A['plot'][item+1]:
        Ans=(float((A['y'][item]))+float((A['y'][item+1])))*((float((A['x'][item+1]))-float((A['x'][item]))))/2
        AA+=Ans
        print(AA)
        ACC.append(AA)

0.8500000000000001
1.5
1.95


### Alternative

In [47]:
df1=Data.set_index(['Plot'])
ACC=[]

for item in Numbers_final:
    df2=df1[df1.index==item]
    df2=df2.filter(['Blue', 'Green', 'Red', 'RedEdge', 'NIR','NDVI', 'MTCI', 'EVI', 'DVI', 'RVI', 'VARI', 'EXG', 'EXGR', 'GLI', 'GNDVI', 'GVI','Time','timepoint'], axis=1)
    df2=df2.sort_values(by='timepoint')
    df3=df2.reset_index()

AA=0
for j in range(0,3):
    Ans=(float((df3['GVI'][j]))+float((df3['GVI'][j+1])))*((float((df3['timepoint'][j+1]))-float((df3['timepoint'][j]))))/2
    AA+=Ans

    print(AA)
    ACC.append(AA)



DA=pd.DataFrame(ACC)
DD=pd.DataFrame(Numbers_final)
DDA=pd.concat([DD, DA], axis=1)
DDA.to_excel('Staur_Accumulative_GVI_2019.xlsx')

NameError: name 'Data' is not defined

### Time series data vs the AUC

# ToDo: Model Training


Make model for one year at a time and try to predict yield of another field  

TODO: Train on Masbasis 2019 an 2020  
Test on Staur  

Use data until august for yield prediction since it is most relavant  
Use all data for predicting date to maturity  

Data Collection:  
Data collection usually starts after heading  
2019 has the data before hading as well. To use that, dont use dates before heading  

NDVI is resistant to shadows  

DAT390 Report: Do the report with Robot Data only  

TODO: Use AUC for each index for prediction  

TODO:   
Time series data vs the AUC  