# Process Growth Phases 

This notebook processes the SQLite databases generated by APSIM X (Next Gen) for 109 wheat varieties and 10 different sowing dates at 57,434 locations/sites across Australia.  
Each database file covers exactly one (1) site.

In [1]:
#Import the required libraries
import sys
import os
import datetime
import sqlite3
import numpy as np
import pandas as pd

In [2]:
# define the working directories (all of them live under the same project root)
_project_root = "/OSM/CBR/AG_WHEATTEMP"

# where the APSIM '.db' files are read from
apsim_sourcedir = _project_root + "/source"
# where the generated output files are written to
apsim_outfiledir = _project_root + "/work/output"
# where the APSIM weather ('.met') files are read from
metfile_sourcedir = _project_root + "/work/ApsimNG-test/APSIM_run/met"


In [3]:
# list every '.db' file in the source directory (full path), in sorted order
db_paths = [apsim_sourcedir + '/' + entry
            for entry in os.listdir(apsim_sourcedir)
            if entry.endswith('.db')]
db_paths.sort()
dbfile_df = pd.DataFrame(db_paths, columns=['filename'])
print(dbfile_df.head())

# the first database file is the one processed by the rest of the notebook
dbname = dbfile_df['filename'][0]
print(dbname)

                                       filename
0  /OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db
1  /OSM/CBR/AG_WHEATTEMP/source/113.70-28.45.db
2  /OSM/CBR/AG_WHEATTEMP/source/113.70-28.50.db
3  /OSM/CBR/AG_WHEATTEMP/source/113.75-28.45.db
4  /OSM/CBR/AG_WHEATTEMP/source/113.75-28.50.db
/OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db


In [4]:
def calc_day_length(lat, doy, radn):
    '''
    Calculates the day length and the fraction of diffused radiation, based on
    the latitude, the day of the year and the daily radiation.

    Parameters:
        lat  : latitude in decimal degrees (negative values are south; 0 is valid)
        doy  : day of the year, 1 to 366
        radn : daily radiation; it is multiplied by 1,000,000 before use
               (presumably MJ/m2/day converted to J/m2/day - confirm)

    Returns:
        a tuple (daylength, fracDiffusedRadn), where daylength is in hours and
        fracDiffusedRadn = 1.32 - 1.6 * Ta (Ta being the ratio of radn to the
        radiation integral computed below).  The fraction is not clamped.

    Raises:
        ValueError : when doy is outside 1..366, or (from math.asin) when the
                     latitude/day combination has no sunrise or sunset.

    sample usage:
    >>> round(calc_day_length(-27.55, 1, 20.0)[0], 2)
    13.71
    >>> round(calc_day_length(-27.55, 2, 20.0)[0], 2)
    13.7
    >>> round(calc_day_length(-27.55, 3, 20.0)[0], 2)
    13.69
    '''

    import math

    if (doy <= 0 or doy > 366):
        raise ValueError("The day of year (doy) must be a valid day of the year, between 1 and 366.")

    #latitude (lambda) in radians
    radians = math.pi/180
    latlambda = lat * radians
    radnJ = radn * 1000000

    sinLAT = math.sin(latlambda)
    cosLAT = math.cos(latlambda)
    # sine of the maximum declination (23.45 degrees); the sine is required here,
    # using the angle itself overstates the day length (13.77 instead of 13.71 above)
    sinDMC = math.sin(radians * 23.45)
    sinDEC = -sinDMC * math.cos(2 * math.pi * (doy + 10) / 365)
    cosDEC = math.sqrt(1 - (sinDEC * sinDEC))
    a = sinLAT * sinDEC
    b = cosLAT * cosDEC

    daylength = 12 * (1 + (2 / math.pi) * math.asin(a/b))

    # SC and sinINT feed the denominator of Ta below
    SC = 1367 * (1 + 0.033 * math.cos(2 * math.pi * (doy - 10) / 365))
    sinINT = a * daylength + (24 * b / math.pi) * math.cos((math.pi / 2) * ((daylength / 12) - 1))

    Ta = radnJ / (sinINT * 3600 * SC)
    fracDiffusedRadn = Ta * -1.6 +1.32

    return daylength, fracDiffusedRadn

In [5]:
def read_ApsimWeather(filename, lat):
    '''
    Reads an apsim weather ('.met') file, removes the header information,
    calculates and adds a date column (based on year and day), the
    average temperature (based on maxt and mint), and the derived radiation,
    day length and evapotranspiration columns.

    Parameters:
        filename : full path of the '.met' file
        lat      : latitude in decimal degrees; a number or numeric text
                   (e.g. '-28.30') is accepted

    Returns:
        a dataframe with the columns: year, dayofYear, runDate, dayLength,
        maxTemp, minTemp, avgTemp, rain, radiation, PARIO, fracDiffusedRadn, ETpt
    '''
    # the latitude may be supplied as text taken from a file name
    lat = float(lat)

    # find the column heading line (it starts with 'year');
    # lineNo ends up as the 1-based number of that line
    lineNo = 0
    with open(filename, "r") as f:
        for line in f:
            lineNo = lineNo + 1
            if line.startswith('year'):
                break

    # return the data using the starting line no (determined above); the heading
    # line and the single line after it (presumably the units line) are skipped
    # original column names=['year','day', 'radn', 'maxt', 'mint', 'rain']
    metData = pd.read_table(filename, sep=r'\s+', header=None, skiprows=lineNo+1,
                            names=['year','dayofYear', 'radiation', 'maxTemp', 'minTemp', 'rain'])

    # add the calculated columns
    metData['runDate'] = pd.to_datetime(metData['year'].astype(str) + " " + metData['dayofYear'].astype(str), format="%Y %j")

    # this may need to be the thermal time, not just average temp
    metData['avgTemp'] = (metData['maxTemp'] + metData['minTemp']) / 2

    #convert the radiation from MJ/m2/day to Photosynthetically active radiation (PAR)
    metData['PARIO'] = metData['radiation'] * 0.47

    #convert the measurement unit for the radiation from MJ/m2/day to J/m2/day
    radnJ = metData['radiation'] * 1000000

    # calculate the day length; numpy is used (rather than math) because the
    # calculation is applied to whole columns at once
    radians = np.pi/180
    latlambda = lat * radians

    sinLAT = np.sin(latlambda)
    cosLAT = np.cos(latlambda)
    # sine of the maximum declination (23.45 degrees)
    sinDMC = np.sin(radians * 23.45)

    sinDEC = -sinDMC * np.cos(2 * np.pi * (metData['dayofYear'] + 10) / 365)
    cosDEC = np.sqrt(1 - (sinDEC * sinDEC))
    a = sinLAT * sinDEC
    b = cosLAT * cosDEC

    metData['dayLength'] = 12 * (1 + (2 / np.pi) * np.arcsin(a / b))

    # calculate the Fraction Diffused Radiation (FDR)
    SC = 1367 * (1 + 0.033 * np.cos(2 * np.pi * (metData['dayofYear'] - 10) / 365))
    sinINT = a * metData['dayLength'] + (24 * b / np.pi) * \
             np.cos((np.pi / 2) * ((metData['dayLength'] / 12) - 1))

    Ta = radnJ / (sinINT * 3600 * SC)
    metData['fracDiffusedRadn'] = Ta * -1.6 +1.32

    # calculate the Evapotranspiration
    vpsl = 238.102 * 17.32491 * metData['avgTemp'] / (metData['avgTemp'] + 238.102) ** 2
    metData['ETpt'] = 1.26 * (radnJ * (vpsl / (vpsl + 0.067))) / 2454000

    # sort the columns to be a little more logical (intermediate values are not kept)
    cols=['year', 'dayofYear', 'runDate', 'dayLength', 'maxTemp', 'minTemp', 'avgTemp', 'rain', 'radiation', \
          'PARIO', 'fracDiffusedRadn', 'ETpt']
    metData = metData[cols]

    return metData


In [6]:
def get_simulation_details(dbname):
    '''
    Opens the specified SQL Database and extracts the 'Name' details from the
    _Simulations Table, and splits it to Longitude, Latitude, Variety and SowDate.

    Parameters:
        dbname : full path of the SQLite database file

    Returns:
        a dataframe indexed by SimulationID with the columns:
        Name, long, lat, variety, sowdate, SimID
        (long and lat are the name's integer parts divided by 100)

    Side effect:
        sets the pandas display option 'float_format' to two decimal places.
    '''

    # connect to the Database; the connection is always closed again, even on error
    con = sqlite3.connect(dbname)
    try:
        # get contents of the _Simulation Table
        strSql = "SELECT ID as SimulationID, Name FROM _Simulations"
        dfSim = pd.read_sql_query(strSql, con, index_col = 'SimulationID')
    finally:
        con.close()

    # split the 'Name' field into long, lat, variety and sowdate columns
    # e.g. '11360_-2830_agt_katana_1-apr' -> 11360, -2830, agt_katana, 1-apr
    dfSim[['long','lat','variety','sowdate']] = \
    dfSim['Name'].str.extract(r"^(?P<long>\d+)_(?P<lat>-?\d+)_(?P<variety>\S+)_(?P<sowdate>\d+-\S+)$", expand=True)

    # format the columns
    pd.options.display.float_format = '{:,.2f}'.format
    dfSim['long'] = dfSim['long'].astype(float) / 100
    dfSim['lat'] = dfSim['lat'].astype(float) / 100

    # create a SimId column (as the original SimulationID is now an index column)
    dfSim['SimID'] = dfSim.index

    return dfSim


In [7]:
def get_report_details(dbname):
    '''
    Opens the specified SQL Database and extracts the details from the Report
    Table, formats the columns correctly and returns a dataframe

    Parameters:
        dbname : full path of the SQLite database file

    Returns:
        a dataframe indexed by SimulationID, ordered by SimulationID and runDate,
        with the columns: runDate (datetime), LAI, Biomass, Yield, ZadokStage,
        WaterSupplyDemandRatio, SimID
    '''

    # TODO (original note): Need to exclude consider CUTOFFS

    # do not need all of the columns, will cut this down from the get go
    # (an earlier version of this query also selected [Wheat.Root.NUptake] as
    #  RootNUptake and [Wheat.Leaf.Fn] as LeafFn)
    strSql = "SELECT SimulationID, substr([Clock.Today], 1, 10) as runDate, \
          [Wheat.Leaf.LAI] as LAI, [Wheat.AboveGround.Wt] as Biomass, \
          [Wheat.Grain.Wt] as Yield, [Wheat.Phenology.Zadok.Stage] as ZadokStage, \
          [Wheat.WaterSupplyDemandRatio] as WaterSupplyDemandRatio \
          FROM Report \
          ORDER BY SimulationID, runDate"

    # connect to the Database; the connection is always closed again, even on error
    con = sqlite3.connect(dbname)
    try:
        # get contents of the Report Table
        dfReport = pd.read_sql_query(strSql, con, index_col="SimulationID" )
    finally:
        con.close()

    # format the date columns (the query keeps only the first 10 characters, i.e. the date part)
    dfReport['runDate'] = pd.to_datetime(dfReport['runDate'], format="%Y-%m-%d")

    # create the SimId column
    dfReport['SimID'] = dfReport.index

    return dfReport


In [14]:
def get_filename(dbname):
    '''
    Builds, from the full path of a database file, the base name that is used
    for the weather file and for saving the output, plus the latitude text.

    The database file is named '<long>-<lat>.db' (e.g. '113.60-28.30.db'), which
    does not separate the longitude from the (negative) latitude, so the name is
    rebuilt with an underscore '_' between them: '113.60_-28.30'.

    Note:  ONLY negative (south) latitudes are allowed for.

    Returns a tuple (filename, lat); both are strings, e.g. ('113.60_-28.30', '-28.30').
    '''
    stem, _extension = os.path.splitext(os.path.basename(dbname))
    pieces = stem.split('-')
    longitude_text, latitude_text = pieces[0], pieces[1]

    # put back the minus sign that was consumed by the split
    lat = '-' + latitude_text

    return longitude_text + '_' + lat, lat


In [9]:
def get_weather_details(filename, latitude):
    '''
    Reads the weather ('.met') file that belongs to the given base file name
    and returns its contents as a dataframe.

    The file is looked up in metfile_sourcedir as 'c_<filename>.met' and is
    read (with the supplied latitude) by read_ApsimWeather.
    '''
    met_name = "c_" + filename + ".met"
    met_path = metfile_sourcedir + "/" + met_name

    return read_ApsimWeather(met_path, latitude)


## Process the data
The following cells together form the body of a single function:  `def process_Apsim_dbfile(dbname):`  
#### NOTE:  `dbname` is defined near the top of the notebook

In [10]:
# report which file is being processed and when the processing started
print("processing file: ", dbname)
print("started at ", datetime.datetime.now())

# load the simulation details from the _Simulations table of the database,
# then show the size and the first few rows
dfSim = get_simulation_details(dbname)
for preview in (dfSim.shape, dfSim.head(5)):
    print(preview)


processing file:  /OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db
started at  2018-07-27 10:42:26.522674
(1071, 6)
                                      Name   long    lat     variety sowdate  \
SimulationID                                                                   
1                 11360_-2830_young_29-jul 113.60 -28.30       young  29-jul   
2             11360_-2830_agt_katana_1-apr 113.60 -28.30  agt_katana   1-apr   
3             11360_-2830_agt_scythe_1-apr 113.60 -28.30  agt_scythe   1-apr   
4               11360_-2830_annuello_1-apr 113.60 -28.30    annuello   1-apr   
5                 11360_-2830_aroona_1-apr 113.60 -28.30      aroona   1-apr   

              SimID  
SimulationID         
1                 1  
2                 2  
3                 3  
4                 4  
5                 5  


In [13]:
# work out the weather file name and latitude from the database name, then
# load the matching '.met' file and show its size and first few rows
filename, latitude = get_filename(dbname)
dfWeather = get_weather_details(filename=filename, latitude=latitude)
print(dfWeather.shape, dfWeather.head(5), sep="\n")


ValueError: too many values to unpack (expected 2)

In [None]:
# load the rows of the Report table of the database, then show the size
# and the first few rows
dfReport = get_report_details(dbname)
print(dfReport.shape, dfReport.head(5), sep="\n")


In [None]:
# attach the weather of the day to every report row (matched on runDate);
# report rows without matching weather are kept
# (a narrower selection - SimID, runDate, ZadokStage, avgTemp - was tried here
#  earlier; the columns are now selected further down instead)
dfCombined = pd.merge(dfReport, dfWeather, how='left', on='runDate')
dfCombined


In [None]:
# attach the simulation details (matched on SimID) so that the sow date,
# variety and location of each simulation are available on every row
dfCombined = pd.merge(dfCombined, dfSim, how='left', on="SimID")
dfCombined

In [None]:
# TODO: add extra (weather) columns into this
filterCols = ['SimID', 'year', 'dayofYear', 'runDate', 'LAI', 'Biomass', 'Yield', 
              'ZadokStage', 'WaterSupplyDemandRatio',  'minTemp', 'maxTemp', 
              'avgTemp', 'rain', 'radiation', 'long', 'lat', 'variety', 'sowdate']
# take an explicit copy: columns are added to dfSubData further down, and
# assigning into a slice of dfCombined triggers pandas' SettingWithCopyWarning
dfSubData = dfCombined[filterCols].copy()
dfSubData


In [None]:
#??? 
#do we need to re-order the data here so that it is long, lat, variety, sowdate, runDate


In [None]:
# create a sowing date for every row: the sow date of the simulation
# (day-month, e.g. '1-apr') completed with the year of the row's run date
sowingText = dfSubData['sowdate'] + '-' + dfSubData['runDate'].dt.year.map(str)
dfSubData['sowingdate'] = pd.to_datetime(sowingText, format="%d-%b-%Y")

In [None]:
# now calculate the cumulative temp info for each simulation
# Only days on or after the sowing date with 0 < ZadokStage <= 90 contribute
# (90 is maturity, the start of the ripening stage); all other days count as 0.
onOrAfterSowing = dfSubData['runDate'] >= dfSubData['sowingdate']
inGrowth = (dfSubData['ZadokStage'] > 0) & (dfSubData['ZadokStage'] <= 90)
dfSubData['tempavgTemp'] = dfSubData['avgTemp'].where(onOrAfterSowing & inGrowth, 0)

# running total of the contributing temperatures per simulation and sowing date
simGroups = dfSubData.groupby(by=['SimID','sowingdate'])
dfSubData['TTAfterSowing'] = simGroups['tempavgTemp'].cumsum()
dfSubData

In [None]:
# keep only the days that contributed (tempavgTemp above zero) ...
contributed = dfSubData['tempavgTemp'] > 0
newData = dfSubData[contributed]

# ... and take the highest accumulated value per simulation and sowing date
ttGroups = newData.groupby(['SimID','sowingdate'])['TTAfterSowing']
newData1 = ttGroups.max().reset_index()


In [None]:
# average the accumulated temperature per simulation and sowing date
# NOTE(review): newData1 is already grouped by the same two keys, so this mean
# looks like a no-op; grouping on the sow date (across years) may have been
# intended - confirm
ttBySowing = newData1.groupby(['SimID','sowingdate'])['TTAfterSowing']
newData2 = ttBySowing.mean().reset_index()


In [None]:
#create bins for the phases: each phase is listed with the highest Zadok stage
#that still belongs to it (the first phase starts just above 0)
phaseUpperBounds = [
    (7, '01_Germinating'),
    (10, '02_Emerging'),
    (31, '03_Vegetative'),
    (39, '04_StemElongation'),
    (71, '05_GrainSet'),
    (87, '06_GrainFilling'),
    (90, '07_Maturing'),
    (120, '08_Ripening'),
]
bins = [0] + [upper for upper, _label in phaseUpperBounds]
group_names = [label for _upper, label in phaseUpperBounds]

# label every row with the phase its ZadokStage falls into
dfSubData['phases'] = pd.cut(dfSubData['ZadokStage'], bins, labels=group_names)
dfSubData

#might need to 

### Need to determine the following for each phase: 
•	length (number of days in the phase), minTemp, maxTemp, and avgTemp,  
•	the cumulative avgTemp at the end of each phase (needs to be called TTAfterSowing),  
•	the counts of the number of days where temperatures are below or above specified values (refer to SIP_Temp discussion.docs for further details),  
•	the average and cumulative rainfall,   
•	the average and cumulative radiation,  
•	the average WaterSupplyDemandRatio


In [None]:
# eliminate those records where the phase is Nan
#dfSubData.dropna(subset=['phases'], inplace=True)
#dfSubData

In [None]:
#Summarise a group: returns a dict holding its 'min', 'max', 'mean' and 'count'
def get_stats(group):
    summary = {}
    for statistic in ('min', 'max', 'mean', 'count'):
        summary[statistic] = getattr(group, statistic)()
    return summary

In [None]:
#add a concatenated key (SimID + sowingdate + phase) that identifies each phase
#of each simulation run
# 'phases' is categorical, so it is converted to text before concatenating;
# rows without a phase get a missing key, which groupby leaves out
phaseKey = dfSubData['SimID'].map(str) + \
    dfSubData['sowingdate'].map(str) + dfSubData['phases'].astype(str)
dfSubData['SimIdSowingdatePhase'] = phaseKey.where(dfSubData['phases'].notna())

#apply the get_stats function to each phase bin
zadokStats = dfSubData['ZadokStage'].groupby(dfSubData['SimIdSowingdatePhase']).apply(get_stats).unstack()
dfSubData

In [None]:
# Per-phase statistics.
# These are calculated from dfSubData: it holds the daily values together with
# the 'SimIdSowingdatePhase' key, whereas newData1 only holds SimID, sowingdate
# and TTAfterSowing.
phaseGroups = dfSubData.groupby('SimIdSowingdatePhase')

# what to calculate for each phase, per column:
#   minTemp / maxTemp      : minimum / maximum temperature for the phase
#   avgTemp                : average of the average temperature for the phase
#   Biomass                : maximum biomass for the phase
#   rain, ETpt             : cumulative (total) value for the phase
#   PARIO                  : average and cumulative value for the phase
#   dayLength              : average day length for the phase
#   WaterSupplyDemandRatio : average ratio for the phase
phaseAggregations = {
    'minTemp': ['min'],
    'maxTemp': ['max'],
    'avgTemp': ['mean'],
    'Biomass': ['max'],
    'rain': ['sum'],
    'ETpt': ['sum'],
    'PARIO': ['mean', 'sum'],
    'dayLength': ['mean'],
    'WaterSupplyDemandRatio': ['mean'],
}
# the weather-derived columns (ETpt, PARIO, dayLength) are only summarised once
# they have been included in dfSubData
availableAggregations = {column: functions
                         for column, functions in phaseAggregations.items()
                         if column in dfSubData.columns}

phaseStats = phaseGroups.agg(availableAggregations)
# flatten the (column, function) headings, e.g. 'minTemp_min'
phaseStats.columns = ['_'.join(heading) for heading in phaseStats.columns]

#how many days in each of the phases
phaseStats['days'] = phaseGroups.size()

#what is the average of average daily radiation for the phase
#phaseStats['radiation_mean'] = phaseGroups['radiation'].mean()

# TODO: 'PQDay' is not calculated anywhere yet, so its average per phase cannot be added
# TODO: need to get the values for the last day in each phase:
#       startdate, enddate, DAS, LAI, Biomass, Yield

phaseStats = phaseStats.reset_index()
phaseStats


In [None]:
#get the counts of various minimum temperatures:
#the number of days, per phase, on which minTemp was at or below each threshold
# (dfSubData is used as it holds both minTemp and the phase key; phases that
#  never reach any of the thresholds do not appear in the result)
coldDayCounts = pd.DataFrame({
    'minTemp<=' + str(threshold):
        dfSubData[dfSubData['minTemp'] <= threshold].groupby('SimIdSowingdatePhase').size()
    for threshold in [0, -1, -2, -3]
})
# a phase missing from one of the thresholds simply had no such days
coldDayCounts = coldDayCounts.fillna(0).astype(int)
coldDayCounts


In [None]:
#get the counts of various maximum temperatures:
#the number of days, per phase, on which maxTemp was at or above each threshold
# (dfSubData is used as it holds both maxTemp and the phase key; phases that
#  never reach any of the thresholds do not appear in the result)
hotDayCounts = pd.DataFrame({
    'maxTemp>=' + str(threshold):
        dfSubData[dfSubData['maxTemp'] >= threshold].groupby('SimIdSowingdatePhase').size()
    for threshold in [30, 32, 34, 36, 38, 40]
})
# a phase missing from one of the thresholds simply had no such days
hotDayCounts = hotDayCounts.fillna(0).astype(int)
hotDayCounts


In [None]:
# need to add back in the longitude, latitude, variety (and sow date) from dfSim
# newData2 is keyed by SimID and sowingdate and has no 'sowdate' column, so the
# join is made on SimID alone; 'sowdate' then comes from dfSim
# NOTE(review): 'sowingdate' is dropped below, so rows of the same simulation
# for different sowing dates can no longer be told apart - confirm this is intended
newData2 = newData2.merge(dfSim, on='SimID', how='left')
filterCols = ['SimID', 'long', 'lat', 'variety', 'sowdate', 'TTAfterSowing']
newData2 = newData2[filterCols]


In [None]:
# save the results for this site as '<filename>_zadok.csv' in the output directory
outfilename = "/".join([apsim_outfiledir, filename + "_zadok.csv"])
newData2.to_csv(outfilename, index=False, encoding='utf-8')
