# Process Growth Phases 

This notebook processes the SQLite databases generated by APSIM X (Next Gen) for 109 wheat varieties and 10 different sow dates at 57,434 locations/sites across Australia.  
Each database file covers one (1) site.

In [1]:
#Import the required libraries
import sys
import os
import datetime
import sqlite3
import numpy as np
import pandas as pd

In [2]:
# define the working directories
#   apsim_sourcedir   - listed below for the APSIM '.db' result files
#   apsim_outfiledir  - where the '<site>_zadok.csv' output file is written
#   metfile_sourcedir - read by get_weather_details() for the 'c_<site>.met' weather files
apsim_sourcedir = "/OSM/CBR/AG_WHEATTEMP/source"
apsim_outfiledir = "/OSM/CBR/AG_WHEATTEMP/work/output"
metfile_sourcedir = "/OSM/CBR/AG_WHEATTEMP/work/ApsimNG-test/APSIM_run/met"


In [3]:
# Collect the full path of every '.db' file in the source directory, sorted by name,
# and hold the paths in a one-column dataframe.
dbPaths = sorted(apsim_sourcedir+'/'+f for f in os.listdir(apsim_sourcedir) if f.endswith('.db'))
dbfile_df = pd.DataFrame({'filename': dbPaths})
print(dbfile_df.head())

# The first database in the list is the one processed by the cells below.
dbname = dbfile_df['filename'][0]
print(dbname)

                                       filename
0  /OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db
1  /OSM/CBR/AG_WHEATTEMP/source/113.70-28.45.db
2  /OSM/CBR/AG_WHEATTEMP/source/113.70-28.50.db
3  /OSM/CBR/AG_WHEATTEMP/source/113.75-28.45.db
4  /OSM/CBR/AG_WHEATTEMP/source/113.75-28.50.db
/OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db


In [4]:
def read_ApsimWeather(filename):
    '''
    Reads an apsim weather ('.met') file, removes the header information,
    calculates and adds a date column (based on year and day), and the
    average temperature (based on maxt and mint).

    Everything up to and including the column header line (the first line
    that starts with 'year') is skipped, plus the one line that follows it
    (presumably the units row of the '.met' format).  The remaining rows are
    read as whitespace separated values in the order
    year, day, radn, maxt, mint, rain.

    Parameters:
        filename: path of the '.met' file to read.

    Returns:
        A dataframe with the columns year, dayofYear, runDate, maxTemp,
        minTemp, avgTemp, rain and radiation.

    Raises:
        ValueError: if the file has no line starting with 'year'.
    '''

    # find the (1-based) line number of the column header line
    headerLineNo = None
    with open(filename, "r") as f:
        for lineNo, line in enumerate(f, start=1):
            if line.startswith('year'):
                headerLineNo = lineNo
                break

    # fail with a clear message instead of an obscure parser error further down
    if headerLineNo is None:
        raise ValueError("no column header line starting with 'year' found in: " + str(filename))

    # read the data that follows the header line and the line after it
    # original column names=['year','day', 'radn', 'maxt', 'mint', 'rain']
    # (raw string, so that '\s' is passed to the regex engine unchanged)
    metData = pd.read_table(filename, sep=r'\s+', header=None, skiprows=headerLineNo + 1,
                            names=['year', 'dayofYear', 'radiation', 'maxTemp', 'minTemp', 'rain'])

    # add the calculated columns: the date is built from the year and the day of the year
    metData['runDate'] = pd.to_datetime(metData['year'].astype(str) + " " + metData['dayofYear'].astype(str),
                                        format="%Y %j")

    # this may need to be the thermal time, not just average temp
    metData['avgTemp'] = (metData['maxTemp'] + metData['minTemp']) / 2

    # sort the columns to be a little more logical
    cols = ['year', 'dayofYear', 'runDate', 'maxTemp', 'minTemp', 'avgTemp', 'rain', 'radiation']
    return metData[cols]


In [5]:
def get_simulation_details(dbname):
    '''
    Opens the specified SQL Database and extracts the 'Name' details from the
    _Simulations table, splits each name into longitude, latitude, variety and
    sow date, and returns a dataframe.

    A name is expected to look like '11360_-2830_young_29-jul'
    (long_lat_variety_sowdate); long and lat are divided by 100 to give
    decimal degrees (113.60, -28.30).

    Parameters:
        dbname: path of the SQLite database file.

    Returns:
        A dataframe indexed by SimulationID with the columns
        Name, long, lat, variety, sowdate and SimID.

    Note: also sets the pandas float display format to two decimals.
    '''

    # connect to the Database; the connection is always closed again, even on error
    con = sqlite3.connect(dbname)
    try:
        # get contents of the _Simulations Table
        strSql = "SELECT ID as SimulationID, Name FROM _Simulations"
        dfSim = pd.read_sql_query(strSql, con, index_col='SimulationID')
    finally:
        con.close()

    # split the 'Name' field into long, lat, variety and sowdate columns
    # (raw string, so that '\d' and '\S' reach the regex engine unchanged)
    namePattern = r"^(?P<long>\d+)_(?P<lat>-?\d+)_(?P<variety>\S+)_(?P<sowdate>\d+-\S+)$"
    dfSim[['long', 'lat', 'variety', 'sowdate']] = dfSim['Name'].str.extract(namePattern, expand=True)

    # format the columns
    pd.options.display.float_format = '{:,.2f}'.format
    dfSim['long'] = dfSim['long'].astype(float) / 100
    dfSim['lat'] = dfSim['lat'].astype(float) / 100

    # create a SimId column (as the original SimulationID is now an index column)
    dfSim['SimID'] = dfSim.index

    return dfSim


In [6]:
def get_report_details(dbname):
    '''
    Opens the specified SQL Database and extracts the details from the Report
    Table, formats the columns correctly and returns a dataframe.

    Only the run date (first 10 characters of [Clock.Today]), the Zadok stage
    and the water supply/demand ratio are read, ordered by SimulationID and
    run date.

    Parameters:
        dbname: path of the SQLite database file.

    Returns:
        A dataframe indexed by SimulationID with the columns
        runDate (datetime), ZadokStage, WaterSupplyDemandRatio and SimID.
    '''

    # do not need all of the Report columns, so cut this down from the get go
    # (an earlier version of the query also read [Wheat.Leaf.LAI],
    #  [Wheat.AboveGround.Wt], [Wheat.Grain.Wt], [Wheat.Root.NUptake] and [Wheat.Leaf.Fn])
    strSql = ("SELECT SimulationID, substr([Clock.Today], 1, 10) as runDate, "
              "[Wheat.Phenology.Zadok.Stage] as ZadokStage, "
              "[Wheat.WaterSupplyDemandRatio] as WaterSupplyDemandRatio "
              "FROM Report "
              "ORDER BY SimulationID, runDate")

    # connect to the Database; the connection is always closed again, even on error
    con = sqlite3.connect(dbname)
    try:
        dfReport = pd.read_sql_query(strSql, con, index_col="SimulationID")
    finally:
        con.close()

    # format the date columns
    dfReport['runDate'] = pd.to_datetime(dfReport['runDate'], format="%Y-%m-%d")

    # create the SimId column (SimulationID itself is the index)
    dfReport['SimID'] = dfReport.index

    return dfReport


In [7]:
def get_filename(dbname):
    '''
    Builds the site name used for the weather file and for the output file
    from the full path of a database file.

    The directory and the extension are removed, and the '-' between the
    longitude and the latitude is replaced by '_-',
    e.g. '/some/dir/113.60-28.30.db' becomes '113.60_-28.30'.

    Note:  the db filename cannot be used as is, because the longitude and
           latitude need to be separated by an underscore '_' char
    '''
    stem, _extension = os.path.splitext(os.path.basename(dbname))

    # the text before the first '-' and the text between the first and second '-'
    pieces = stem.split('-')
    longitude = pieces[0]
    latitude = pieces[1]

    return longitude + '_-' + latitude


In [8]:
def get_weather_details(filename):
    '''
    Loads the weather data for one site and returns it as a dataframe.

    filename is the site name (as built by get_filename); the file that is
    read is 'c_<filename>.met' in the metfile_sourcedir directory, and the
    dataframe is the one produced by read_ApsimWeather.
    '''
    metPath = "".join([metfile_sourcedir, "/c_", filename, ".met"])
    return read_ApsimWeather(metPath)


## Process the data
The following forms a single function:  def process_Apsim_dbfile(dbname):  
#### NOTE:  dbname is defined at the top

In [9]:
# report which database is being processed, and when the run started
startedAt = datetime.datetime.now()
print("processing file: ", dbname)
print("started at ", startedAt)

# retrieve the Simulation Details from the DB._Simulations table
dfSim = get_simulation_details(dbname)
print(dfSim.shape)
print(dfSim.head())


processing file:  /OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db
started at  2018-07-20 12:19:03.530513
(1071, 6)
                                      Name   long    lat     variety sowdate  \
SimulationID                                                                   
1                 11360_-2830_young_29-jul 113.60 -28.30       young  29-jul   
2             11360_-2830_agt_katana_1-apr 113.60 -28.30  agt_katana   1-apr   
3             11360_-2830_agt_scythe_1-apr 113.60 -28.30  agt_scythe   1-apr   
4               11360_-2830_annuello_1-apr 113.60 -28.30    annuello   1-apr   
5                 11360_-2830_aroona_1-apr 113.60 -28.30      aroona   1-apr   

              SimID  
SimulationID         
1                 1  
2                 2  
3                 3  
4                 4  
5                 5  


In [10]:
# work out the site name from the database path, then load the weather data
# from that site's '.met' file
filename = get_filename(dbname)
dfWeather = get_weather_details(filename)
print(dfWeather.shape)
print(dfWeather.head())


(43038, 8)
   year  dayofYear    runDate  maxTemp  minTemp  avgTemp  rain  radiation
0  1900          1 1900-01-01    31.80    13.40    22.60  0.00      24.30
1  1900          2 1900-01-02    31.80    13.40    22.60  0.00      24.30
2  1900          3 1900-01-03    31.80    13.40    22.60  0.00      24.30
3  1900          4 1900-01-04    31.80    13.40    22.60  0.00      24.30
4  1900          5 1900-01-05    31.80    13.40    22.60  0.00      24.30


In [11]:
# retrieve the Details from the DB.Report table
dfReport = get_report_details(dbname)
print(dfReport.shape)
print(dfReport.head(5))


(46093698, 4)
                runDate  ZadokStage  WaterSupplyDemandRatio  SimID
SimulationID                                                      
1            1900-01-01        0.00                    1.00      1
1            1900-01-01        0.00                    1.00      1
1            1900-01-02        0.00                    1.00      1
1            1900-01-02        0.00                    1.00      1
1            1900-01-03        0.00                    1.00      1


In [20]:
# attach the weather of the day to every report row, matched on the run date
# (left join: every report row is kept)
dfCombined = pd.merge(dfReport, dfWeather, how='left', on='runDate')
dfCombined
# the columns of interest are filtered out further down, once the
# simulation details have been added as well


Unnamed: 0,runDate,ZadokStage,WaterSupplyDemandRatio,SimID,year,dayofYear,maxTemp,minTemp,avgTemp,rain,radiation
0,1900-01-01,0.00,1.00,1,1900,1,31.80,13.40,22.60,0.00,24.30
1,1900-01-01,0.00,1.00,1,1900,1,31.80,13.40,22.60,0.00,24.30
2,1900-01-02,0.00,1.00,1,1900,2,31.80,13.40,22.60,0.00,24.30
3,1900-01-02,0.00,1.00,1,1900,2,31.80,13.40,22.60,0.00,24.30
4,1900-01-03,0.00,1.00,1,1900,3,31.80,13.40,22.60,0.00,24.30
5,1900-01-03,0.00,1.00,1,1900,3,31.80,13.40,22.60,0.00,24.30
6,1900-01-04,0.00,1.00,1,1900,4,31.80,13.40,22.60,0.00,24.30
7,1900-01-04,0.00,1.00,1,1900,4,31.80,13.40,22.60,0.00,24.30
8,1900-01-05,0.00,1.00,1,1900,5,31.80,13.40,22.60,0.00,24.30
9,1900-01-05,0.00,1.00,1,1900,5,31.80,13.40,22.60,0.00,24.30


In [21]:
# combine the data with the Simulation details, so that we can get the sow date
# (and the site and variety) for every row
#
# The merge of every dfSim column onto the full report ran out of memory
# (MemoryError).  To lower the peak memory use:
#   * 'year' and 'dayofYear' are dropped in place first - they are not used below
#     (errors='ignore' keeps the cell re-runnable);
#   * only the dfSim columns that are used below are merged in, which leaves out
#     the 'Name' text column.
# NOTE(review): this reduces the memory needed, but the frame is still very large;
# if it still fails the data may have to be processed one simulation at a time.
dfCombined.drop(columns=['year', 'dayofYear'], inplace=True, errors='ignore')
simCols = ['SimID', 'long', 'lat', 'variety', 'sowdate']
dfCombined = dfCombined.merge(dfSim[simCols], on="SimID", how='left')
dfCombined

MemoryError: 

In [None]:
# keep only the columns that are needed from here on:
# the report values, then the weather values, then the simulation details
reportPart = ['SimID', 'runDate', 'ZadokStage', 'WaterSupplyDemandRatio']
weatherPart = ['minTemp', 'maxTemp', 'avgTemp', 'rain', 'radiation']
simulationPart = ['long', 'lat', 'variety', 'sowdate']
filterCols = reportPart + weatherPart + simulationPart

dfSubData = dfCombined[filterCols]
dfSubData


In [None]:
# create a sowing date for the year of each row:
# the sow date text (e.g. '29-jul') plus '-' plus the year of the run date,
# parsed as day - abbreviated month name - year
sowingText = dfSubData['sowdate'] + '-' + dfSubData['runDate'].dt.year.map(str)
dfSubData['sowingdate'] = pd.to_datetime(sowingText, format="%d-%b-%Y")

In [None]:
# now calculate the cumulative temp info for each simulation
# a day only counts when it is on or after the sowing date and the Zadok stage is above 0;
# every other day contributes 0
# (an upper limit of ZadokStage <= 70 was tried as an extra condition, but is not applied)
onOrAfterSowing = dfSubData['runDate'] >= dfSubData['sowingdate']
stageStarted = dfSubData['ZadokStage'] > 0
dfSubData['tempavgTemp'] = dfSubData['avgTemp'].where(onOrAfterSowing & stageStarted, 0)

# running total of the counted temperatures within each simulation and sowing date
dfSubData['TTAfterSowing'] = dfSubData.groupby(['SimID', 'sowingdate'])['tempavgTemp'].cumsum()


In [None]:
# label every row with its growth phase, based on the Zadok stage
# each phase covers the stages above one bin edge, up to and including the next edge
# (pandas.cut default), so a stage of exactly 0 gets no phase
group_names = ['Germinating', 'Emerging', 'Vegetative', 'StemElongation',
               'GrainSet', 'GrainFilling', 'Maturing', 'Ripening']
bins = [0, 7, 10, 31, 39, 71, 87, 90, 120]
dfCombined['phases'] = pd.cut(dfCombined['ZadokStage'], bins=bins, labels=group_names)
dfCombined

#might need to ... (this note was left unfinished)

In [None]:
# keep only the rows whose tempavgTemp is above 0, then take the highest
# TTAfterSowing reached for every simulation / sow date
hasTemp = dfSubData['tempavgTemp'] > 0
newData = dfSubData[hasTemp]
newData1 = newData.groupby(['SimID', 'sowdate'], as_index=False)['TTAfterSowing'].max()


In [None]:
# average TTAfterSowing for every simulation / sow date
# NOTE(review): newData1 is already grouped on the same two columns, so this
# mean looks like it leaves the values unchanged - confirm what was intended
newData2 = newData1.groupby(['SimID', 'sowdate'], as_index=False)['TTAfterSowing'].mean()


#Need to determine the phases
•	length (no of days in phase), minTemp, maxTemp, and avgTemp,
•	the cumulativeAvgTemp at the end of each phase, (needs to be called TTAfterSowing)
•	the counts for the number of days where temperatures are below or above specified values (refer to SIP_Temp discussion.docs for further details)
•	the average and cumulative rainfall 
•	average radiation, cumulative radiation
•	average watersupplydemandratio


In [None]:
#Summarise one group: its smallest value, largest value, mean and count
def get_stats(group):
    statNames = ('min', 'max', 'mean', 'count')
    # call the method of the same name on the group for every statistic
    return {statName: getattr(group, statName)() for statName in statNames}

In [None]:
#apply the get_stats function to the ZadokStage values of each phase bin
# The phase labels were added to dfCombined (there is no 'df'), and dfSubData was
# taken from dfCombined without changing the rows, so dfCombined['phases'] lines up
# with dfSubData row for row and can be used directly as the grouping key.
# The result is kept in a variable (and displayed) instead of being thrown away.
phaseStageStats = dfSubData['ZadokStage'].groupby(dfCombined['phases']).apply(get_stats).unstack()
phaseStageStats


In [None]:
# Statistics for each phase of each simulation / sow date.
#
# newData1 only holds SimID, sowdate and TTAfterSowing, so the daily rows in
# newData (the rows with tempavgTemp > 0) are used instead.  The phase labels live
# in dfCombined['phases']; they are attached here as the 'phase' column (matched on
# the row index).  The weather columns are called 'radiation' and
# 'WaterSupplyDemandRatio' in these frames.
phaseData = newData.assign(phase=dfCombined['phases'])

# observed=True: only build groups for the phases that actually occur
phaseGroups = phaseData.groupby(['SimID', 'sowdate', 'phase'], observed=True)

phaseStats = phaseGroups.agg({
    'runDate': 'count',                  # how many days in each of the phases
    'minTemp': 'min',                    # the minimum temperature for the phase
    'maxTemp': 'max',                    # the maximum temperature for the phase
    'avgTemp': 'mean',                   # the average of average temp for the phase
    'radiation': 'mean',                 # the average daily radiation for the phase
    'rain': 'sum',                       # the total (cumulative) rainfall for the phase
    'WaterSupplyDemandRatio': 'mean',    # the average watersupplydemandratio for the phase
}).reset_index()

# give the result columns names that say which statistic they hold
phaseStats = phaseStats.rename(columns={
    'runDate': 'days',
    'avgTemp': 'meanAvgTemp',
    'radiation': 'meanRadiation',
    'rain': 'totalRain',
    'WaterSupplyDemandRatio': 'meanWaterSupplyDemandRatio',
})

# NOTE(review): the groups span every year of the run; add 'sowingdate' to the
# grouping columns if the statistics are wanted per season.
# NOTE(review): the Report rows printed above appear twice per date - if that is
# not intended, the day counts and rain totals here are doubled.
phaseStats


In [None]:
#get the counts of various minimum temperatures
# For every phase: the number of rows whose minTemp is at or below each threshold.
# The daily rows are in dfSubData and the phase labels in dfCombined['phases']
# (same row index); groupby is a method, so it is called with (...) and not [...].
# Rows without a phase (Zadok stage outside the bins) are not counted.
minTempThresholds = [0, -1, -2, -3]
coldDayCounts = pd.DataFrame({
    'minTemp<={}'.format(limit):
        (dfSubData['minTemp'] <= limit).groupby(dfCombined['phases']).sum().astype(int)
    for limit in minTempThresholds
})
coldDayCounts


In [None]:
#get the counts of various maximum temperatures
# For every phase: the number of rows whose maxTemp is at or above each threshold.
# The daily rows are in dfSubData and the phase labels in dfCombined['phases']
# (same row index); groupby is a method, so it is called with (...) and not [...].
# Rows without a phase (Zadok stage outside the bins) are not counted.
maxTempThresholds = [30, 32, 34, 36, 38, 40]
hotDayCounts = pd.DataFrame({
    'maxTemp>={}'.format(limit):
        (dfSubData['maxTemp'] >= limit).groupby(dfCombined['phases']).sum().astype(int)
    for limit in maxTempThresholds
})
hotDayCounts


In [None]:
# bring the longitude, latitude and variety back in from dfSim
# (matched on SimID and sowdate), and put the columns in their output order
filterCols = ['SimID', 'long', 'lat', 'variety', 'sowdate', 'TTAfterSowing']
newData2 = pd.merge(newData2, dfSim, how='left', on=['SimID', 'sowdate'])[filterCols]


In [None]:
# write the result for this site to '<site>_zadok.csv' in the output directory
# (no index column in the file)
outfilename = "".join([apsim_outfiledir, "/", filename, "_zadok.csv"])
newData2.to_csv(outfilename, index=False, encoding='utf-8')
