## Working with pandas dataframes

I am trying to extract the name information using the columns (as a whole, if that makes sense).  I have a function works if I iterate through the rows and treat each 'Name' individually.

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import os
import feather

In [2]:
apsim_sourcedir = "/OSM/CBR/AG_WHEATTEMP/source"
apsim_outfiledir = "/OSM/CBR/AG_WHEATTEMP/work"
metfile_sourcedir = "/OSM/CBR/AG_WHEATTEMP/work/ApsimNG-test/APSIM_run/met"

In [3]:
dbfile_df = pd.DataFrame(columns=['filename'])
dbfile_df.filename = sorted(apsim_sourcedir+'/'+f for f in os.listdir(apsim_sourcedir) if f.endswith('.db'))
print(dbfile_df.head())

                                       filename
0  /OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db
1  /OSM/CBR/AG_WHEATTEMP/source/113.70-28.45.db
2  /OSM/CBR/AG_WHEATTEMP/source/113.70-28.50.db
3  /OSM/CBR/AG_WHEATTEMP/source/113.75-28.45.db
4  /OSM/CBR/AG_WHEATTEMP/source/113.75-28.50.db


In [4]:
#we are going to work with just the first filename
print(dbfile_df.filename[0])
con = sqlite3.connect(dbfile_df.filename[0])
cur = con.cursor()

/OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db


In [5]:
strSql = "SELECT ID as SimulationID, Name FROM _Simulations"
dfSim = pd.read_sql_query(strSql, con, index_col = 'SimulationID')
print(dfSim.shape)
dfSim

(1071, 1)


Unnamed: 0_level_0,Name
SimulationID,Unnamed: 1_level_1
1,11360_-2830_young_29-jul
2,11360_-2830_agt_katana_1-apr
3,11360_-2830_agt_scythe_1-apr
4,11360_-2830_annuello_1-apr
5,11360_-2830_aroona_1-apr
6,11360_-2830_arrino_1-apr
7,11360_-2830_attila_1-apr
8,11360_-2830_axe_1-apr
9,11360_-2830_banks_1-apr
10,11360_-2830_batavia_1-apr


In [6]:
#An alternative method, using grep is done below
#dfSim2 = dfSim.join(pd.DataFrame(dfSim.Name.str.rsplit('_', expand=True, n=1)))
#dfSim2.rename(columns={0: 'newName', 1: 'sowDate'}, inplace=True)

#dfSim2 = dfSim2.join(pd.DataFrame(dfSim2.newName.str.split('_', expand=True, n=2)))
#dfSim2.rename(columns={0: 'long', 1: 'lat', 2: 'variety'}, inplace=True)

#dfSim2.drop('newName', axis=1, inplace=True)
#dfSim2

In [7]:
#pd.options.display.float_format = '{:,.2f}'.format
#dfSim2['long'] = dfSim2['long'].astype(float) / 100
#dfSim2['lat'] = dfSim2['lat'].astype(float) / 100
#dfSim2

While the above works nicely, it may not be the best way to achieve this, might need to do some testing when dealing with more data.  So I will use regular expressions instead.

In [8]:
#This is done in a single line below
#dfSim['long'] = dfSim['Name'].str.extract("^(\d+)_-?\d+_\S+_\d+-\S+$", expand=True)
#dfSim['lat'] = dfSim['Name'].str.extract("^\d+_(-?\d+)_\S+_\d+-\S+$", expand=True)
#dfSim['variety'] = dfSim['Name'].str.extract("^\d+_-?\d+_(\S+)_\d+-\S+$", expand=True)
#dfSim['sowdate'] = dfSim['Name'].str.extract("^\d+_-?\d+_\S+_(\d+-\S+)$", expand=True)
#dfSim

In [9]:
#a more optimal way to achieve the above
dfSim[['long','lat','variety','sowdate']] = dfSim['Name'].str.extract("^(?P<lo>\d+)_(?P<la>-?\d+)_(?P<var>\S+)_(?P<da>\d+-\S+)$", expand=True)
dfSim

Unnamed: 0_level_0,Name,long,lat,variety,sowdate
SimulationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,11360_-2830_young_29-jul,11360,-2830,young,29-jul
2,11360_-2830_agt_katana_1-apr,11360,-2830,agt_katana,1-apr
3,11360_-2830_agt_scythe_1-apr,11360,-2830,agt_scythe,1-apr
4,11360_-2830_annuello_1-apr,11360,-2830,annuello,1-apr
5,11360_-2830_aroona_1-apr,11360,-2830,aroona,1-apr
6,11360_-2830_arrino_1-apr,11360,-2830,arrino,1-apr
7,11360_-2830_attila_1-apr,11360,-2830,attila,1-apr
8,11360_-2830_axe_1-apr,11360,-2830,axe,1-apr
9,11360_-2830_banks_1-apr,11360,-2830,banks,1-apr
10,11360_-2830_batavia_1-apr,11360,-2830,batavia,1-apr


In [10]:
#now format the longitude and latitude columns correctly and
#add the SimID column (as we cannot work with the index column)
pd.options.display.float_format = '{:,.2f}'.format
dfSim['long'] = dfSim['long'].astype(float) / 100
dfSim['lat'] = dfSim['lat'].astype(float) / 100
dfSim['SimID'] = dfSim.index

In [11]:
print(dfSim.shape)
dfSim

(1071, 6)


Unnamed: 0_level_0,Name,long,lat,variety,sowdate,SimID
SimulationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,11360_-2830_young_29-jul,113.60,-28.30,young,29-jul,1
2,11360_-2830_agt_katana_1-apr,113.60,-28.30,agt_katana,1-apr,2
3,11360_-2830_agt_scythe_1-apr,113.60,-28.30,agt_scythe,1-apr,3
4,11360_-2830_annuello_1-apr,113.60,-28.30,annuello,1-apr,4
5,11360_-2830_aroona_1-apr,113.60,-28.30,aroona,1-apr,5
6,11360_-2830_arrino_1-apr,113.60,-28.30,arrino,1-apr,6
7,11360_-2830_attila_1-apr,113.60,-28.30,attila,1-apr,7
8,11360_-2830_axe_1-apr,113.60,-28.30,axe,1-apr,8
9,11360_-2830_banks_1-apr,113.60,-28.30,banks,1-apr,9
10,11360_-2830_batavia_1-apr,113.60,-28.30,batavia,1-apr,10


#### Now lets filter this by the Variety we are after

In [12]:
dfSimVar = dfSim[(dfSim.variety=='janz')]
print(dfSimVar.shape)
dfSimVar

(10, 6)


Unnamed: 0_level_0,Name,long,lat,variety,sowdate,SimID
SimulationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
57,11360_-2830_janz_1-apr,113.6,-28.3,janz,1-apr,57
164,11360_-2830_janz_15-apr,113.6,-28.3,janz,15-apr,164
271,11360_-2830_janz_29-apr,113.6,-28.3,janz,29-apr,271
378,11360_-2830_janz_13-may,113.6,-28.3,janz,13-may,378
485,11360_-2830_janz_27-may,113.6,-28.3,janz,27-may,485
592,11360_-2830_janz_3-jun,113.6,-28.3,janz,3-jun,592
699,11360_-2830_janz_17-jun,113.6,-28.3,janz,17-jun,699
806,11360_-2830_janz_1-jul,113.6,-28.3,janz,1-jul,806
913,11360_-2830_janz_15-jul,113.6,-28.3,janz,15-jul,913
1020,11360_-2830_janz_29-jul,113.6,-28.3,janz,29-jul,1020


#### get a list of Simulation ID's

In [13]:
simIds = dfSimVar.index.tolist()
simIdStr = ', '.join(str(e) for e in simIds)
simIdStr

'57, 164, 271, 378, 485, 592, 699, 806, 913, 1020'

### Now retrieve the Report Data

We can use the simIds that were collected above

In [14]:
#This should return all of the columns
#strSql = "SELECT * FROM Report \
#          WHERE SimulationID IN (" + simIdStr + ") \
#          ORDER BY SimulationID"

strSql = "SELECT SimulationID, [Clock.Today] as ClockToday, [Wheat.Leaf.LAI] as LeafLAI, \
          [Wheat.AboveGround.Wt] as AboveGroundWeight, [Wheat.Grain.Wt] as GrainWeight, \
          [Wheat.Phenology.Zadok.Stage] as ZadokStage, [Wheat.WaterSupplyDemandRatio] as WaterSupplyDemandRatio, \
          [Wheat.Root.NUptake] as RootNUptake, [Wheat.Leaf.Fn] as LeafFn \
          FROM Report \
          WHERE SimulationID IN (" + simIdStr + ") \
          ORDER BY SimulationID, ClockToday"
strSql

'SELECT SimulationID, [Clock.Today] as ClockToday, [Wheat.Leaf.LAI] as LeafLAI,           [Wheat.AboveGround.Wt] as AboveGroundWeight, [Wheat.Grain.Wt] as GrainWeight,           [Wheat.Phenology.Zadok.Stage] as ZadokStage, [Wheat.WaterSupplyDemandRatio] as WaterSupplyDemandRatio,           [Wheat.Root.NUptake] as RootNUptake, [Wheat.Leaf.Fn] as LeafFn           FROM Report           WHERE SimulationID IN (57, 164, 271, 378, 485, 592, 699, 806, 913, 1020)           ORDER BY SimulationID, ClockToday'

In [15]:
#c = con.cursor()
#c.execute(strSql, simIds)
#varietyRows = c.fetchall()
#varietyRows = pd.DataFrame(varietyRows)
#varietyRows
#print(varietyRows.shape)

In [16]:
dfReport = pd.read_sql_query(strSql, con, \
                       index_col="SimulationID"
                      )
print(dfReport.shape)
dfReport

(430380, 8)


Unnamed: 0_level_0,ClockToday,LeafLAI,AboveGroundWeight,GrainWeight,ZadokStage,WaterSupplyDemandRatio,RootNUptake,LeafFn
SimulationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
57,1900-01-01 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-02 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-03 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-04 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-05 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-06 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-07 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-08 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-09 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
57,1900-01-10 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00


In [17]:
# Need to re-format the Clock.Today column so that it only includes the date
dfReport['date'] = pd.to_datetime(dfReport['ClockToday']).apply(lambda x:x.strftime('%d/%m/%Y'))
dfReport['SimID'] = dfReport.index
dfReport

Unnamed: 0_level_0,ClockToday,LeafLAI,AboveGroundWeight,GrainWeight,ZadokStage,WaterSupplyDemandRatio,RootNUptake,LeafFn,date,SimID
SimulationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
57,1900-01-01 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,01/01/1900,57
57,1900-01-02 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,02/01/1900,57
57,1900-01-03 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,03/01/1900,57
57,1900-01-04 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,04/01/1900,57
57,1900-01-05 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,05/01/1900,57
57,1900-01-06 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,06/01/1900,57
57,1900-01-07 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,07/01/1900,57
57,1900-01-08 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,08/01/1900,57
57,1900-01-09 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,09/01/1900,57
57,1900-01-10 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,10/01/1900,57


### Lets retrieve the weather data and format it as required

In [18]:
print(dbfile_df.filename[0])

/OSM/CBR/AG_WHEATTEMP/source/113.60-28.30.db


In [19]:
def get_weather_filename(sourceDir, dbfilename):
    '''
    takes in the sourceDirectory for the met files, and the SQLite database filename
    and returns the fullpath/filename of the met file
    '''
    
    #apsim_outfiledir 
    #K:\ApsimNG-test\APSIM_run\met
    filename = os.path.basename(dbfilename)
    filename = os.path.splitext(filename)[0]
    nameparts = filename.split('-')
    #cannot use the db filename as it doesn't have the long & lat that we require
    #need to manipulate the filename to add the underscrore '_' char
    filename = nameparts[0] + '_-' + nameparts[1]
    #filename = "115.20_-29.35"
    filename = sourceDir + "/c_" + filename + ".met"
    print(filename)
    #print("/OSM/CBR/AG_WHEATTEMP/work/ApsimNG-test/APSIM_run/met/")
    
    return filename

In [20]:
def compose_date(years, months=1, days=1, weeks=None, hours=None, minutes=None,
                 seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
    '''
    builds a date based on a year and day of year
    '''
    
    years = np.asarray(years) - 1970
    months = np.asarray(months) - 1
    days = np.asarray(days) - 1
    types = ('<M8[Y]', '<m8[M]', '<m8[D]', '<m8[W]', '<m8[h]',
             '<m8[m]', '<m8[s]', '<m8[ms]', '<m8[us]', '<m8[ns]')
    vals = (years, months, days, weeks, hours, minutes, seconds,
            milliseconds, microseconds, nanoseconds)
    
    return sum(np.asarray(v, dtype=t) for t, v in zip(types, vals)
               if v is not None)

    #df = pd.DataFrame({'doy': [49, 65, 81, 97, 113, 129, 145, 161],
    #               'year': [2000, 2000, 2000, 2001, 2001, 2001, 2001, 2001]})

In [21]:
def read_ApsimWeather(filename):
    '''
    Reads an apsim weather ('.met') file, removes the header information,
    calculates and adds a date column (based on year and day), and the
    average tempearature (based on maxt and mint).
    '''
    
    import pandas as pd
    import datetime
    
    lineNo = 0 
    with open(filename, "r") as f:
        #all_lines = f.readlines()
        for line in f:
            lineNo = lineNo + 1
            if line.startswith('year'):
                break;
                
    #return the data using the starting line no (determined above)
    metData = pd.read_table(filename, sep='\s+', header=None, skiprows=lineNo+1, 
                          names=['year','day', 'radn', 'maxt', 'mint', 'rain'])
    
    # add the calculated columns
    metData['date'] = compose_date(metData['year'], days=metData['day'])
    metData['date'] = pd.to_datetime(metData['date']).apply(lambda x:x.strftime('%d/%m/%Y'))
    metData['avgt'] = (metData['maxt'] + metData['mint']) / 2

    # sort the columns to be a little more logical
    cols=['year', 'day', 'date', 'maxt', 'mint', 'avgt', 'radn', 'rain']
    metData = metData[cols]
    
    #return the data
    return metData


In [22]:
filename = get_weather_filename(metfile_sourcedir, dbfile_df.filename[0])
metData = read_ApsimWeather(filename)
print(metData.shape)
metData

/OSM/CBR/AG_WHEATTEMP/work/ApsimNG-test/APSIM_run/met/c_113.60_-28.30.met
(43038, 8)


Unnamed: 0,year,day,date,maxt,mint,avgt,radn,rain
0,1900,1,01/01/1900,31.80,13.40,22.60,24.30,0.00
1,1900,2,02/01/1900,31.80,13.40,22.60,24.30,0.00
2,1900,3,03/01/1900,31.80,13.40,22.60,24.30,0.00
3,1900,4,04/01/1900,31.80,13.40,22.60,24.30,0.00
4,1900,5,05/01/1900,31.80,13.40,22.60,24.30,0.00
5,1900,6,06/01/1900,31.80,13.40,22.60,24.30,0.00
6,1900,7,07/01/1900,31.80,13.40,22.60,24.30,0.00
7,1900,8,08/01/1900,31.80,13.40,22.60,24.30,0.00
8,1900,9,09/01/1900,31.80,13.40,22.60,24.30,0.00
9,1900,10,10/01/1900,31.80,13.40,22.60,24.30,0.00


### Now need to merge the report data with the weather data

In [23]:
#dfReport.date = pd.to_datetime(dfReport.date, format="%d/%m/%Y")
#metData.date = pd.to_datetime(metData.date, format="%d/%m/%Y")
dfCombined = dfReport.merge(metData, on='date', how='left')

In [24]:
print(dfCombined.shape)
dfCombined

(430380, 17)


Unnamed: 0,ClockToday,LeafLAI,AboveGroundWeight,GrainWeight,ZadokStage,WaterSupplyDemandRatio,RootNUptake,LeafFn,date,SimID,year,day,maxt,mint,avgt,radn,rain
0,1900-01-01 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,01/01/1900,57,1900,1,31.80,13.40,22.60,24.30,0.00
1,1900-01-02 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,02/01/1900,57,1900,2,31.80,13.40,22.60,24.30,0.00
2,1900-01-03 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,03/01/1900,57,1900,3,31.80,13.40,22.60,24.30,0.00
3,1900-01-04 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,04/01/1900,57,1900,4,31.80,13.40,22.60,24.30,0.00
4,1900-01-05 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,05/01/1900,57,1900,5,31.80,13.40,22.60,24.30,0.00
5,1900-01-06 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,06/01/1900,57,1900,6,31.80,13.40,22.60,24.30,0.00
6,1900-01-07 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,07/01/1900,57,1900,7,31.80,13.40,22.60,24.30,0.00
7,1900-01-08 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,08/01/1900,57,1900,8,31.80,13.40,22.60,24.30,0.00
8,1900-01-09 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,09/01/1900,57,1900,9,31.80,13.40,22.60,24.30,0.00
9,1900-01-10 12:00:00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,10/01/1900,57,1900,10,31.80,13.40,22.60,24.30,0.00


In [25]:
#filterCols = ['SimID', 'date', 'ZadokStage', 'avgt']
#subData = dfCombined[['SimID', 'date', 'ZadokStage', 'avgt']]
#dfCombined.loc[:, 'SimID', 'date', 'ZadokStage', 'avgt']
#subData = pd.DataFrame(dfCombined, columns=['SimID', 'date', 'ZadokStage', 'avgt'])
#subData

In [26]:
#dfCombined = pd.DataFrame(dfCombined)
filterCols = ['SimID', 'date', 'ZadokStage', 'avgt']
subData = dfCombined[filterCols]
subData

Unnamed: 0,SimID,date,ZadokStage,avgt
0,57,01/01/1900,0.00,22.60
1,57,02/01/1900,0.00,22.60
2,57,03/01/1900,0.00,22.60
3,57,04/01/1900,0.00,22.60
4,57,05/01/1900,0.00,22.60
5,57,06/01/1900,0.00,22.60
6,57,07/01/1900,0.00,22.60
7,57,08/01/1900,0.00,22.60
8,57,09/01/1900,0.00,22.60
9,57,10/01/1900,0.00,22.60


### Now add back the simulation details so that we can get sowdate for each simulation

In [27]:
#dfSim
subData = subData.merge(dfSim, on="SimID", how='left')
subData

Unnamed: 0,SimID,date,ZadokStage,avgt,Name,long,lat,variety,sowdate
0,57,01/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
1,57,02/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
2,57,03/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
3,57,04/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
4,57,05/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
5,57,06/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
6,57,07/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
7,57,08/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
8,57,09/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr
9,57,10/01/1900,0.00,22.60,11360_-2830_janz_1-apr,113.60,-28.30,janz,1-apr


In [28]:
#get a subset of the data
filterCols = ['SimID', 'date', 'ZadokStage', 'avgt', 'sowdate']
subData = subData[filterCols]
subData

Unnamed: 0,SimID,date,ZadokStage,avgt,sowdate
0,57,01/01/1900,0.00,22.60,1-apr
1,57,02/01/1900,0.00,22.60,1-apr
2,57,03/01/1900,0.00,22.60,1-apr
3,57,04/01/1900,0.00,22.60,1-apr
4,57,05/01/1900,0.00,22.60,1-apr
5,57,06/01/1900,0.00,22.60,1-apr
6,57,07/01/1900,0.00,22.60,1-apr
7,57,08/01/1900,0.00,22.60,1-apr
8,57,09/01/1900,0.00,22.60,1-apr
9,57,10/01/1900,0.00,22.60,1-apr


In [29]:
#index = subData.index
#subData = pd.DataFrame(subData, index=index, columns = filterCols)
#subData.loc['sowdate']

In [30]:
#update the sowdate to include the year
subData['date'] = pd.to_datetime(subData['date'], format="%d/%m/%Y")
subData['sowdate'] = subData['sowdate'] + '-' + subData['date'].dt.year.map(str)
subData['sowdate'] = pd.to_datetime(subData['sowdate'], format="%d-%b-%Y")
subData


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,SimID,date,ZadokStage,avgt,sowdate
0,57,1900-01-01,0.00,22.60,1900-04-01
1,57,1900-01-02,0.00,22.60,1900-04-01
2,57,1900-01-03,0.00,22.60,1900-04-01
3,57,1900-01-04,0.00,22.60,1900-04-01
4,57,1900-01-05,0.00,22.60,1900-04-01
5,57,1900-01-06,0.00,22.60,1900-04-01
6,57,1900-01-07,0.00,22.60,1900-04-01
7,57,1900-01-08,0.00,22.60,1900-04-01
8,57,1900-01-09,0.00,22.60,1900-04-01
9,57,1900-01-10,0.00,22.60,1900-04-01


### now calculate the cumulative temp info for each simulation

For each SimId, cumulative add the avgt (average Temperature) values between the sowdate and the current date (where the date is > sowdate and the ZadokStage < 70


In [32]:
#def calc_cumulativeTT(df, simId, sowdate, currdate, zadokStage):
#    '''
#    '''
#get the subset based on the simId
#dfSub = subData.loc[subData['SimID'] == 57]
#dfSub

subData['date1'] = subData['date'].dt.strftime('%d/%m/%Y')
subData['sowdate1'] = subData['sowdate'].dt.strftime('%d/%m/%Y')
subData
#

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,SimID,date,ZadokStage,avgt,sowdate,date1,sowdate1
0,57,1900-01-01,0.00,22.60,1900-04-01,01/01/1900,01/04/1900
1,57,1900-01-02,0.00,22.60,1900-04-01,02/01/1900,01/04/1900
2,57,1900-01-03,0.00,22.60,1900-04-01,03/01/1900,01/04/1900
3,57,1900-01-04,0.00,22.60,1900-04-01,04/01/1900,01/04/1900
4,57,1900-01-05,0.00,22.60,1900-04-01,05/01/1900,01/04/1900
5,57,1900-01-06,0.00,22.60,1900-04-01,06/01/1900,01/04/1900
6,57,1900-01-07,0.00,22.60,1900-04-01,07/01/1900,01/04/1900
7,57,1900-01-08,0.00,22.60,1900-04-01,08/01/1900,01/04/1900
8,57,1900-01-09,0.00,22.60,1900-04-01,09/01/1900,01/04/1900
9,57,1900-01-10,0.00,22.60,1900-04-01,10/01/1900,01/04/1900


In [45]:
# Create a copy of the current dataframe so that it can be reset easily
dfSub2 = subData.copy()
dfSub2

Unnamed: 0,SimID,date,ZadokStage,avgt,sowdate,date1,sowdate1
0,57,1900-01-01,0.00,22.60,1900-04-01,01/01/1900,01/04/1900
1,57,1900-01-02,0.00,22.60,1900-04-01,02/01/1900,01/04/1900
2,57,1900-01-03,0.00,22.60,1900-04-01,03/01/1900,01/04/1900
3,57,1900-01-04,0.00,22.60,1900-04-01,04/01/1900,01/04/1900
4,57,1900-01-05,0.00,22.60,1900-04-01,05/01/1900,01/04/1900
5,57,1900-01-06,0.00,22.60,1900-04-01,06/01/1900,01/04/1900
6,57,1900-01-07,0.00,22.60,1900-04-01,07/01/1900,01/04/1900
7,57,1900-01-08,0.00,22.60,1900-04-01,08/01/1900,01/04/1900
8,57,1900-01-09,0.00,22.60,1900-04-01,09/01/1900,01/04/1900
9,57,1900-01-10,0.00,22.60,1900-04-01,10/01/1900,01/04/1900


In [49]:
#save this to a csv, so we can view elsewhere
outCols = ['SimID', 'sowdate', 'date', 'ZadokStage', 'avgt']
outSub = dfSub2[outCols]
filename = apsim_outfiledir + "/sim57_zadok.csv"
outSub.to_csv(filename, encoding='utf-8', index=False)

In [50]:
dfSub2['avgTemp'] = dfSub2['avgt'].where((dfSub2['date'] >= dfSub2['sowdate']) & (dfSub2['ZadokStage'] > 0) & (dfSub2['ZadokStage'] <= 70), 0)
dfSub2['cumAvgTemp'] = dfSub2.groupby(by=['SimID','sowdate'])['avgTemp'].cumsum()
dfSub2

Unnamed: 0,SimID,date,ZadokStage,avgt,sowdate,date1,sowdate1,avgTemp,cumAvgTemp
0,57,1900-01-01,0.00,22.60,1900-04-01,01/01/1900,01/04/1900,0.00,0.00
1,57,1900-01-02,0.00,22.60,1900-04-01,02/01/1900,01/04/1900,0.00,0.00
2,57,1900-01-03,0.00,22.60,1900-04-01,03/01/1900,01/04/1900,0.00,0.00
3,57,1900-01-04,0.00,22.60,1900-04-01,04/01/1900,01/04/1900,0.00,0.00
4,57,1900-01-05,0.00,22.60,1900-04-01,05/01/1900,01/04/1900,0.00,0.00
5,57,1900-01-06,0.00,22.60,1900-04-01,06/01/1900,01/04/1900,0.00,0.00
6,57,1900-01-07,0.00,22.60,1900-04-01,07/01/1900,01/04/1900,0.00,0.00
7,57,1900-01-08,0.00,22.60,1900-04-01,08/01/1900,01/04/1900,0.00,0.00
8,57,1900-01-09,0.00,22.60,1900-04-01,09/01/1900,01/04/1900,0.00,0.00
9,57,1900-01-10,0.00,22.60,1900-04-01,10/01/1900,01/04/1900,0.00,0.00


In [72]:
newData = dfSub2[dfSub2['avgTemp'] > 0]
newData

Unnamed: 0,SimID,date,ZadokStage,avgt,sowdate,date1,sowdate1,avgTemp,cumAvgTemp
90,57,1900-04-01,5.00,20.95,1900-04-01,01/04/1900,01/04/1900,20.95,20.95
91,57,1900-04-02,6.27,20.90,1900-04-01,02/04/1900,01/04/1900,20.90,41.85
92,57,1900-04-03,7.55,20.90,1900-04-01,03/04/1900,01/04/1900,20.90,62.75
93,57,1900-04-04,8.82,20.90,1900-04-01,04/04/1900,01/04/1900,20.90,83.65
94,57,1900-04-05,30.30,20.90,1900-04-01,05/04/1900,01/04/1900,20.90,104.55
95,57,1900-04-06,30.41,20.90,1900-04-01,06/04/1900,01/04/1900,20.90,125.45
96,57,1900-04-07,30.49,20.90,1900-04-01,07/04/1900,01/04/1900,20.90,146.35
97,57,1900-04-08,30.57,20.85,1900-04-01,08/04/1900,01/04/1900,20.85,167.20
98,57,1900-04-09,30.63,20.85,1900-04-01,09/04/1900,01/04/1900,20.85,188.05
99,57,1900-04-10,30.79,20.85,1900-04-01,10/04/1900,01/04/1900,20.85,208.90


In [73]:
newData3 = newData.groupby(['SimID','sowdate'])['cumAvgTemp'].max()
newData3

SimID  sowdate   
57     1900-04-01   1,362.35
       1901-04-01   1,432.05
       1902-04-01   1,324.90
       1903-04-01   1,357.00
       1904-04-01   1,285.85
       1905-04-01   1,342.55
       1906-04-01   1,318.25
       1907-04-01   1,415.90
       1908-04-01   1,453.35
       1909-04-01   1,341.35
       1910-04-01   1,499.35
       1911-04-01   1,446.50
       1912-04-01   1,393.85
       1913-04-01   1,387.85
       1914-04-01   1,398.00
       1915-04-01   1,345.55
       1916-04-01   1,418.10
       1917-04-01   1,386.80
       1918-04-01   1,378.85
       1919-04-01   1,424.20
       1920-04-01   1,395.10
       1921-04-01   1,401.30
       1922-04-01   1,426.20
       1923-04-01   1,426.75
       1924-04-01   1,430.75
       1925-04-01   1,398.95
       1926-04-01   1,422.30
       1927-04-01   1,460.60
       1928-04-01   1,461.00
       1929-04-01   1,400.95
                      ...   
1020   1988-07-29   1,400.70
       1989-07-29   1,423.75
       1990-07-29   1,407

In [81]:
newData4 = newData.groupby(['SimID','sowdate'])['cumAvgTemp'].max().reset_index()
newData4
newData4['sowingdate'] = newData4.sowdate.dt.strftime('%d-%b')
newData4 


Unnamed: 0,SimID,sowdate,cumAvgTemp,sowingdate
0,57,1900-04-01,1362.35,01-Apr
1,57,1901-04-01,1432.05,01-Apr
2,57,1902-04-01,1324.90,01-Apr
3,57,1903-04-01,1357.00,01-Apr
4,57,1904-04-01,1285.85,01-Apr
5,57,1905-04-01,1342.55,01-Apr
6,57,1906-04-01,1318.25,01-Apr
7,57,1907-04-01,1415.90,01-Apr
8,57,1908-04-01,1453.35,01-Apr
9,57,1909-04-01,1341.35,01-Apr


In [84]:
newData5 = newData4.groupby(['SimID','sowingdate'])['cumAvgTemp'].mean().reset_index()
newData5

Unnamed: 0,SimID,sowingdate,cumAvgTemp
0,57,01-Apr,1396.68
1,164,15-Apr,1442.97
2,271,29-Apr,1493.16
3,378,13-May,1519.48
4,485,27-May,1519.54
5,592,03-Jun,1512.17
6,699,17-Jun,1485.16
7,806,01-Jul,1447.85
8,913,15-Jul,1421.15
9,1020,29-Jul,1396.39


In [85]:
filename = apsim_outfiledir + "/sim_zadok_final.csv"
newData5.to_csv(filename, encoding='utf-8', index=False)

In [None]:
to append to the file, need to use mode='a'
#newData5.to_csv(filename, header=False, mode='a')