# Process the Summary Files

This script processes the *_summary.csv files.  It iterates through all of the files and generates a single file for a specific growth phase and year.

In [1]:
#Import the required libraries
import sys
import os
import datetime
import numpy as np
import pandas as pd

In [2]:
# Directory that is scanned for the *_summary.csv files; the file list and the
# per-phase/year output files are written to this same directory.
sourcedir = "/OSM/CBR/AG_WHEATTEMP/work/output"

In [3]:
# Build a one-column table holding the full path of every summary file found
# in the source directory, in sorted order, and preview the first few rows.
filelist_df = pd.DataFrame({
    'filename': sorted(
        sourcedir+'/'+entry
        for entry in os.listdir(sourcedir)
        if entry.endswith('_summary.csv')
    )
})
print(filelist_df.head())


                                            filename
0  /OSM/CBR/AG_WHEATTEMP/work/output/113.60_-28.3...
1  /OSM/CBR/AG_WHEATTEMP/work/output/113.70_-28.4...
2  /OSM/CBR/AG_WHEATTEMP/work/output/113.70_-28.5...
3  /OSM/CBR/AG_WHEATTEMP/work/output/113.75_-28.4...
4  /OSM/CBR/AG_WHEATTEMP/work/output/113.75_-28.5...


In [6]:
# Save the list of summary files as plain text: one path per line, with no
# header row and no index column.
outfile = "{}/filelist.txt".format(sourcedir)
filelist_df.to_csv(outfile, index=False, header=False)

In [None]:
# Re-load the saved file list (one path per line, no header row) from the
# source directory.  The path is defined here because no 'outfilelist'
# variable exists at notebook level.
outfilelist = sourcedir + "/filelist.txt"
filelist_df = pd.read_csv(outfilelist, header=None)
filelist_df.columns=['filename']
filelist_df.head()

In [9]:
def process_summary_files(filelist, filter_phase, filter_year):
    '''
    extracts the rows for one growth phase and sowing year from every summary
    file named in the file list, and writes them to a single csv file:
        <sourcedir>/<filter_phase>_<filter_year>.csv

    The output file is created afresh on every call, so processing the same
    phase and year again replaces the earlier result instead of appending
    duplicate rows to it.

    filelist     : name of a file in sourcedir that lists one summary file path
                   per line (no header row)
    filter_phase : value of the 'phases' column to keep
    filter_year  : sowing year to keep (the year part of the 'sowingdate' column)
    '''

    outfilelist = sourcedir + "/" + filelist
    filelist_df = pd.read_csv(outfilelist, header=None)
    filelist_df.columns = ['filename']

    cols = ['SimID', 'variety', 'long', 'lat', 'sowdate', 'phases', 'dayCount',
            'maxTemp', 'avgTemp', 'days>=30', 'days>=32']

    outfile = sourcedir + "/" + filter_phase + "_" + str(filter_year) + ".csv"

    # TODO: look at a different file format for the collated output (fst package ?)

    # Open the output once, in write mode, and stream every file's rows into it.
    # newline='' leaves the line endings to the csv writer, as happens when
    # pandas opens a path itself.
    with open(outfile, 'w', encoding='utf-8', newline='') as out_handle:
        for position, filename in enumerate(filelist_df.filename):
            dfData = pd.read_csv(filename)

            # copy the filtered rows so the columns added below go onto an
            # independent frame rather than a slice of the original
            dfData = dfData[dfData['phases'] == filter_phase].copy()
            dfData['sowingdate'] = pd.to_datetime(dfData['sowingdate'], format="%Y-%m-%d")
            dfData['year'] = dfData['sowingdate'].dt.year
            dfData['sowdate'] = dfData['sowingdate'].dt.strftime("%d-%B")
            dfData = dfData[dfData['year'] == filter_year]

            # keep the reporting columns only, and give the threshold counts
            # names without the '>=' characters
            dfData = dfData[cols].rename(
                columns={'days>=30': 'daysGTE30', 'days>=32': 'daysGTE32'})

            # only the first file contributes the header row
            dfData.to_csv(out_handle, header=(position == 0), index=False)
        
    

In [10]:
# default inputs for this run: the saved file list, plus the growth phase and
# sowing year to extract
filelist = "filelist.txt"
filter_phase = "07_GrainFilling"
filter_year = 2017

# report the wall-clock time on either side of the run
print("processing started at ", datetime.datetime.now())
process_summary_files(filelist=filelist, filter_phase=filter_phase, filter_year=filter_year)
print("Processs completed", datetime.datetime.now())

processing started at  2018-08-08 13:53:51.811575
Processs completed 2018-08-08 14:35:48.688472
