## Create Data Chunks to facilitate HTCondor Processing

In [1]:
import os, sys
import numpy as np
import pandas as pd
import time
from tools import split, save_hdf

### Load data and create data chunks

In [2]:
def ExtendPeriodIndex(df_in):
    """Return a copy of ``df_in`` on a gap-free monthly ``PeriodIndex``.

    The index of ``df_in`` is parsed as ``'%Y-%m-%d'`` dates, converted to
    monthly periods, and then reindexed over every month from the earliest
    to the latest period, both inclusive. Months that are absent from the
    input appear in the result as rows of NaN.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame whose index holds dates (or date strings) in ``YYYY-MM-DD``
        form. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_in`` indexed by a continuous monthly ``PeriodIndex``.
        An input with an empty index is returned with a converted (still
        empty) index.
    """
    df_out = df_in.copy()
    months = pd.to_datetime(df_out.index, format='%Y-%m-%d').to_period('M')
    df_out.index = months
    if len(months) == 0:
        # Nothing to extend; min()/max() would be NaT and cannot bound a range.
        return df_out
    # Build the target directly from periods. A month-end date_range between
    # the raw min/max dates omits the final month whenever the latest date is
    # not itself a month end, which would drop the last row on reindex.
    full_range = pd.period_range(months.min(), months.max(), freq='M')
    return df_out.reindex(full_range)
    
# Calendar months retained from the gap-filled monthly yield tables
_KEEP_MONTHS = [2, 8]

# Admin-1 yield table: label the column axis 'FNID', extend to a continuous
# monthly PeriodIndex, then keep only the rows whose month is in _KEEP_MONTHS.
yield1 = pd.read_hdf('./data/crop/SO_admin1_maize_yield.hdf')
yield1.columns.name = 'FNID'
yield1 = ExtendPeriodIndex(yield1)
yield1 = yield1.loc[yield1.index.month.isin(_KEEP_MONTHS)]

# Admin-2 yield table: same treatment.
yield2 = pd.read_hdf('./data/crop/SO_admin2_maize_yield.hdf')
yield2.columns.name = 'FNID'
yield2 = ExtendPeriodIndex(yield2)
yield2 = yield2.loc[yield2.index.month.isin(_KEEP_MONTHS)]

# `crop` is the table used by the cells below; it is a copy of the admin-2 table.
crop = yield2.copy()

# Earth-observation tables (admin-2 files)
prcp = pd.read_hdf('./data/earthobs/SO_admin2_prcp.hdf')
etos = pd.read_hdf('./data/earthobs/SO_admin2_etos.hdf')
smos = pd.read_hdf('./data/earthobs/SO_admin2_smos.hdf')
ndvi = pd.read_hdf('./data/earthobs/SO_admin2_ndvi.hdf')

In [8]:
# Number of non-missing values in each row of `crop`, shown as a NumPy array
crop.notna().sum(axis=1).to_numpy()

array([16,  0,  2, 21,  2, 28, 19, 21, 19, 29, 21, 30,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, 22, 14, 23, 23, 26, 25, 27, 31, 26, 35,
       30, 34, 29, 29, 29, 31, 30, 34, 36, 34, 26, 31, 31, 35, 34, 33, 29,
       33, 35, 33, 24, 36, 37, 33, 36, 32, 34, 35, 32, 32, 34, 33, 29, 33,
        2, 30])

In [3]:
# Write one pair of HDF files per chunk: the crop columns for the chunk's IDs
# and the matching Earth-observation predictors for the same IDs.
# The number of chunks equals the number of columns in `crop`.
nchunk = crop.shape[1]
# `split` is a helper imported from `tools`; presumably it partitions the column
# list into `nchunk` sub-lists — TODO confirm against tools.split.
pointList = list(split(list(crop.columns), nchunk))
# Predictor labels, in the same order as the tables concatenated in the loop below
predList = ['prcp', 'etos', 'smos', 'ndvi']
# Create data chunks with multi-index
for i in range(nchunk):
    # Create a DataFrame per chunk
    # dfPred1: empty frame on crop's index, one column per ID in this chunk
    dfPred1 = pd.DataFrame(index = crop.index, columns=pointList[i])
    # dfPred2: empty frame on prcp's index with two-level (pid, predictor) columns
    mcols2 = pd.MultiIndex.from_product([pointList[i], predList], names=['pid', ''])
    dfPred2 = pd.DataFrame(index = prcp.index, columns=mcols2)
    # Assign data to each point
    for j in pointList[i]:
        dfPred1[j] = crop[j]
        # Place the four predictor series for ID j side by side and relabel them
        dfTemp = pd.concat([prcp[j], etos[j], smos[j], ndvi[j]], axis=1)
        dfTemp.columns = predList
        # Fill the predictor sub-columns under top-level key j.
        # NOTE(review): assumes etos/smos/ndvi share prcp's index — confirm.
        dfPred2[j] = dfTemp
    # Save as HDF format
    # `save_hdf` is a helper imported from `tools`; file names carry the chunk number i.
    save_hdf('./data/data_in/dfCropDist{:d}.hdf'.format(i), dfPred1, set_print=False)
    save_hdf('./data/data_in/dfPredDist{:d}.hdf'.format(i), dfPred2, set_print=False)
print('All chunks of data is saved.')

All chunks of data is saved.


In [4]:
# Display the per-chunk ID lists; pointList[i] holds the IDs written to chunk file i
pointList

[['SO1990A21201'],
 ['SO1990A21203'],
 ['SO1990A21301'],
 ['SO1990A21303'],
 ['SO1990A21304'],
 ['SO1990A21501'],
 ['SO1990A22001'],
 ['SO1990A22002'],
 ['SO1990A22003'],
 ['SO1990A22101'],
 ['SO1990A22102'],
 ['SO1990A22103'],
 ['SO1990A22301'],
 ['SO1990A22302'],
 ['SO1990A22303'],
 ['SO1990A22304'],
 ['SO1990A22305'],
 ['SO1990A22306'],
 ['SO1990A22307'],
 ['SO1990A22401'],
 ['SO1990A22402'],
 ['SO1990A22403'],
 ['SO1990A22404'],
 ['SO1990A22501'],
 ['SO1990A22502'],
 ['SO1990A22503'],
 ['SO1990A22504'],
 ['SO1990A22505'],
 ['SO1990A22601'],
 ['SO1990A22602'],
 ['SO1990A22603'],
 ['SO1990A22605'],
 ['SO1990A22606'],
 ['SO1990A22701'],
 ['SO1990A22702'],
 ['SO1990A22703'],
 ['SO1990A22801'],
 ['SO1990A22802'],
 ['SO1990A22803'],
 ['SO1990A22804']]