In [2]:
import matplotlib.pyplot as plt # python graphing library
import numpy as np              # python library for vector/array manipulations
import os                       # interact with the operating system
import pandas as pd             # DataFrame programming in python

# allow plots to appear in this notebook
%matplotlib inline              
plt.rcParams['figure.figsize'] = 10, 8 # default figure size as (width, height), in inches

In [3]:
# Directory that main() scans for raw yearly CSV files; the cleaned
# clean_<year>.csv files are written back into this same directory.
DATA_DIR = '../../data/cluster/historic/'
# Maps raw column names found in the source CSVs to the descriptive column
# names used in the cleaned output (applied in main() via DataFrame.rename).
# Raw columns that are not listed here keep their original names.
FIELD_MAP = {
    'cpja_slice_msk': 'precip_JunAug',
    'cpos_slice_msk': 'precip_OctSep',
    'gsp_slice_msk':  'precip_growingSeason',
    'map_slice_msk':  'precip_meanAnnual',
    'mat_slice_msk':  'meanTemp_Annual',
    'mta_slice_msk':  'meanTemp_Aug',
    'mtaa_slice_msk': 'meanTemp_AprAug',
    'ntj_slice_msk':  'meanMinTemp_Jan',
    'ntm_slice_msk':  'meanMinTemp_Mar',
    'nto_slice_msk':  'meanMinTemp_Oct',
    'ntw_slice_msk':  'meanMinTemp_DecFeb',
    'pja_slice_msk':  'precipPrevious_JunAug',
    # NOTE(review): 'precipPreious' looks like a typo for 'precipPrevious'.
    # It is left as-is because this string becomes a column header in the
    # output CSVs; confirm no downstream code relies on it before fixing.
    'pos_slice_msk':  'precipPreious_OctSep',
    'vgp_slice_msk':  'varPrecip_growingSeason',
    # main() later recodes columns starting with 'vegetation' to 0/1.
    'vgt_mat_msk':    'vegetation',
    'xta_slice_msk':  'meanMaxTemp_Aug',
    'etopo1':         'elev_etopo1'}

In [4]:
def read_and_format_data(file_path):
    """Load one raw CSV and strip the columns that are not carried forward.

    Of all columns whose name starts with ``'vgt'`` only the first one (in
    file order) is kept; the remaining ``'vgt*'`` columns are dropped together
    with ``'Unnamed: 0'``, ``'srtm30'`` and ``'mask'``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents without the dropped columns.

    Raises
    ------
    KeyError
        If ``'Unnamed: 0'``, ``'srtm30'`` or ``'mask'`` is absent from the
        file (``DataFrame.drop`` is called with its default error handling).
    """
    raw = pd.read_csv(file_path)
    vgt_columns = [name for name in raw.columns if name.startswith('vgt')]
    # Everything after the first 'vgt*' column is treated as redundant.
    unwanted = vgt_columns[1:] + ['Unnamed: 0', 'srtm30', 'mask']
    return raw.drop(unwanted, axis='columns')

In [5]:
def mask_to_binary(mask, dataframe):
    """Return a copy of ``dataframe`` with its mask columns recoded to 0/1.

    Every column whose name starts with ``mask`` is replaced by an integer
    indicator: 0 where the original value is missing (NaN/None) and 1 where
    any value is present. All other columns are left unchanged, and the
    input frame itself is not modified.

    Parameters
    ----------
    mask : str
        Prefix identifying the columns to convert.
    dataframe : pandas.DataFrame
        Frame containing the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching columns converted to 0/1 integers.
    """
    df = dataframe.copy()
    mask_columns = [col for col in df.columns if col.startswith(mask)]
    for col in mask_columns:
        # Vectorised missing-value test: unlike a per-element np.isnan call it
        # also works for non-float columns (e.g. strings or None in an object
        # column) and avoids a Python-level call for every row.
        df[col] = df[col].notna().astype('int64')
    return df

In [6]:
def main(min_year=2000):
    """Convert the raw yearly CSV files in DATA_DIR into cleaned CSV files.

    A file counts as raw input when its name ends in ``.csv`` and does not
    contain ``clean`` (so outputs written by an earlier run are not processed
    again). The year is read from the four characters directly before the
    ``.csv`` extension.

    For each selected file the data are loaded with ``read_and_format_data``,
    a ``year`` column is added, columns are renamed through ``FIELD_MAP``,
    the column(s) starting with ``vegetation`` are recoded to 0/1 with
    ``mask_to_binary``, and the result is written to
    ``clean_<year>.csv`` in DATA_DIR without the index.

    Parameters
    ----------
    min_year : int, optional
        Files for years before this one are skipped. Defaults to 2000.
    """
    # Sort so the processing order (and the log) is the same on every run;
    # os.listdir returns entries in arbitrary order.
    raw_files = sorted(f for f in os.listdir(DATA_DIR)
                       if f.endswith('.csv') and 'clean' not in f)
    for f in raw_files:
        year_text = f[-8:-4]
        if len(year_text) != 4 or not year_text.isdecimal():
            # Not a "<name><yyyy>.csv" file: skip it rather than letting
            # int() abort the conversion of all remaining files.
            print('Skipping %s (no 4-digit year before .csv)' % f)
            continue
        year = int(year_text)
        if year < min_year:
            continue
        # Only announce files that are actually converted.
        print('Converting %s' % f)
        in_path = os.path.join(DATA_DIR, f)
        out_path = os.path.join(DATA_DIR, 'clean_%d.csv' % year)
        data = read_and_format_data(in_path)
        data['year'] = year
        data = data.rename(columns=FIELD_MAP)
        data = mask_to_binary('vegetation', data)
        print('Writing reformatted data to %s...' % out_path)
        data.to_csv(out_path, index=False)

In [7]:
# Run the conversion: reads the raw CSVs in DATA_DIR and writes clean_<year>.csv files.
main()

Converting climatic_variables_longlat_var_2014.csv
Writing reformatted data to ../../data/cluster/historic/clean_2014.csv...


In [7]:
# Sanity check: reload the cleaned 2014 file written by main() and preview
# its first rows to confirm the renamed columns and the added 'year' column.
data = pd.read_csv(DATA_DIR + 'clean_2014.csv')
data.head()

Unnamed: 0,x,y,lon,lat,elev_etopo1,meanTemp_Annual,vegetation,meanTemp_AprAug,meanTemp_Aug,meanMinTemp_DecFeb,...,meanMinTemp_Mar,meanMaxTemp_Aug,precip_meanAnnual,precip_JunAug,precipPrevious_JunAug,precip_OctSep,precipPreious_OctSep,precip_growingSeason,varPrecip_growingSeason,year
0,2690000.0,-4510000.0,-77.292019,5.124395,67,26.812778,0,27.142666,26.916667,22.978334,...,23.381667,30.876667,535.318058,2577.260016,1403.930003,12906.033359,6482.216669,1364.280025,0.138894,2014
1,2700000.0,-4510000.0,-77.208582,5.099891,61,27.079143,0,27.427266,27.184194,23.197228,...,23.637626,31.199392,547.503372,2594.673175,1409.924892,13220.560694,6650.520226,1429.550831,0.147905,2014
2,2710000.0,-4510000.0,-77.125153,5.075297,67,27.102833,0,27.424797,27.179896,23.270616,...,23.64945,31.303671,555.477352,2615.540579,1420.162854,13418.755609,6753.027381,1459.028055,0.152029,2014
3,2720000.0,-4510000.0,-77.041733,5.050615,26,27.376301,0,27.702362,27.475579,23.487258,...,23.90979,31.707875,566.427592,2648.11079,1436.123514,13688.704153,6891.573054,1495.428626,0.158797,2014
4,2730000.0,-4510000.0,-76.958322,5.025843,43,27.141213,0,27.479962,27.271241,23.182578,...,23.568634,31.612004,567.604844,2610.392888,1416.810746,13724.426062,6913.167928,1503.757309,0.1684,2014
