### Read the station data from GHCN archive ###

Data obtained from http://www.ncdc.noaa.gov/cdo-web/

In [None]:
from __future__ import print_function
import numpy as np
import numpy.ma as ma
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
%matplotlib inline

In [None]:
def get_date(date_number):
    """
    Turn the integer DATE value of a GHCN record (YYYYMMDD) into a timestamp.

    date_number : int64 (or anything whose str() starts with the eight
                  digits YYYYMMDD), e.g. 20150102.

    Returns a pd.Timestamp, which is a subclass of datetime.datetime, so it
    can be used wherever the previous pd.datetime result was used.
    Raises ValueError if the digits do not form a valid calendar date.
    """
    dstring = str(date_number)
    # pd.datetime was an alias of datetime.datetime that has been removed from
    # pandas; pd.Timestamp is the supported equivalent.
    return pd.Timestamp(year=int(dstring[0:4]),
                        month=int(dstring[4:6]),
                        day=int(dstring[6:8]))

def get_df(fnm, var, no_missing = True):
    """
    Create a dataframe for a single station, with a time index, for a single
    variable of data given as a key word (e.g. PRCP, TMAX, TMIN).

    fnm        : path and name of the station .csv file. The file must have
                 STATION and DATE columns plus a column named `var`.
    var        : name of the data column to extract.
    no_missing : Bool; when True every value that is not greater than -99
                 is replaced by NaN, when False the raw values are returned.

    Returns a one-column DataFrame indexed by date. The column is named after
    the first STATION entry with its first six characters removed.
    """
    df = pd.read_csv(fnm)
    # Parse all YYYYMMDD values in one vectorised call instead of one Python
    # call per row. Only the first eight characters are used, and .values
    # keeps the resulting index unnamed.
    dt_indx = pd.to_datetime(df["DATE"].astype(str).str[:8],
                             format="%Y%m%d").values
    # NOTE(review): the [6:] slice presumably strips a "GHCND:" prefix from
    # the station identifier -- confirm against the downloaded files.
    station_id = df["STATION"].iloc[0][6:]
    station_df = pd.DataFrame(data=df[var].values,
                              index=dt_indx, columns=[station_id])
    if no_missing:
        # Keep values above -99 and turn everything else into NaN.
        return station_df.where(station_df > -99.)
    return station_df

def get_combined_df(fpth, var):
    """
    From a given file path pattern and variable, extract data from all
    matching .csv files and place it in a single dataframe object.

    fpth : glob pattern selecting the station files (e.g. "Data/*.csv").
    var  : name of the data column to read from every file.

    Returns the per-station frames from get_df (missing values masked)
    joined column-wise, one column per file, ordered by file path.
    Raises ValueError when no file matches `fpth`.
    """
    # glob returns files in arbitrary order; sorting keeps the column order
    # reproducible between runs and machines.
    flist = sorted(glob.glob(fpth))
    if not flist:
        raise ValueError("No files match {0!r}".format(fpth))
    # The frames go straight into a list. Keying them by f[5:] assumed a
    # five-character directory prefix and could map several files to one key,
    # silently dropping stations.
    return pd.concat([get_df(fnm=f, var=var, no_missing=True) for f in flist],
                     axis=1)

In [None]:
%%time
# One combined frame per GHCN variable, built from every .csv file in Data/.
# Arguments are passed positionally: (file pattern, variable name).
df_tmax = get_combined_df("Data/*.csv", "TMAX")  # TMAX column of each file
df_tmin = get_combined_df("Data/*.csv", "TMIN")  # TMIN column of each file
df_prcp = get_combined_df("Data/*.csv", "PRCP")  # PRCP column of each file

In [None]:
# Example: pick the first station column, keep only its values above -1,
# and scatter them against their dates.
station = df_prcp.columns[0]
mask = df_prcp[station] > -1
plt.plot(df_prcp.loc[mask, station].index,
         df_prcp.loc[mask, station],
         '.', alpha=0.5)
plt.title("Station {0:s}".format(station))

In [None]:
#X=pd.concat([df_prcp[key] for key in df_prcp.keys()],axis=1)
#X.tail()

In [None]:
#mask = X == -9999.
#X[mask] = np.nan
#X[mask].tail()

In [None]:
#X.mean()

In [None]:
#X.describe()

In [None]:
#X[X > 500.]

In [None]:
#plt.plot(X.index, X.values)
#plt.ylabel(r"$\frac{1}{10}$mm day$^{-1}$")
#plt.xlabel(r"Days")
#plt.legend(df_prcp.keys())
#plt.title('GHCN EAST AFRICA DAILY RAINFALL TOTALS', fontsize=18)

In [None]:
### Annual Mean ('A') Rainfall Pattern using Resampling.###
#X_mm = X.resample('A')#, how=['mean', np.min, np.max])
#X_mm['1961':'2015'].plot(subplots=True)
#X_mm['1981':'2015'].plot()
#plt.title("ANNUAL $\mu$ RAINFALL")
#plt.ylabel("PRECIPITATION (MM)")
#plt.xlabel("YEARS")

In [None]:
#X_mm.ET000063331

In [None]:
#X_mm.KE000063612['1961':'2015'].plot()
#plt.plot(X_mm.ET000063331)
#plt.title("ANNUAL $\mu$ RAINFALL")
#plt.ylabel("PRECIPITATION (MM)")
#plt.xlabel("YEARS")

In [None]:
#mask = df_prcp.ET000063331 != -9999
#plt.plot(df_prcp.ET000063331.index[mask], df_prcp.ET000063331[mask].values,'b',alpha=0.75)

In [None]:
#DOY mean (μ) 
#doy_mean=[]
#for doy in range(366):
#    index = X.index.dayofyear == doy+1 
#    doy_mean.append(np.nanmean(X[index]))

In [None]:
#doy_mean[0]

In [None]:
#plt.errorbar(range(366),doy_mean,xerr=None)
#plt.xlim(0,max(range(366)))
#plt.title("DOY $\mu$ rainfall")
#plt.ylabel("Precip (mm)")
#plt.xlabel("DOY")

In [None]:
### Calculate day-of-year anomalies
##  Anomalies = Observation - Climatology

# X and doy_mean were only defined in commented-out cells, so this cell
# raised NameError on a fresh kernel. Both are rebuilt here from df_prcp.
X = df_prcp  # built by get_combined_df, so values <= -99 are already NaN

# Climatology: for each calendar day 1..366, the mean over every station and
# every year (NaN-aware). Entry i of doy_mean belongs to day-of-year i + 1.
day_of_year = np.asarray(X.index.dayofyear)
doy_mean = [np.nanmean(X.values[day_of_year == doy]) for doy in range(1, 367)]

# One array per date (one entry per station): observation minus the
# climatological value for that date's day of year.
climatology_per_row = np.asarray(doy_mean)[day_of_year - 1]
anomalies = list(X.values - climatology_per_row[:, np.newaxis])

In [None]:
#anomalies[0]

In [None]:
#plt.plot(X.index.dayofyear,anomalies)
#X.index.anomaly['1961':'2015'].plot()

In [None]:
# Example of how to mask data and take a quick look
#mask = df.PRCP != -9999
#plt.plot(df.PRCP[mask])
#plt.show()

In [None]:
#print(df.DATE[0])
#print(type(df.DATE[0]))
#df.DATE[0]