### Read the station data from GHCN archive ###

Data obtained from http://www.ncdc.noaa.gov/cdo-web/

In [None]:
from __future__ import print_function, division
import numpy as np
import numpy.ma as ma
import pandas as pd
import matplotlib.pyplot as plt
import math
#import seaborn as sns
import os
import glob
%matplotlib inline

In [None]:
def get_date(date_number):
    """
    Convert a GHCN DATE value (eight digits, YYYYMMDD) into a timestamp.

    Parameters
    ----------
    date_number : int or str
        Date encoded as YYYYMMDD, e.g. 19610105. Anything whose ``str()``
        form starts with those eight digits is accepted.

    Returns
    -------
    pd.Timestamp
        Midnight on the given day. ``pd.Timestamp`` subclasses
        ``datetime.datetime``, so callers expecting a datetime keep working.

    Raises
    ------
    ValueError
        If the digits are missing, non-numeric, or not a valid calendar date.
    """
    dstring = str(date_number)
    # pd.datetime was only an alias of datetime.datetime and no longer exists
    # in pandas >= 2.0; pd.Timestamp is the supported equivalent.
    return pd.Timestamp(year=int(dstring[0:4]),
                        month=int(dstring[4:6]),
                        day=int(dstring[6:8]))

def get_df(fnm, var, no_missing = True):
    """
    Build a single-column, date-indexed dataframe for one station and one
    variable from a GHCN daily CSV file.

    Parameters
    ----------
    fnm : str
        Path of the CSV file. It must contain STATION and DATE columns plus
        the column named by `var`; DATE holds YYYYMMDD values.
    var : str
        Name of the data column to extract (e.g. 'PRCP', 'TMAX', 'TMIN').
        'PRCP' values are divided by 10 to express them in mm.
    no_missing : bool, optional
        If True (default), every value that is not greater than -99 is
        replaced by NaN (-99 is used as a catch-all threshold for the GHCN
        missing-data flag).

    Returns
    -------
    pd.DataFrame
        One column, named after the first STATION entry with its first six
        characters removed, indexed by date.
    """
    df = pd.read_csv(fnm)
    # Parse the whole DATE column in one vectorized call instead of one
    # Python-level conversion per row.
    dt_indx = pd.to_datetime(df["DATE"].astype(str).values, format="%Y%m%d")
    data_vals = df[var].values
    # Compare by value: `is` tests object identity and only matched by the
    # accident of string interning.
    if var == 'PRCP':
        data_vals = data_vals / 10.  # This is to convert precip data to mm
    # presumably the six dropped characters are the "GHCND:" prefix of the
    # station id -- TODO confirm against the data files
    station_id = df["STATION"].iloc[0][6:]
    station_df = pd.DataFrame(data=data_vals, index=dt_indx,
                              columns=[station_id])
    if no_missing:
        # Boolean-frame indexing keeps the shape and puts NaN where False.
        return station_df[station_df > -99.]
    return station_df

def get_combined_df(fpth, var):
    """
    Read one variable from every file matching a glob pattern and combine the
    per-station frames into a single dataframe, one column per file.

    Parameters
    ----------
    fpth : str
        Glob pattern selecting the station CSV files (e.g. "Data/*.csv").
    var : str
        Name of the data column to extract from each file; passed to get_df.

    Returns
    -------
    pd.DataFrame
        Column-wise concatenation (axis=1) of the frames returned by get_df,
        in sorted file-name order.

    Raises
    ------
    ValueError
        If no file matches `fpth`.
    """
    # glob returns files in arbitrary order; sort so the column order is
    # reproducible between runs and machines.
    flist = sorted(glob.glob(fpth))
    if not flist:
        raise ValueError("No files match pattern {0!r}".format(fpth))
    # Collect frames in a list: keying a dict on a slice of the path could
    # make two files share a key and silently drop one of them.
    frames = [get_df(fnm=f, var=var, no_missing=True) for f in flist]
    return pd.concat(frames, axis=1)

Call `get_combined_df()` once per variable (TMAX, TMIN, PRCP) to build a dataframe from all the CSV files in the `Data/` folder.

In [None]:
%%time
df_tmax = get_combined_df(fpth="Data/*.csv",var="TMAX")
df_tmin = get_combined_df(fpth="Data/*.csv",var="TMIN")
df_prcp = get_combined_df(fpth="Data/*.csv",var="PRCP")

### Start of analysis ###

Plot time series of precipitation for all stations, and also accumulate the data and plot the average rainfall.

In [None]:
# Example: select one station's column from the combined frame and plot its
# daily values as points.
station = df_prcp.columns[20]
station_series = df_prcp[station]
plt.plot(station_series.index, station_series, '.', alpha=0.5)
plt.title("Station {0:s}".format(station))

Time series plots

In [None]:
def daily_mean(date, df):
    """
    Return the mean and standard deviation over all stations on one day,
    ignoring NaN values.

    Parameters
    ----------
    date : index label (e.g. pd.Timestamp)
        Row label of `df` identifying the day.
    df : pd.DataFrame
        Date-indexed frame with one column per station.

    Returns
    -------
    tuple of (float, float)
        ``np.nanmean`` and ``np.nanstd`` (default ddof=0) of that day's
        values. Both are NaN, with a RuntimeWarning, when every station is
        NaN on that day.

    Raises
    ------
    KeyError
        If `date` is not present in the index of `df`.
    """
    # Use the arguments rather than notebook globals so the function works
    # for any frame and any day.
    day_values = np.asarray(df.loc[date], dtype=float)
    return np.nanmean(day_values), np.nanstd(day_values)
    
# Per-day statistics across stations: one entry per row of df_prcp.
#   means -- mean of the non-NaN station values
#   stds  -- standard deviation (np.nanstd, ddof=0)
#   sem   -- standard error of the mean, std / sqrt(n - 1), where n is the
#            number of stations with a non-NaN value that day
prcp_by_day = df_prcp.values.astype(float)        # rows: days, columns: stations
n_obs = np.sum(~np.isnan(prcp_by_day), axis=1)    # stations reporting each day

day_mean = np.nanmean(prcp_by_day, axis=1)
day_std = np.nanstd(prcp_by_day, axis=1)
# The SEM is undefined with fewer than two observations; store NaN there and
# silence the divide/invalid warnings raised while evaluating those entries.
with np.errstate(invalid="ignore", divide="ignore"):
    day_sem = np.where(n_obs > 1, day_std / np.sqrt(n_obs - 1), np.nan)

# Kept as plain lists, as before, for the plotting cells that follow.
means = day_mean.tolist()
stds = day_std.tolist()
sem = day_sem.tolist()

In [None]:
# Daily cross-station mean precipitation with SEM error bars.
# yerr must be the per-day `sem` list; `df_prcp.sem` is a DataFrame method,
# not data. There are no x error bars, so no xerr argument is passed.
plt.errorbar(df_prcp.index, means, yerr=sem, alpha=0.75)
# Raw string keeps the backslash of \mu without an invalid-escape warning.
plt.title(r"East Africa Daily Precipitation Mean ($\mu$) ")
plt.ylabel("Precipitation (mm) day$^{-1}$")
plt.xlabel("Year")

In [None]:
# Daily cross-station mean (red) and standard deviation (green), labelled so
# the figure can be read on its own.
plt.plot(df_prcp.index, means, '.r', label="Mean")
plt.plot(df_prcp.index, stds, '.g', label="Standard deviation")
plt.ylabel("Precipitation (mm) day$^{-1}$")
plt.xlabel("Year")
plt.legend()

In [None]:
# Sanity check: the daily means should have one entry per row of df_prcp.
np.asarray(means).shape

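Calculate the day-of-year (DOY) mean (μ) and its uncertainty, expressed as the Standard Error of the Mean (SEM).
The SEM is defined as $$ SEM = \frac {\sigma} {\sqrt{n-1}}$$ where $\sigma$ is the standard deviation of the observations and $n$ is the number of observations.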

In [None]:
# Day-of-year (DOY) climatology pooled over all stations and all years.
# Position i of each list corresponds to day-of-year i + 1 (1..366).
prcp_values = df_prcp.values.astype(float)        # rows: days, columns: stations
prcp_doy = np.asarray(df_prcp.index.dayofyear)    # day-of-year of every row

doy_mean = []
doy_sem = []
for doy in range(1, 367):
    doy_values = prcp_values[prcp_doy == doy]
    # n is the number of actual (non-NaN) observations, not the number of
    # rows, so missing data does not shrink the reported uncertainty.
    n_obs = np.count_nonzero(~np.isnan(doy_values))
    doy_mean.append(np.nanmean(doy_values) if n_obs > 0 else np.nan)
    doy_sem.append(np.nanstd(doy_values) / np.sqrt(n_obs - 1)
                   if n_obs > 1 else np.nan)

# Running total of the DOY means through the year; NaN days count as zero.
# NOTE(review): assumes doy_csum is meant to be the accumulated climatological
# rainfall -- confirm the intent before using it downstream.
doy_csum = np.nancumsum(doy_mean).tolist()
  

In [None]:
# DOY mean rainfall with SEM error bars. doy_mean[i] belongs to day i + 1,
# so the x axis runs over 1..366 to match the "Day of Year" label.
days_of_year = np.arange(1, 367)
plt.errorbar(days_of_year, doy_mean, yerr=doy_sem, alpha=0.6)
plt.xlim(1, 366)
# Raw string keeps the backslash of \mu without an invalid-escape warning.
plt.title(r"East Africa DOY Mean ($\mu$) Rainfall")
plt.ylabel("Precipitation (mm)")
plt.xlabel("Day of Year")

Correlation Matrix

In [None]:
# Pairwise Pearson correlation between the station precipitation series.
# Store the result under corr_matrix (rather than aliasing the raw data) and
# end the cell with it so the matrix is displayed.
corr_matrix = df_prcp.corr(method='pearson', min_periods=1)
corr_matrix

In [None]:
# Daily precipitation for every station, one line per column of df_prcp.
plt.plot(df_prcp.index, df_prcp.values)
# get_df already divides PRCP by 10, so the plotted values are in mm, not in
# tenths of a millimetre.
plt.ylabel(r"mm day$^{-1}$")
plt.xlabel(r"Days")
plt.legend(df_prcp.keys())
plt.title('GHCN EAST AFRICA DAILY RAINFALL TOTALS', fontsize=18)

Annual Mean ('A') Rainfall Pattern using Resampling.


In [None]:
### Annual mean rainfall for each station
# resample() only groups the rows; the explicit .mean() produces the yearly
# averages (pandas >= 0.18 no longer aggregates implicitly).
# 'A' is the year-end frequency alias; pandas >= 2.2 names it 'YE'.
ann_mean = df_prcp.resample('A').mean()
# .loc makes it explicit that the string slices select rows by date.
ann_mean.loc['1961':'2015'].plot(subplots=True)
ann_mean.loc['1981':'2015'].plot()
# The title and labels below apply to the second (single-axes) figure.
plt.title(r"ANNUAL $\mu$ RAINFALL")
plt.ylabel("PRECIPITATION (MM)")
plt.xlabel("YEARS")

In [None]:
### Calculate anomalies relative to the day-of-year (DOY) climatology
##  Anomalies = Observation - Climatology
# NOTE(review): `X` is not defined anywhere in the visible notebook, so on a
# fresh kernel this cell raises NameError. Presumably X is df_prcp or a
# single-station slice of it -- confirm and define it before this cell.

# One entry per row of X: that row's values minus the DOY mean for its date.
anomalies = []
for n,day in enumerate(X.values):
    #print("Index: {0} Date: {1} Value: {0:3.3f}".format(n,X.index[n].date(),day[0]))
    doyi = X.index[n].dayofyear -1 # zero-based position into doy_mean (day 1 -> index 0)
    #print(n, doy_mean[doyi])
    anomalies.append(day - doy_mean[doyi])

In [None]:
#anomalies[0]

In [None]:
#plt.plot(X.index.dayofyear,anomalies)
#X.index.anomaly['1961':'2015'].plot()

In [None]:
#print(df.DATE[0])
#print(type(df.DATE[0]))
#df.DATE[0]