### Read the station data from GHCN archive ###

Data obtained from http://www.ncdc.noaa.gov/cdo-web/

In [None]:
from __future__ import print_function, division
import numpy as np
import numpy.ma as ma
import pandas as pd
import matplotlib.pyplot as plt
import math
#import seaborn as sns
import os
import glob
%matplotlib inline

In [None]:
def get_date(date_number):
    """
    Turn the int64 value from the DATE column of GHCN into a timestamp.

    Parameters
    ----------
    date_number : int or str
        Date encoded as YYYYMMDD (e.g. 20150831). Only the first eight
        characters of its string form are read.

    Returns
    -------
    pd.Timestamp
        The calendar date (``pd.Timestamp`` subclasses ``datetime.datetime``).

    Raises
    ------
    ValueError
        If the first eight characters do not form a valid calendar date.
    """
    dstring = str(date_number)
    # pd.datetime was only an alias of datetime.datetime; it is deprecated and
    # no longer available in current pandas, so build a Timestamp instead.
    return pd.Timestamp(year=int(dstring[0:4]),
                        month=int(dstring[4:6]),
                        day=int(dstring[6:8]))

def get_df(fnm, var, no_missing = True):
    """
    Create a dataframe for a single station, with a time index, for a single
    variable of data given as a key word (e.g. PRCP, TMAX, TMIN).

    Parameters
    ----------
    fnm : str
        File path and name of the station CSV. The file must contain the
        columns STATION, DATE (YYYYMMDD) and the requested variable.
    var : str
        Name of the column to extract. 'PRCP' values are divided by 10 to
        convert the precip data to mm.
    no_missing : bool, optional
        If True (default), values not greater than -99 are replaced by NaN.

    Returns
    -------
    pd.DataFrame
        One column, named after the station id (the STATION value with its
        first six characters stripped), indexed by date.

    Raises
    ------
    ValueError
        If the file has no data rows.
    """
    df = pd.read_csv(fnm)
    if len(df) == 0:
        raise ValueError("No data rows in file: {0}".format(fnm))

    # Parse the whole DATE column in one call. Only the first eight characters
    # (YYYYMMDD) of each value are used.
    date_strings = df["DATE"].astype(str).str[:8].values
    dt_indx = pd.to_datetime(date_strings, format="%Y%m%d")

    data_vals = df[var].values
    # Compare by value: `is` tests object identity and is only true for
    # strings that happen to be interned.
    if var == 'PRCP':
        data_vals = data_vals / 10.  # This is to convert precip data to mm

    station_id = df["STATION"].iloc[0][6:]
    station_df = pd.DataFrame(data=data_vals, index=dt_indx,
                              columns=[station_id])
    if no_missing:
        # -99 is a catchall threshold for missing data in GHCN
        return station_df.where(station_df > -99.)
    return station_df

def get_combined_df(fpth, var):
    """
    From a given file path, and variable, extract data from all .csv files, and
    place in a single dataframe object.

    Parameters
    ----------
    fpth : str
        Glob pattern selecting the station files (e.g. "Data/*.csv").
    var : str
        Name of the variable column passed on to get_df.

    Returns
    -------
    pd.DataFrame
        The per-file frames joined side by side (one column per file), with
        missing values masked.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    # glob returns files in arbitrary order; sort so the column order (and any
    # positional column access by the caller) is reproducible between runs.
    flist = sorted(glob.glob(fpth))
    if not flist:
        raise ValueError("No files match pattern: {0}".format(fpth))
    # A plain list keeps every file. The previous dict keyed on f[5:] could
    # collapse several short paths onto the same key and drop stations.
    frames = [get_df(fnm=f, var=var, no_missing=True) for f in flist]
    return pd.concat(frames, axis=1)

Call the `get_combined_df()` function to create one dataframe per variable from all the station files in a folder.

In [None]:
%%time
df_tmax = get_combined_df(fpth="Data/*.csv",var="TMAX")
df_tmin = get_combined_df(fpth="Data/*.csv",var="TMIN")
df_prcp = get_combined_df(fpth="Data/*.csv",var="PRCP")

### Start of analysis ###

Plot time series of precipitation for all stations, and also accumulate the data and plot the average rainfall.

In [None]:
# Example: select one station column by position and plot its record as points
station = df_prcp.columns[4]
plt.plot(df_prcp.index, df_prcp[station], '.', alpha=0.5)
plt.title("Station {0:s}".format(station))

In [None]:
# All 2015 records for station KE000063740 (partial-string date selection)
df_prcp["KE000063740"].loc["2015"]

In [None]:
# Days on which station KE000063740 recorded a value above 100
df_prcp.loc[df_prcp["KE000063740"] > 100, "KE000063740"]

## Time series plots

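Daily mean and SEM values.

The uncertainty of the mean is given by the standard error of the mean (SEM), where $\sigma$ is the population standard deviation (as returned by `np.std`) and $n$ is the number of valid station values on that day: $$ SEM = \frac {\sigma} {\sqrt{n-1}}$$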

In [None]:
def daily_mean(date, df):
    """
    Return the mean value for all stations on a specified day,
    ignoring NAN values.

    Parameters
    ----------
    date : index label
        Label of the row of ``df`` to summarise (a date in the frame's index).
    df : pd.DataFrame
        Frame with one column per station.

    Returns
    -------
    tuple
        ``(mean, std, count, sem)`` where ``std`` is the population standard
        deviation (``np.std``), ``count`` is the number of values kept and
        ``sem = std / sqrt(count - 1)``. ``mean`` and ``std`` are NaN when no
        value is kept; ``sem`` is NaN when fewer than two values are kept.

    Raises
    ------
    KeyError
        If ``date`` is not in the index of ``df``.
    """
    # Use the arguments: the previous body read the notebook globals
    # `df_prcp` and `day` and ignored both parameters.
    tmp = np.asarray(df.loc[date], dtype=float)
    with np.errstate(invalid='ignore'):
        # Keeping values > -1 also drops NaN, since NaN > -1 is False.
        valid = tmp > -1
    tmp = tmp[valid]
    count = len(tmp)
    if count == 0:
        return np.nan, np.nan, 0, np.nan
    mean = np.mean(tmp)
    std = np.std(tmp)
    # With a single value the denominator is zero; report NaN explicitly.
    sem = std / np.sqrt(count - 1) if count > 1 else np.nan
    return mean, std, count, sem

In [None]:
# Per-day statistics across stations: mean, population std and SEM.
# Only station columns are used. A later cell adds 'Accumulated' and
# 'DailyMean' to df_prcp, and re-running this cell must not fold those
# derived columns back into the statistics.
station_cols = [col for col in df_prcp.columns
                if col not in ('Accumulated', 'DailyMean')]
means = []
stds = []
sems = []
# Walk the value array row by row instead of df_prcp[key][day] lookups,
# which repeat a column and a label search for every single element.
for day, tmp in zip(df_prcp.index,
                    df_prcp[station_cols].values.astype(float)):
    with np.errstate(invalid='ignore'):
        valid = tmp > -1  # also drops NaN, since NaN > -1 is False
    tmp = tmp[valid]
    count = len(tmp)
    mean = np.mean(tmp) if count > 0 else np.nan
    std = np.std(tmp) if count > 0 else np.nan
    # SEM is undefined for fewer than two values (zero denominator).
    sem = std / np.sqrt(count - 1) if count > 1 else np.nan
    means.append(mean)
    stds.append(std)
    sems.append(sem)

In [None]:
# Show `mean` as left by the final iteration of the loop in the previous cell
mean

In [None]:
# Daily mean across stations with SEM error bars.
plt.errorbar(df_prcp.index,means,yerr=sems,xerr=None,alpha=0.75)
# Limits are passed positionally as Timestamps: the xmin/xmax keywords are no
# longer accepted by matplotlib, and plain strings are not dates on a date axis.
plt.xlim(pd.Timestamp('1991-01-01'), pd.Timestamp('2015-08-31'))
plt.ylim((0.0,200))
# Raw string so the LaTeX backslash is not read as a string escape.
plt.title(r"East Africa Daily Precipitation Mean ($\mu$) ")
plt.ylabel("Precipitation (mm) day$^{-1}$")
plt.xlabel("Year")

In [None]:
#np.shape(means)

Accumulated Daily Mean and SEM values

In [None]:
# Only station columns are used, so re-running this cell after it has added
# 'Accumulated' and 'DailyMean' to df_prcp gives the same result.
station_cols = [col for col in df_prcp.columns
                if col not in ('Accumulated', 'DailyMean')]
Acc_mean =[]
Acc_sem = []
for day, tmp in zip(df_prcp.index,
                    df_prcp[station_cols].values.astype(float)):
    with np.errstate(invalid='ignore'):
        valid = tmp > -1  # also drops NaN, since NaN > -1 is False
    tmp = tmp[valid]
    # NOTE(review): this cumulative sum runs across the stations of a single
    # day, not through time - confirm that this is the intended accumulation.
    csum  = np.cumsum(tmp)
    ccount = len(csum)
    cmean = np.mean(csum) if ccount > 0 else np.nan
    cstd  = np.std(csum) if ccount > 0 else np.nan
    # Use this day's own count. The previous code divided by `count`, a
    # leftover from the last iteration of the daily-mean cell.
    csem = cstd/np.sqrt(ccount -1) if ccount > 1 else np.nan

    Acc_mean.append(cmean)
    Acc_sem.append(csem)

df_prcp['Accumulated']=pd.Series(Acc_mean,index=df_prcp.index) #Adding columns to the data frames
df_prcp['DailyMean']=pd.Series(means,index=df_prcp.index)
#df_prcp.tail()

In [None]:
# Accumulated values with SEM error bars.
plt.errorbar(df_prcp.index,Acc_mean,yerr=Acc_sem,xerr=None,alpha=0.75)
# Limits are passed positionally as Timestamps: the xmin/xmax keywords are no
# longer accepted by matplotlib, and plain strings are not dates on a date axis.
plt.xlim(pd.Timestamp('1961-01-01'), pd.Timestamp('2015-08-31'))
plt.ylim((0.0,400))
plt.title("East Africa Daily Precipitation ")
plt.ylabel("Precipitation (mm) day$^{-1}$")
plt.xlabel("Year")

DOY mean (μ) and SEM values

In [None]:
# Day-of-year (DOY) climatology: mean and SEM over all stations and years.
# Restrict to station columns. By this point df_prcp also carries the derived
# 'Accumulated' and 'DailyMean' columns, which must not enter the climatology.
station_cols = [col for col in df_prcp.columns
                if col not in ('Accumulated', 'DailyMean')]
station_vals = df_prcp[station_cols].values.astype(float)
day_of_year = np.asarray(df_prcp.index.dayofyear)  # hoisted out of the loop

doy_mean=[]
doy_sem =[]
doy_csum=[]
for doy in range(366):
    index = day_of_year == doy+1
    doy_vals = station_vals[index]
    doy_vals = doy_vals[~np.isnan(doy_vals)]
    # n is the number of valid observations that feed the std, not the number
    # of calendar rows selected.
    n_obs = len(doy_vals)
    doy_mean.append(np.mean(doy_vals) if n_obs > 0 else np.nan)
    doy_sem.append(np.std(doy_vals)/np.sqrt(n_obs-1) if n_obs > 1 else np.nan)

In [None]:
# DOY climatology with SEM error bars. The x axis is 1-based so that it
# matches the "Day of Year" label (doy_mean[0] belongs to day 1).
doy_axis = np.arange(1, len(doy_mean) + 1)
plt.errorbar(doy_axis,doy_mean,xerr=None, yerr=doy_sem, alpha=0.6)
plt.xlim(1, len(doy_mean))
# Raw string so the LaTeX backslash is not read as a string escape.
plt.title(r"East Africa DOY Mean ($\mu$) Rainfall")
plt.ylabel("Precipitation (mm)")
plt.xlabel("Day of Year")

Anomaly

Use the day-of-year (DOY) mean as the climatology, and calculate each day's deviation (anomaly) of the daily mean from it.

In [None]:
##Anomalies = Observation - Climatology
# The observation is the daily mean across stations (`means`, aligned with
# df_prcp.index). The climatology is the DOY mean for that calendar day.
anomalies = []
for n,day in enumerate(df_prcp.index):
    doyi = day.dayofyear -1 # Create an index to call doy_mean
    # Subtract from the observed value. The previous code subtracted from
    # `day`, which is the Timestamp label and not a precipitation value.
    anomalies.append(means[n] - doy_mean[doyi])

In [None]:
# Display the full list of anomalies built in the previous cell
anomalies

Correlation Matrix

In [None]:
# Pairwise Pearson correlation between stations. Derived columns added to
# df_prcp by earlier cells are left out so only station records are compared.
station_cols = [col for col in df_prcp.columns
                if col not in ('Accumulated', 'DailyMean')]
# Store the result. Previously `corr_matrix` was just another name for df_prcp
# and the computed correlations were displayed but never kept.
corr_matrix = df_prcp[station_cols].corr(method='pearson', min_periods=1)
corr_matrix

Calculate the 90th, 95th and 99th percentiles across stations for each day.

In [None]:
def percentile(df, percentile):
    """
    Return the 90th, 95th and 99th percentile across columns for every row.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with one row per day and one column per station.
    percentile : object
        Not used. Kept so existing calls keep working; the three percentile
        levels are fixed.

    Returns
    -------
    tuple of list
        ``(p90, p95, p99)``, each holding one value per row of ``df``.
        Missing values (NaN) are ignored; a row with no valid value gives NaN.
    """
    values = np.asarray(df, dtype=float)
    if values.shape[0] == 0:
        return [], [], []
    # Compute all rows in one call and return only when every day is done:
    # the previous `return` sat inside the loop and stopped after the first
    # day. nanpercentile skips NaN instead of returning NaN for the whole row.
    p90, p95, p99 = (list(levels) for levels in
                     np.nanpercentile(values, [90, 95, 99], axis=1))
    return p90, p95, p99

In [None]:
#from scipy.stats import scoreatpercentile
# percentiles of interest
#tmp = np.array([df_prcp[key][day] for key in df_prcp.keys()])
#perc = [min(tmp), scoreatpercentile(tmp,10), scoreatpercentile(tmp,25),
              # scoreatpercentile(tmp,50), scoreatpercentile(tmp,75),
               #scoreatpercentile(tmp,90), max(tmp)]


In [None]:
# Quick check of np.percentile on a small hand-made sample
a = [154, 400, 1124, 82, 94, 108]
print(np.percentile(a, q=95))  # the 95th percentile of the sample