Reformat and do 5-day averaging on daily sea ice data.

In [1]:
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/north/daily/data/NH_seaice_extent_final.csv
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/north/daily/data/NH_seaice_extent_nrt.csv
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/south/daily/data/SH_seaice_extent_final.csv
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/south/daily/data/SH_seaice_extent_nrt.csv


Variables to set before running:


In [2]:
hemisphere = 'north'  # 'south' or 'north'
climatology_years = (1981, 2010)


In [3]:
import datetime as dt
import numpy as np

import os
import pandas as pd
from pandas import ExcelWriter
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
pd.options.display.mpl_style = 'default'



In [4]:

def parse_the_date(year, mm, dd):
    return dt.date(int(year), int(mm), int(dd))

def slurp_csv(filename):
    data = pd.read_csv(filename, header = None, skiprows=2,
                       names=["year", "mm", "dd", "extent", "missing", "source"],
                       parse_dates={'date':['year', 'mm', 'dd']},
                       date_parser=parse_the_date, index_col='date')
    data = data.drop(['missing', 'source'], axis=1)
    return data


def read_a_hemisphere(hemisphere):
    final_prod_filename = os.path.join('{hemi}H_seaice_extent_final.csv'.format(hemi=hemisphere[0:1].upper()))
    nrt_prod_filename = os.path.join('{hemi}H_seaice_extent_nrt.csv'.format(hemi=hemisphere[0:1].upper()))

    final = slurp_csv(final_prod_filename)
    nrt = slurp_csv(nrt_prod_filename)
    all_data = pd.concat([final, nrt])
    return all_data



In [5]:
df = read_a_hemisphere(hemisphere)
df.head(3)

Unnamed: 0_level_0,extent
date,Unnamed: 1_level_1
1978-10-26,10.231
1978-10-28,10.42
1978-10-30,10.557


Set indices to datetime indexes and reindex so that every daily timestep is included in the series.

In [6]:
df.index = pd.to_datetime(df.index)
df  = df.reindex(index=pd.date_range('1978-10-25', dt.date.today().strftime('%Y-%m-%d')))
df['hemi'] = hemisphere
df.head()

Unnamed: 0,extent,hemi
1978-10-25,,north
1978-10-26,10.231,north
1978-10-27,,north
1978-10-28,10.42,north
1978-10-29,,north


## interpolate missing data in SMMR period.

We don't want to interpolate across any timeperiods where more than one day
of data is missing.  So we are going to union the NaN that remain after a
back fill and forward fill in order to leave any gaps in the data record
alone.

So start by using the backfill to fill any NaN locations that have a valid "next" value.
So start by using the forwardfill to fill any NaN locations that have a valid "previous" value.

In [7]:
df['backfill'] = df.extent.fillna(method='bfill', limit=1)
df['forwardfill'] = df.extent.fillna(method='ffill', limit=1)


In [8]:
df.head()

Unnamed: 0,extent,hemi,backfill,forwardfill
1978-10-25,,north,10.231,
1978-10-26,10.231,north,10.231,10.231
1978-10-27,,north,10.42,10.231
1978-10-28,10.42,north,10.42,10.42
1978-10-29,,north,10.557,10.42


In [9]:
df['19871201':'19871206']

Unnamed: 0,extent,hemi,backfill,forwardfill
1987-12-01,12.504,north,12.504,12.504
1987-12-02,12.6,north,12.6,12.6
1987-12-03,,north,,12.6
1987-12-04,,north,,
1987-12-05,,north,,
1987-12-06,,north,,


In [10]:
df['19880110':'19880114']

Unnamed: 0,extent,hemi,backfill,forwardfill
1988-01-10,,north,,
1988-01-11,,north,,
1988-01-12,,north,14.826,
1988-01-13,14.826,north,14.826,14.826
1988-01-14,14.854,north,14.854,14.854


So the union of backfill's NaN and forward fill NaN will capture any missing
data that doesn't have a valid data point both before and after itself in the series.
We can get a list of is really NAN by saving this off.

In [11]:
is_really_nan = pd.isnull(df['backfill']) | pd.isnull(df['forwardfill'])

Use the interpolation scheme to do simple linear regression on all data and then
Mark missing any large gaps in the linearly interpolated data and drop the backfill and forwardfill columns


In [12]:
df['interpolated'] = df.extent.interpolate()
df[is_really_nan].interpolated = np.nan
df.drop(['forwardfill', 'backfill'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


So now we have a simple dataframe with daily extents and daily interpolated extents

In [13]:
df.head()

Unnamed: 0,extent,hemi,interpolated
1978-10-25,,north,
1978-10-26,10.231,north,10.231
1978-10-27,,north,10.3255
1978-10-28,10.42,north,10.42
1978-10-29,,north,10.4885


##Compute climatological means by working with just the data between your desired climatology years.

In [14]:
clim_data = df[(df.index.year >= climatology_years[0])&(df.index.year <= climatology_years[1] )].copy()

In [15]:
print clim_data.head(),"\n...\n" ,clim_data.tail()

            extent   hemi  interpolated
1981-01-01  14.288  north       14.2880
1981-01-02     NaN  north       14.3955
1981-01-03  14.503  north       14.5030
1981-01-04     NaN  north       14.4810
1981-01-05  14.459  north       14.4590 
...
            extent   hemi  interpolated
2010-12-27  12.358  north        12.358
2010-12-28  12.398  north        12.398
2010-12-29  12.457  north        12.457
2010-12-30  12.558  north        12.558
2010-12-31  12.670  north        12.670


show the years of the climatology and then number of years to work with.

In [16]:
print len(np.unique(clim_data.index.year))
print np.unique(clim_data.index.year)

30
[1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010]


grab the mean value of the interpolated extents for each month/day combination

In [17]:
clim_averages = clim_data.groupby([clim_data.index.month, clim_data.index.day]).mean()[['interpolated']]


**check yourself**:  You can see in the three panels below that the value we get by calling
`mean()` on the `groupby` result is the same as expected by averaging the day
and month data separately

In [18]:
clim_data[(clim_data.index.month == 1)&(clim_data.index.day == 1)]['interpolated'].values

array([ 14.288 ,  14.371 ,  14.257 ,  14.0065,  13.946 ,  14.036 ,
        14.197 ,  14.19  ,  14.261 ,  14.319 ,  13.634 ,  14.069 ,
        14.039 ,  14.094 ,  14.144 ,  13.804 ,  13.657 ,  14.025 ,
        13.823 ,  13.442 ,  13.479 ,  13.59  ,  13.647 ,  13.502 ,
        13.16  ,  13.16  ,  13.11  ,  13.206 ,  13.189 ,  13.205 ])

In [19]:
np.mean(clim_data[(clim_data.index.month == 1)&(clim_data.index.day == 1)]['interpolated'].values)

13.795016666666671

In [20]:
clim_averages.head(1)

Unnamed: 0,Unnamed: 1,interpolated
1,1,13.795017


In [21]:
clim_averages = clim_averages.rename(columns={'interpolated': '1981-2010'})
clim_averages.head(1)

Unnamed: 0,Unnamed: 1,1981-2010
1,1,13.795017


####Set the daily extent data into the correct format for display and for concatenating with the clim_averages

In [22]:
df.index

<class 'pandas.tseries.index.DatetimeIndex'>
[1978-10-25, ..., 2015-05-05]
Length: 13342, Freq: D, Timezone: None

In [23]:
df = df[['extent']].set_index([df.index.year, df.index.month, df.index.day]).unstack(0)


In [24]:
df.index

MultiIndex(levels=[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, ...]])

remove the extraneous 'extent' level on the columns so we can concat. (turn into a simple index)

In [25]:
df.columns = df.columns.droplevel(0)

add a spacer on the output

In [26]:
space = clim_averages.copy()

In [27]:
space['1981-2010'] = "    "
space.rename(columns={'1981-2010': '   '}, inplace=True)


In [28]:
daily_extent_with_climatological_average = pd.concat([df,space, clim_averages], axis=1)

r = daily_extent_with_climatological_average

Add back a toplevel multi-index column

In [29]:
r.columns

Index([1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, u'   ', u'1981-2010'], dtype='object')

In [30]:
r['extent'] = 'Daily Extents : with climatological means based on interpolated data'
r.set_index('extent', append=True, inplace=True)
r = r.unstack('extent')
r.columns =r.columns.reorder_levels(['extent', None])

In [31]:
import calendar
month_names = [calendar.month_name[x] for x in range(1,13)]
r.index = r.index.set_levels(month_names, level=0)


In [32]:
writer = ExcelWriter('../output/{hemi}_test_daily.xls'.format(hemi=hemisphere))
r.to_excel(writer, float_format = "%.3f")
writer.save()

In [33]:
# cleanup
!rm -f NH_seaice_extent_final.csv NH_seaice_extent_nrt.csv SH_seaice_extent_final.csv SH_seaice_extent_nrt.csv
