# Compute summary statistics for the daily sea ice index.

# From the CSV files, determine the day of maximum and minimum extent for each month, and how each month's maximum and minimum rank against the same calendar month in all other years

The input data format is just a date and extent for each day we have data.
```
Year, Month, Day,     Extent,    Missing, Source Data
YYYY,    MM,  DD, 10^6 sq km, 10^6 sq km, Source data product web site: http://nsidc.org/d....
1978,    10,  26,     10.231,      0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....
1978,    10,  28,     10.420,      0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....
1978,    10,  30,     10.557,      0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....
....
```


Start by downloading the daily sea ice extent data from NSIDC's website.

In [25]:
# Fetch the "final" and "nrt" daily extent CSVs for the northern (NH) and
# southern (SH) hemispheres into the current directory.
# wget flags: -q = quiet, -N = timestamping (only re-download a file when the
# remote copy is newer than the local one).
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/north/daily/data/NH_seaice_extent_final.csv
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/north/daily/data/NH_seaice_extent_nrt.csv
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/south/daily/data/SH_seaice_extent_final.csv
!wget -qN ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G02135/south/daily/data/SH_seaice_extent_nrt.csv


In [26]:
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import os
%pylab inline
import pandas as pd
from pandas import ExcelWriter
pd.options.display.mpl_style = 'default'


Populating the interactive namespace from numpy and matplotlib


Code to read the CSV files.

In [27]:

def parse_the_date(year, mm, dd):
    """Build a ``datetime.date`` from separate year, month and day values.

    Each argument is passed through ``int()``, so numeric strings (such as
    the raw CSV fields) are accepted as well as integers.
    """
    parts = [int(value) for value in (year, mm, dd)]
    return dt.date(*parts)

def slurp_csv(filename):
    """Load one daily sea ice extent CSV into a DataFrame indexed by date.

    The two header rows of the file are skipped and the columns are named
    explicitly. The year/month/day columns are combined into a single
    ``date`` index via ``parse_the_date``, and the ``missing`` column is
    dropped, leaving ``extent`` and ``source``.

    NOTE(review): ``date_parser`` and dict-valued ``parse_dates`` are
    deprecated in recent pandas releases; confirm the pandas version this
    notebook is expected to run on before upgrading.
    """
    column_names = ["year", "mm", "dd", "extent", "missing", "source"]
    frame = pd.read_csv(
        filename,
        header=None,
        skiprows=2,
        names=column_names,
        parse_dates={'date': ['year', 'mm', 'dd']},
        date_parser=parse_the_date,
        index_col='date',
    )
    return frame.drop('missing', axis=1)

def read_a_hemisphere(hemisphere):
    """Read the final and near-real-time CSVs for one hemisphere.

    Only the first letter of ``hemisphere`` is used (upper-cased) to build
    the file names, e.g. ``'north'`` -> ``NH_seaice_extent_final.csv`` and
    ``NH_seaice_extent_nrt.csv`` in the current directory.

    Returns the two tables concatenated, final data first.
    """
    hemi_letter = hemisphere[0:1].upper()
    filenames = (
        os.path.join('{hemi}H_seaice_extent_final.csv'.format(hemi=hemi_letter)),
        os.path.join('{hemi}H_seaice_extent_nrt.csv'.format(hemi=hemi_letter)),
    )
    # Order matters: final product rows come before near-real-time rows.
    return pd.concat([slurp_csv(name) for name in filenames])



 Read CSV data

In [28]:
north = read_a_hemisphere('north')
south = read_a_hemisphere('south')
south.head()

Unnamed: 0_level_0,extent,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1978-10-26,17.634,ftp://sidads.colorado.edu/pub/DATASETS/nsidc0...
1978-10-28,17.815,ftp://sidads.colorado.edu/pub/DATASETS/nsidc0...
1978-10-30,17.671,ftp://sidads.colorado.edu/pub/DATASETS/nsidc0...
1978-11-01,17.534,ftp://sidads.colorado.edu/pub/DATASETS/nsidc0...
1978-11-03,17.493,ftp://sidads.colorado.edu/pub/DATASETS/nsidc0...


Add columns for year and month. We have to do this because, when we read the CSV file,
we converted the existing year/month/day columns into a Python date object.
Also drop the source column, because we don't care where the data came from (near real time or production).

In [29]:
def add_year_month_columns(df):
    """Return a copy of ``df`` without ``source`` and with year/month columns.

    Parameters:
        df: DataFrame indexed by date (date objects or timestamps) that has
            a ``source`` column.

    Returns:
        A new DataFrame with the same index, the ``source`` column removed,
        and integer ``year`` and ``month`` columns derived from the index.
        The input frame is left unmodified.
    """
    # ``axis`` must be passed by keyword: current pandas rejects the old
    # positional form ``drop('source', 1)``. ``drop`` returns a new frame.
    result = df.drop('source', axis=1)
    # Parse the index once and reuse it for both derived columns.
    dates = pd.to_datetime(result.index)
    result['year'] = dates.year
    result['month'] = dates.month
    return result

In [30]:
north = add_year_month_columns(north)
south = add_year_month_columns(south)

In [31]:
north.head()

Unnamed: 0_level_0,extent,year,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1978-10-26,10.231,1978,10
1978-10-28,10.42,1978,10
1978-10-30,10.557,1978,10
1978-11-01,10.67,1978,11
1978-11-03,10.787,1978,11


In [32]:
south.head()

Unnamed: 0_level_0,extent,year,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1978-10-26,17.634,1978,10
1978-10-28,17.815,1978,10
1978-10-30,17.671,1978,10
1978-11-01,17.534,1978,11
1978-11-03,17.493,1978,11


Add a 5-day rolling mean to the time series.

In [33]:
def add_rolling_mean(df, window=5, min_periods=2):
    """Add a trailing calendar-day rolling mean of ``extent`` to ``df``.

    The observations are first laid out on a gap-free daily calendar (days
    without data become NaN), so ``window`` counts calendar days rather than
    rows. This matters because the early record is only every other day.

    Parameters:
        df: DataFrame indexed by date with an ``extent`` column. It is
            modified in place (the new column is added to it).
        window: number of calendar days in the trailing window.
        min_periods: minimum number of non-missing days required in the
            window; otherwise the result is NaN.

    Returns:
        The same ``df`` object with a ``'5day-Avg'`` column added. The column
        name is fixed, whatever ``window`` is, because later cells look it up
        by that name.

    NOTE(review): duplicate dates in the index are not supported (the
    reindex step is expected to raise) - confirm the final and
    near-real-time files never overlap.
    """
    timestamps = pd.to_datetime(df.index)
    if len(timestamps) == 0:
        # Nothing to average; still add the column so callers can rely on it.
        df['5day-Avg'] = pd.Series(dtype='float64', index=df.index)
        return df

    extent = pd.Series(df['extent'].values, index=timestamps)
    # Gap-free daily calendar spanning the data; missing days become NaN.
    daily_index = pd.date_range(timestamps.min(), timestamps.max(), freq='D')
    daily_extent = extent.reindex(daily_index)
    rolled = daily_extent.rolling(window=window, min_periods=min_periods).mean()
    # Map back onto the original rows positionally, so the result does not
    # depend on whether df's index holds date objects or timestamps.
    df['5day-Avg'] = rolled.reindex(timestamps).values
    return df

After adding the rolling mean we want the date back in the columns (see the `reset_index` calls below).

In [34]:
north = add_rolling_mean(north)
south = add_rolling_mean(south)

In [35]:
north.head(1)

Unnamed: 0_level_0,extent,year,month,5day-Avg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1978-10-26,10.231,1978,10,


In [36]:
# Move the 'date' index back into an ordinary column: the min/max selection
# below reads df['date'] and looks rows up by their row labels.
north = north.reset_index()
south = south.reset_index()
north.head(1)


Unnamed: 0,date,extent,year,month,5day-Avg
0,1978-10-26,10.231,1978,10,


Use a groupby to compute the row locations that represent the minimum and
maximum extent and grab those rows into new variables.  AKA: Filter out everything
but the minimum/maximum extent for each month and year

In [37]:
def select_min_and_max_variable_rows_by_year_and_month(df, variable):
    """Pick, for every (year, month), the rows where ``variable`` is extreme.

    Parameters:
        df: DataFrame with ``date``, ``year``, ``month`` and ``variable``
            columns.
        variable: name of the column to minimise / maximise.

    Returns:
        A dict with keys ``'min'`` and ``'max'``. Each value is a DataFrame
        with columns ``['date', variable, 'year', 'month']`` holding one row
        per (year, month) group, keeping the row labels of ``df``.

    Rows where ``variable`` is NaN are ignored. A group consisting only of
    NaN values (e.g. a rolling mean that has too few observations) is
    therefore left out of the result, instead of producing a missing label
    that breaks the row lookup.
    """
    columns = ['date', variable, 'year', 'month']
    valid = df[df[variable].notnull()]
    # Build the grouping once and reuse it for both extremes.
    grouped = valid.groupby(['year', 'month'])[variable]
    return {
        'min': df.loc[grouped.idxmin(), columns],
        'max': df.loc[grouped.idxmax(), columns],
    }

Create dictionaries of max and min values for each hemisphere, for both the daily and the rolling-mean data.

In [38]:
# Each result is a dict {'min': DataFrame, 'max': DataFrame} with one row per
# (year, month): n/s use the raw daily extent, navg/savg the '5day-Avg' column.
n = select_min_and_max_variable_rows_by_year_and_month(north, 'extent')
navg = select_min_and_max_variable_rows_by_year_and_month(north, '5day-Avg')
s = select_min_and_max_variable_rows_by_year_and_month(south, 'extent')
savg = select_min_and_max_variable_rows_by_year_and_month(south, '5day-Avg')


Show that we have actually selected different data for the daily and the 5-day average data.

In [39]:
n['max'][3:5]

Unnamed: 0,date,extent,year,month
48,1979-01-30,15.912,1979,1
61,1979-02-25,16.579,1979,2


In [40]:
navg['max'][3:5]

Unnamed: 0,date,5day-Avg,year,month
48,1979-01-30,15.795333,1979,1
62,1979-02-27,16.515,1979,2


In [41]:
def add_rank(df, rank_by):
    """Add a ``rank`` column ranking ``rank_by`` within each calendar month.

    Rows are grouped by the ``month`` column, so every value is ranked
    against the other rows of the same month (rank 1 is the smallest value).
    The column is added to ``df`` in place and the same object is returned.
    """
    ranks_within_month = df.groupby('month')[rank_by].rank()
    df['rank'] = ranks_within_month
    return df

Add a rank column for each month and each hemisphere's max and min:

In [42]:
# Rank the daily-extent extremes; add_rank groups by the 'month' column, so
# each row is ranked against the same calendar month of the other years.
n['max'] = add_rank(n['max'], 'extent')
n['min'] = add_rank(n['min'], 'extent')
s['max'] = add_rank(s['max'], 'extent')
s['min'] = add_rank(s['min'], 'extent')

# Same ranking for the extremes of the rolling-mean ('5day-Avg') column.
navg['max'] = add_rank(navg['max'], '5day-Avg')
navg['min'] = add_rank(navg['min'], '5day-Avg')
savg['max'] = add_rank(savg['max'], '5day-Avg')
savg['min'] = add_rank(savg['min'], '5day-Avg')



In [43]:
def do_annual_min_max_ranking(df, field):
    """Find each year's minimum and maximum of ``field`` and rank them.

    Parameters:
        df: DataFrame with ``date``, ``year`` and ``field`` columns.
        field: name of the column to analyse.

    Returns:
        A DataFrame indexed by year with columns ``min_date``,
        ``min_<field>``, ``min_rank``, ``max_date``, ``max_<field>`` and
        ``max_rank``. Ranks compare the yearly extremes across all years
        (rank 1 is the smallest value).

    Rows where ``field`` is NaN are ignored, so a year holding only NaN
    values is left out instead of producing a missing label that breaks the
    row lookup. Years with partial coverage are still ranked as-is.
    """
    valid = df[df[field].notnull()]
    yearly = valid.groupby(['year'])[field]

    pieces = []
    for prefix, row_labels in (('min_', yearly.idxmin()),
                               ('max_', yearly.idxmax())):
        # Copy so that adding the rank column never touches ``df``.
        extreme = df.loc[row_labels, ['date', field]].copy()
        extreme['rank'] = extreme[field].rank()
        # Index by calendar year so min and max line up side by side.
        extreme = extreme.set_index(pd.to_datetime(extreme['date']).dt.year)
        pieces.append(extreme.add_prefix(prefix))

    return pd.concat(pieces, axis=1)
    


Annual min/max rank data are also wanted, so revisit the north and south data and rank each year's minimum and maximum against all other years.

In [44]:
north_annual_by_day = do_annual_min_max_ranking(north, 'extent')
north_annual_averaged = do_annual_min_max_ranking(north, '5day-Avg')

In [45]:
south_annual_by_day = do_annual_min_max_ranking(south, 'extent')
south_annual_averaged = do_annual_min_max_ranking(south, '5day-Avg')

In [46]:
south_annual_averaged.head(3)

Unnamed: 0,min_date,min_5day-Avg,min_rank,max_date,max_5day-Avg,max_rank
1978,1978-12-31,7.596,38,1978-10-28,17.7245,2
1979,1979-02-19,2.928333,25,1979-09-15,18.323,8
1980,1980-02-26,2.574,7,1980-09-25,19.047,28


Write out the data frames in a nice format

In [47]:
import calendar
month_names = [calendar.month_name[x] for x in range(1,13)]

def swap_column_level_and_sort(df):
    """Swap the two column levels of ``df`` and sort the columns.

    Parameters:
        df: DataFrame whose columns are a two-level MultiIndex.

    Returns:
        A new DataFrame with the column levels exchanged and the columns
        sorted lexicographically (outer level first). ``df`` itself is not
        modified.
    """
    # ``DataFrame.sortlevel`` no longer exists in current pandas;
    # ``sort_index`` on axis 1 gives the same ordering.
    swapped = df.swaplevel(0, 1, axis=1)
    return swapped.sort_index(axis=1)

# Set the index to year and month, then broadcast month back across the columns.
# Next swap and sort so that the data are grouped under each month.
def prepare_for_csv(df):
    """Pivot a monthly table to one row per year with columns grouped by month.

    Parameters:
        df: DataFrame with ``year`` and ``month`` columns (month as a number
            1-12) plus the value columns to lay out.

    Returns:
        A DataFrame indexed by year whose columns are a two-level index:
        month name on the outer level, original column name on the inner.
    """
    df = df.set_index(['year', 'month']).unstack('month')
    df = swap_column_level_and_sort(df)
    # Translate each month number that is actually present into its name.
    # Assigning all twelve names positionally would mislabel the columns
    # whenever some calendar month is absent from the data.
    present_months = df.columns.levels[0]
    df.columns = df.columns.set_levels(
        [month_names[int(month) - 1] for month in present_months], level=0)
    return df


def write_to_xls(df_list, writer, is_monthly=True):
    """Write each DataFrame in ``df_list`` to its own sheet of ``writer``.

    Parameters:
        df_list: iterable of ``(DataFrame, sheet_name)`` pairs.
        writer: an open pandas ``ExcelWriter``; saving or closing it is the
            caller's responsibility.
        is_monthly: when True, each frame is first reshaped with
            ``prepare_for_csv`` (years as rows, months as column groups).
    """
    for df, sheet in df_list:
        if is_monthly:
            df = prepare_for_csv(df)
        # Pass the sheet name by keyword; newer pandas makes every
        # ``to_excel`` argument after the writer keyword-only.
        df.to_excel(writer, sheet_name=str(sheet), float_format="%.3f")


# Monthly tables: (DataFrame, sheet name) pairs reshaped per month on write.
monthly_dataframelist =[(navg['min'], 'Northern 5day Min'),
                        (navg['max'], 'Northern 5day Max'),
                        (savg['min'], 'Southern 5day Min'),
                        (savg['max'], 'Southern 5day Max'),
                        (n['min'], 'Northern Daily Min'),
                        (n['max'], 'Northern Daily Max'),
                        (s['min'], 'Southern Daily Min'),
                        (s['max'], 'Southern Daily Max')]

# Annual tables: already one row per year, written without reshaping.
annual_dataframelist = [(north_annual_averaged, 'North Annual 5day-avg'),
                        (north_annual_by_day, 'North Annual Daily'),
                        (south_annual_averaged, 'South Annual 5day-avg'),
                        (south_annual_by_day, 'South Annual Daily')]

# The context manager saves and closes the workbook even if a sheet fails to
# write; ``ExcelWriter.save()`` no longer exists in current pandas.
# NOTE(review): writing the legacy '.xls' format needs an engine that recent
# pandas releases no longer provide - confirm whether the output name can
# move to '.xlsx' before upgrading pandas.
with ExcelWriter('../output/Sea_Ice_Statistics.xls') as writer:
    write_to_xls(monthly_dataframelist, writer, is_monthly=True)
    write_to_xls(annual_dataframelist, writer, is_monthly=False)

In [48]:
# clean up your csv files
!rm -f NH_seaice_extent_final.csv NH_seaice_extent_nrt.csv SH_seaice_extent_final.csv SH_seaice_extent_nrt.csv