In [1]:
import urllib
import pandas as pd
import numpy as np

In [2]:
# Choose iceberg year (2002 - 2015 available)
# Note: Iceberg Season starts in November so many datasets include dates from year-1
year = 2015

In [3]:
iip_url_base = 'ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G00807/' 
iip_filename = f'IIP_{year}IcebergSeason.csv'
iip_url = iip_url_base + iip_filename
r = urllib.request.urlretrieve(iip_url)
df = pd.read_csv(r[0], converters={'TIME':str})

In [4]:
df['DATETIME'] = pd.to_datetime(df['DATE'] + 'T' + df['TIME'])

In [None]:
df

In [6]:
df['DATETIME'].dt.values[0].minute

39

In [7]:
# Choose the min number of observations for an eligible iceberg
min_num_obs = 10

In [8]:
eligible_bergs = np.asarray(df['BERG_NUMBER'].value_counts()\
                             .loc[df['BERG_NUMBER'].value_counts() > min_num_obs].index)
                             #df['BERG_NUMBER'].value_counts().loc[df['BERG_NUMBER'].value_counts() > min_num_obs]])

In [9]:
eligible_bergs

array([21511, 21165, 20668, 20100, 21109, 20645, 23460, 23494, 21186,
       20111, 20633, 20631, 21666, 21754, 20123, 20017, 23532, 20734,
       23508, 23036,    22, 23496, 23525, 20025, 20149, 21735, 20487,
       20679, 23167, 20178, 20267, 20295, 20720, 23459, 20151, 21640,
       21167, 21645, 20013, 20701, 21662, 21158, 23547, 20154, 23029,
       20662, 21156, 20099, 20063, 23597, 20655, 21163, 20224, 20657,
       20219, 20498, 20138, 23462, 23172, 23502, 22683, 23441, 20173,
       22924, 20502, 21282, 20223, 21629, 20204, 21651, 23596, 21672,
       20479, 23548, 20600, 20184, 20477, 20291, 23169, 20474, 20148,
       21187, 20580, 22587, 20722, 20509, 20501, 20439, 21170, 20215,
       21466, 20023, 20217, 21116, 23165, 21088, 21271, 23018, 20012,
       22929, 20323, 21173,    13, 21288, 21312, 21653, 20020, 21638,
       20087, 20031, 20428, 23628, 23599, 23509, 21164, 20324, 23600,
       21753, 23022, 20420, 20972, 20162, 20724, 20716, 20638, 20475,
       20482, 20296,

In [11]:
berg_id = eligible_bergs[0]
berg_df = df.loc[df['BERG_NUMBER'] == berg_id]

In [12]:
(berg_df.DATETIME.values[1] - berg_df.DATETIME.values[0]).astype('timedelta64[m]')

numpy.timedelta64(1146,'m')

In [13]:
len(berg_df)

26

In [14]:
np.timedelta64(24*60*3, 'm')

numpy.timedelta64(4320,'m')

In [15]:
np.timedelta64(24*60*3, 'm') - (berg_df.DATETIME.values[1] - berg_df.DATETIME.values[0]).astype('timedelta64[m]')

numpy.timedelta64(3174,'m')

In [16]:
eligible_bergs.size

140

In [24]:
chosen_inds_dict = {}
chosen_inds_arr = []

for i in range(eligible_bergs.size):

#for i in range(1):

    berg_id = eligible_bergs[i]
    berg_df = df.loc[df['BERG_NUMBER'] == berg_id]
    
    ind0 = berg_df.index.tolist()[0]
    indf = berg_df.index.tolist()[-1]
    
    max_time_dif = np.timedelta64(24*60*3, 'm')
    
    chosen_inds = []
    counter = 0

    for j in range(len(berg_df)-1):

        time_dif = (berg_df.DATETIME.values[j+1] - \
                    berg_df.DATETIME.values[j]).astype('timedelta64[m]')

        #hr, minu, sec = convert_timedelta(time_dif)

        
        if time_dif < max_time_dif:
            chosen_inds.append(j+ind0)

        elif len(chosen_inds) > 5:
            chosen_inds_dict['{}_{}'.format(berg_id, counter)] = chosen_inds
            counter += 1
            chosen_inds = []
        else:
            chosen_inds = []

    if len(chosen_inds) > 5:
        chosen_inds_dict['{}_{}'.format(berg_id, counter)] = chosen_inds      
        chosen_inds_arr.append(chosen_inds)

In [25]:
chosen_inds_arr

[[6326, 6327, 6328, 6329, 6330, 6331, 6332, 6333, 6334, 6335, 6336],
 [4986,
  4987,
  4988,
  4989,
  4990,
  4991,
  4992,
  4993,
  4994,
  4995,
  4996,
  4997,
  4998],
 [3286,
  3287,
  3288,
  3289,
  3290,
  3291,
  3292,
  3293,
  3294,
  3295,
  3296,
  3297,
  3298,
  3299,
  3300,
  3301],
 [573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587],
 [3148, 3149, 3150, 3151, 3152, 3153, 3154, 3155, 3156, 3157, 3158, 3159],
 [628, 629, 630, 631, 632, 633],
 [10738,
  10739,
  10740,
  10741,
  10742,
  10743,
  10744,
  10745,
  10746,
  10747,
  10748,
  10749,
  10750,
  10751,
  10752],
 [10822,
  10823,
  10824,
  10825,
  10826,
  10827,
  10828,
  10829,
  10830,
  10831,
  10832,
  10833,
  10834,
  10835],
 [276, 277, 278, 279, 280, 281, 282, 283, 284, 285],
 [966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977],
 [3496, 3497, 3498, 3499, 3500, 3501, 3502],
 [10465,
  10466,
  10467,
  10468,
  10469,
  10470,
  10471,
  10472,
  10473,
  1047

In [18]:
dictkeylist = list(chosen_inds_dict.keys())

# init empty df2 with same col names and dtypes as df 
df2 = pd.DataFrame.from_items([(name, pd.Series(data=None, dtype=series.dtype)) 
                               for name, series in df.iteritems()])
for i in range(len(dictkeylist)):
    dictkey = dictkeylist[i]
    for j in range(len(chosen_inds_dict[dictkey])):
        chosen_ind = chosen_inds_dict[dictkey][j]
        df2 = df2.append(df.loc[df.index == chosen_ind])



In [None]:
chosen_inds_dict

In [None]:
df2

In [20]:
from sklearn import datasets

In [26]:
%store chosen_inds_dict
%store chosen_inds_arr
%store df2

Stored 'chosen_inds_dict' (dict)
Stored 'chosen_inds_arr' (list)
Stored 'df2' (DataFrame)


### Old Code

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from ftplib import FTP
import urllib
import pandas as pd

In [None]:
year = 2015 # 2002 through 2015 available
iip_url_base = 'ftp://sidads.colorado.edu/pub/DATASETS/NOAA/G00807/' 
iip_filename = f'IIP_{year}IcebergSeason.csv'
iip_url = iip_url_base + iip_filename

In [None]:
r = urllib.request.urlretrieve(iip_url_base)
#reopen(r[0]).readlines()

In [None]:
print(iip_url)

In [None]:
r = urllib.request.urlretrieve(iip_url)
df = pd.read_csv(r[0])

In [None]:
df

In [None]:
# Latitude is North South

print('min lat: {}, min lon: {};\nmax lat: {}, max lon: {}'
      .format(df['LATITUDE'].min(), df['LONGITUDE'].min(),
              df['LATITUDE'].max(), df['LONGITUDE'].max()))

In [None]:
n = df.BERG_NUMBER.mode()[0]

In [None]:
df.loc[df['BERG_NUMBER'] == n]

In [None]:
df['BERG_NUMBER'].value_counts().loc[df['BERG_NUMBER'].value_counts() > 10]

In [None]:
eligible_bergs = np.asarray([df['BERG_NUMBER'].value_counts().loc[df['BERG_NUMBER'].value_counts() > 10].index,
                             df['BERG_NUMBER'].value_counts().loc[df['BERG_NUMBER'].value_counts() > 10]])

In [None]:
eligible_bergs

In [None]:
eligible_id = eligible_bergs[0][0]

In [None]:
eligible_berg = df.loc[df['BERG_NUMBER'] == eligible_id]

In [None]:
eligible_berg.loc[eligible_berg['DATE'].str.contains(r'^6/[1|2][5-9|0-9]')]

In [None]:
eligible_berg.loc[eligible_berg['DATE'].str.contains(r'^6/[1|2][5-9|0-9]')]

In [None]:
eligible_berg['date'] = pd.to_datetime(eligible_berg.DATE + ' ' + eligible_berg.TIME.astype(str))

In [None]:
eligible_berg['month'] = pd.DatetimeIndex(eligible_berg.date).month
eligible_berg['day'] = pd.DatetimeIndex(eligible_berg.date).day
eligible_berg['hour'] = pd.DatetimeIndex(eligible_berg.date).hour

In [None]:
eligible_berg

In [None]:
def convert_timedelta(duration):
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = (seconds % 60)
    return hours, minutes, seconds

In [None]:
eligible_berg_id = int(eligible_berg['BERG_NUMBER'].mode())
print(eligible_berg_id)

In [None]:
eligible_berg_id = int(eligible_berg['BERG_NUMBER'].mode())
ind0 = eligible_berg.index.tolist()[0]
indf = eligible_berg.index.tolist()[-1]
chosen_inds = []
timestamp_list = eligible_berg.date.tolist()
chosen_inds_dict = {}
counter = 0

for i in range(len(timestamp_list)-1):
    
    time_dif = timestamp_list[i+1] - timestamp_list[i]
    
    hr, minu, sec = convert_timedelta(time_dif)
    
    if hr < 24*3:
        chosen_inds.append(i+ind0)
    
    elif len(chosen_inds) > 5:
        chosen_inds_dict['{}_{}'.format(eligible_berg_id, counter)] = chosen_inds
        counter += 1
        chosen_inds = []
    else:
        chosen_inds = []

if len(chosen_inds) > 5:
    chosen_inds_dict['{}_{}'.format(eligible_berg_id, counter)] = chosen_inds

In [None]:
chosen_inds

In [None]:
chosen_inds_dict

In [None]:
for count in range(counter+1):
    print(chosen_inds_dict['{}_{}'.format(eligible_berg_id, count)][0])

In [None]:
eligible_berg.loc[eligible_berg.index == chosen_inds[0]].date.values[0]

In [None]:
print(year)
el_berg_month_i = int(eligible_berg.loc[eligible_berg.index == chosen_inds[0]].month)
el_berg_month_f = int(eligible_berg.loc[eligible_berg.index == chosen_inds[-1]].month)
el_berg_day_i = int(eligible_berg.loc[eligible_berg.index == chosen_inds[0]].day)
el_berg_day_f = int(eligible_berg.loc[eligible_berg.index == chosen_inds[-1]].day)

In [None]:
int(eligible_berg.loc[eligible_berg.index == chosen_inds[0]].day)

In [None]:
import datetime

In [None]:
date_diff_i = datetime.date(year, el_berg_month_i, el_berg_day_i) - datetime.date(2000, 1, 1)
date_diff_f = datetime.date(year, el_berg_month_f, el_berg_day_f) - datetime.date(2000, 1, 1)

In [None]:
date_diff_i.days*24

In [None]:
df.loc[df['BERG_NUMBER'] == 21165]

In [None]:
def num_unique_berg_ids(df, col_name):
    # import pandas as pd
    if not isinstance(df, pd.DataFrame):
        print('Error: argument df must be a pandas dataframe.')
    elif not isinstance(col_name, str):
        print('Error: argument col_name must be a string.')
    else:
        #return df[col_name].size  # non-unique obs
        return df[col_name].nunique()

In [None]:
num_unique_berg_ids(df, 'BERG_NUMBER')

In [None]:
def arr_unique_berg_ids(df, col_name):
    # import pandas as pd
    if not isinstance(df, pd.DataFrame):
        print('Error: argument df must be a pandas dataframe.')
    elif not isinstance(col_name, str):
        print('Error: argument col_name must be a string.')
    else:
        return df[col_name].unique()    

In [None]:
arr_unique_berg_ids(df, 'BERG_NUMBER')

In [None]:
def obs_per_berg(df, col_name):
    # import pandas as pd
    if not isinstance(df, pd.DataFrame):
        print('Error: argument df must be a pandas dataframe.')
    elif not isinstance(col_name, str):
        print('Error: argument col_name must be a string.')
    else:
        return df.groupby([col_name]).size()
    

In [None]:
unique_obs_df = obs_per_berg(df, 'BERG_NUMBER')

In [None]:
unique_obs_df.describe()

In [None]:
unique_obs_df.argmax()

In [None]:
data = np.vstack((df['LATITUDE'], df['LONGITUDE']))

In [None]:
plt.scatter(df['LONGITUDE'], df['LATITUDE'])

In [None]:
from mpl_toolkits.basemap import Basemap

In [None]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 10)

In [None]:
# Lambert Conformal Conic map.
min_lon = -70.
max_lon = -10.
min_lat = 30.
max_lat = 70.

m = Basemap(llcrnrlon = min_lon, llcrnrlat = min_lat,
            urcrnrlon = max_lon, urcrnrlat = max_lat,
            projection = 'lcc', lat_1 = 20.,
            lat_2 = 40., lon_0 = -60.,
            resolution = 'l', area_thresh = 1000.)

m.drawcoastlines()
m.drawcountries()
m.drawmapboundary(fill_color='#99ffff')
m.fillcontinents(color='#cc9966',lake_color='#99ffff')
m.drawparallels(np.arange(0,100,10),labels=[1,1,0,0])
m.drawmeridians(np.arange(-100,0,10),labels=[0,0,0,1])
plt.title('')

#m.plot(df['LONGITUDE'], df['LATITUDE'], linewidth = 1.5, color = 'r')
x, y = m(list(df['LONGITUDE']), list(df['LATITUDE']))
m.scatter(x,y, s=.5, marker ='.',  color = 'k')

plt.show()

In [None]:
# create new figure, axes instances.
fig = plt.figure()
ax = fig.add_axes([0.1,0.1,0.8,0.8])

# setup mercator map projection.
m = Basemap(llcrnrlon=-100.,llcrnrlat=20.,urcrnrlon=20.,urcrnrlat=60.,\
            rsphere=(6378137.00,6356752.3142),\
            resolution='l',projection='merc',\
            lat_0=40.,lon_0=-20.,lat_ts=20.)

m.drawcoastlines(zorder=0)
m.fillcontinents(zorder=0)
m.drawparallels(np.arange(10,90,20),labels=[1,1,0,1])
m.drawmeridians(np.arange(-180,180,30),labels=[1,1,0,1])
ax.set_title('')

m.plot(df['LONGITUDE'], df['LATITUDE'], linewidth = 1.5, color = 'r')
#m.scatter(df['LONGITUDE'], df['LATITUDE'], marker ='o', color = 'k')


plt.show()