In [2]:
import numpy as np
import math
from datetime import timedelta
from operator import attrgetter
from netCDF4 import Dataset
from netCDF4 import MFDataset
from collections import defaultdict
import xarray as xr
import pandas as pd
import os

In [3]:
'''
Written by Chad Valencia, chadvalencia@gmail.com

for David Lindo Atichati, PhD, CUNY

Dataset ETL

The Purpose of this file is to parse data into organized csvs for different visualizations. By doing so
we can cut down processing time of the dataset and visualize accordingly.
'''

'\nWritten by Chad Valencia, chadvalencia@gmail.com\n\nfor David Lindo Atichati, PhD, CUNY\n\nDataset ETL\n\nThe Purpose of this file is to parse data into organized csvs for different visualizations. By doing so\nwe can cut down processing time of the dataset and visualize accordingly.\n'

In [4]:
# Open the two eddy-tracking netCDF files (cyclonic and anticyclonic).
# Both handles stay open because later cells pass them to dfnc() and dfdist().
cycnc = Dataset('./output_tracking/Cyclonic.nc')
acycnc = Dataset('./output_tracking/Anticyclonic.nc')

In [5]:
def poslon(l):
    '''
    Shift negative longitudes into the positive range by adding 360.

    Values that are already zero or positive pass through untouched.
    Returns a new list in the same order as the input; the input itself
    is not modified.
    '''
    return [360 + value if value < 0 else value for value in l]
        
def dfnc(nc):
    '''
    Build a Pandas dataframe of eddy observations from an nc dataset.

    Reads the variables 'track', 'j1', 'lat', 'lon', 'radius_e' and 'A'
    and returns them as the columns
    eddy, date, lat, lon, radius, amplitude (one row per observation).
    Longitudes are passed through poslon() so negative values become
    positive, and 'j1' is converted from Julian day numbers to datetimes.
    '''
    # (output column, variable name in the dataset), in output column order.
    sources = (
        ('eddy', 'track'),
        ('date', 'j1'),
        ('lat', 'lat'),
        ('lon', 'lon'),
        ('radius', 'radius_e'),  # radius in km
        ('amplitude', 'A'),
    )

    rows = []
    for column, variable in sources:
        values = list(nc[variable])
        if column == 'lon':
            values = poslon(values)
        rows.append(values)

    # Each variable is one row here; transposing turns them into columns.
    frame = pd.DataFrame(rows).T
    frame.columns = [column for column, _ in sources]
    frame['date'] = pd.to_datetime(frame['date'], origin='julian', unit='D')
    return frame

In [6]:
# Convert each netCDF dataset into a dataframe with one row per eddy observation.
cyc = dfnc(cycnc)
acyc = dfnc(acycnc)

In [7]:
def pos_compare(lat, lon, lat_bounds=(15.92, 34.04043), lon_bounds=(176.04, 209.0341)):
    '''
    Report whether a position lies strictly inside a lat/lon bounding box.

    Parameters
    ----------
    lat, lon : scalar
        Position to test. lon is compared directly against lon_bounds, so
        it must use the same convention as the bounds (the default bounds
        lie above 180, i.e. they expect longitudes already shifted to be
        positive).
    lat_bounds, lon_bounds : (low, high) pair, optional
        Exclusive limits of the box. The defaults are the study-region
        limits that were previously hard-coded in this function.

    Returns
    -------
    bool
        True only when both coordinates are strictly between their limits.
        A position exactly on a border, or a NaN coordinate, gives False.
    '''
    lat_low, lat_high = lat_bounds
    lon_low, lon_high = lon_bounds
    # bool() keeps the return a plain Python bool even for numpy scalars.
    return bool(lat_low < lat < lat_high and lon_low < lon < lon_high)

def truecol(df):
    '''
    Keep only the rows of df whose position passes pos_compare().

    Each row's 'lat' and 'lon' values are tested with pos_compare(); rows
    for which it returns True are kept, with their original index labels
    and columns.

    The input frame is left untouched (no helper column is added to it),
    and rows are matched by position, so the frame may carry any index,
    not only the default 0..n-1 one.

    Returns a new dataframe; an empty input yields an empty frame with the
    same columns.
    '''
    inside = [pos_compare(lat, lon) for lat, lon in zip(df['lat'], df['lon'])]
    # Reuse df's own index so the mask lines up row-for-row, and force a
    # bool dtype so an empty mask is still treated as a boolean selector.
    mask = pd.Series(inside, index=df.index, dtype=bool)
    return df.loc[mask]

In [8]:
# Restrict both eddy tables to observations inside the pos_compare() bounding box.
cdf = truecol(cyc)
adf = truecol(acyc)

In [9]:
# Saving cdf to ./data/cdf.csv and adf to ./data/adf.csv
# cdf is a dataframe of longitude-adjusted cyclonic eddies, with 1 degree removed on each border, unbinned.
# adf is a dataframe of longitude-adjusted anticyclonic eddies, with 1 degree removed on each border, unbinned.
# NOTE(review): to_csv also writes the row index, which is why the frames read
# back from these files show 'Unnamed: 0' columns.
cdf.to_csv('./data/cdf.csv')
adf.to_csv('./data/adf.csv')

In [10]:
### Skip to here if not first time running
# Reload the filtered cyclonic eddies from the CSV written above instead of
# rebuilding them from the netCDF file.
cdf = pd.read_csv('./data/cdf.csv')
# Show summary statistics of the numeric columns.
cdf.describe()

Unnamed: 0.1,Unnamed: 0,eddy,lat,lon,radius,amplitude
count,157718.0,157718.0,157718.0,157718.0,157718.0,157718.0
mean,87989.784508,4857.821295,25.285731,192.848019,47.682352,2.99654
std,50673.003517,2902.821074,5.290045,9.654817,27.229527,3.541591
min,0.0,1.0,15.920247,176.040161,15.0,0.054436
25%,44335.25,2428.0,20.78618,184.546986,27.2,0.853937
50%,88081.5,4727.0,25.169032,192.926651,39.6,1.79273
75%,131981.75,7352.0,29.955281,201.341152,60.75,3.756357
max,176047.0,10115.0,34.040428,209.034088,210.35,47.40176


In [11]:
# Reload the filtered anticyclonic eddies from the CSV written above.
adf = pd.read_csv('./data/adf.csv')
# Display the frame to inspect its columns and first rows.
adf

Unnamed: 0.1,Unnamed: 0,eddy,date,lat,lon,radius,amplitude
0,0,1.0,2009-05-01 12:00:00,16.711275,201.793015,30.55,0.327685
1,1,1.0,2009-05-02 12:00:00,16.678257,201.710434,33.75,0.406118
2,2,1.0,2009-05-03 12:00:00,16.689449,201.585663,29.85,0.347254
3,3,1.0,2009-05-04 12:00:00,16.632107,201.512939,36.25,0.509210
4,4,1.0,2009-05-05 12:00:00,16.580832,201.447174,31.45,0.429348
5,5,1.0,2009-05-06 12:00:00,16.651991,201.352310,28.40,0.310445
6,6,2.0,2009-05-01 12:00:00,19.849276,207.901001,29.05,0.891934
7,7,2.0,2009-05-02 12:00:00,19.811481,207.849670,31.45,0.963795
8,8,2.0,2009-05-03 12:00:00,19.806429,207.796921,29.05,0.822990
9,9,2.0,2009-05-04 12:00:00,19.815142,207.750305,31.50,0.927562


In [12]:
def latsquish(df):
    '''
    Collapse each eddy's observations into 1-degree latitude bins.

    A bin label 'lat2' is derived from 'lat' by integer conversion, and the
    rows are grouped by (eddy, lat2). The input frame is not modified.

    Returns a dataframe indexed by (eddy, lat2) with the columns
        eddy      - the eddy id of the group (also present in the index)
        date      - number of non-null dates in the group, not a date
        lat, lon, radius, amplitude - group means

    NOTE(review): astype(int) truncates toward zero, so latitudes in
    (-1, 1) would all share bin 0. This does not matter for purely positive
    latitudes - confirm before reusing on data that crosses the equator.
    '''
    binned = df.copy()
    binned['lat2'] = binned['lat'].astype(int)
    # String aliases are used instead of np.mean: passing numpy callables to
    # groupby.agg is deprecated in recent pandas and resolves to the same
    # groupby mean anyway.
    return binned.groupby(['eddy', 'lat2']).agg({
        'eddy': 'first',
        'date': 'count',
        'lat': 'mean',
        'lon': 'mean',
        'radius': 'mean',
        'amplitude': 'mean',
    })

def latlonsquish(df):
    '''
    Collapse each eddy's observations into 1-degree latitude/longitude cells.

    Integer bin labels 'lat2' and 'lon2' are derived from 'lat' and 'lon',
    combined into a (lat2, lon2) tuple column 'latlon', and the rows are
    grouped by (eddy, latlon). The input frame is not modified.

    Returns a dataframe indexed by (eddy, latlon) with the columns
        eddy       - the eddy id of the group (also present in the index)
        date       - number of non-null dates in the group, not a date
        lat, lon   - group means of the original coordinates
        lat2, lon2 - the integer cell the group belongs to
        radius, amplitude - group means

    NOTE(review): astype(int) truncates toward zero, so coordinates in
    (-1, 1) would all share bin 0 - confirm before reusing on data with
    negative coordinates.
    '''
    binned = df.copy()
    binned['lat2'] = binned['lat'].astype(int)
    binned['lon2'] = binned['lon'].astype(int)
    # Build the cell key from the bins just computed rather than converting
    # the coordinates a second time.
    binned['latlon'] = list(zip(binned['lat2'], binned['lon2']))
    # String aliases are used instead of np.mean: passing numpy callables to
    # groupby.agg is deprecated in recent pandas and resolves to the same
    # groupby mean anyway.
    return binned.groupby(['eddy', 'latlon']).agg({
        'eddy': 'first',
        'date': 'count',
        'lat': 'mean',
        'lon': 'mean',
        'lat2': 'first',
        'lon2': 'first',
        'radius': 'mean',
        'amplitude': 'mean',
    })

In [13]:
# Build the binned tables: c* = cyclonic (CE), a* = anticyclonic (AE);
# *ldf = binned by latitude only, *lldf = binned by latitude and longitude.
cldf = latsquish(cdf) # CE data for viz 5, flatten on latitude only
clldf = latlonsquish(cdf) # CE data for viz 4,6,7, flatten on both lat/lon 1 degree bins
aldf = latsquish(adf) # AE data for 5
alldf = latlonsquish(adf) #AE data for 4,6,7

In [14]:
# Persist the four binned tables so the visualizations can load them
# without redoing the grouping.
cldf.to_csv('./data/cldf.csv')
clldf.to_csv('./data/clldf.csv')
aldf.to_csv('./data/aldf.csv')
alldf.to_csv('./data/alldf.csv')

# Read them straight back, so the in-memory frames are the CSV versions
# (the group keys come back as ordinary columns rather than as the index).
cldf = pd.read_csv('./data/cldf.csv')
clldf = pd.read_csv('./data/clldf.csv')
aldf = pd.read_csv('./data/aldf.csv')
alldf = pd.read_csv('./data/alldf.csv')

In [15]:
#The Following Dataframe is for Vis 6, calculating non-linearity

In [16]:
def dfdist(nc):
    '''
    Build a Pandas dataframe of eddy positions and 'U' from an nc dataset.

    Reads the variables 'track', 'j1', 'lat', 'lon' and 'U' and returns
    them as the columns eddy, date, lat, lon, u (one row per observation).
    Longitudes are passed through poslon() so negative values become
    positive, and 'j1' is converted from Julian day numbers to datetimes.

    NOTE(review): 'U' is presumably the speed needed for the non-linearity
    calculation this frame is prepared for - confirm against the dataset.
    '''
    # (output column, variable name in the dataset), in output column order.
    sources = (
        ('eddy', 'track'),
        ('date', 'j1'),
        ('lat', 'lat'),
        ('lon', 'lon'),
        ('u', 'U'),
    )

    rows = []
    for column, variable in sources:
        values = list(nc[variable])
        if column == 'lon':
            values = poslon(values)
        rows.append(values)

    # Each variable is one row here; transposing turns them into columns.
    frame = pd.DataFrame(rows).T
    frame.columns = [column for column, _ in sources]
    frame['date'] = pd.to_datetime(frame['date'], origin='julian', unit='D')
    return frame

In [17]:
# Build the position/'U' tables for both eddy types, restricted to the
# pos_compare() bounding box.
adist = truecol(dfdist(acycnc))
cdist = truecol(dfdist(cycnc))
# Persist them, then reload so the in-memory frames match what the CSVs hold.
adist.to_csv('./data/adist.csv')
cdist.to_csv('./data/cdist.csv')
adist = pd.read_csv('./data/adist.csv')
cdist = pd.read_csv('./data/cdist.csv')