In [5]:
#%matplotlib notebook
%matplotlib qt

#This notebook is a testbed for importing PEAC Center USAPI
#raifnall data using pandas and doing some quick analysis

import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as cols
import matplotlib.cm as cm

from mpl_toolkits.basemap import Basemap, shiftgrid

import numpy as np
import numpy.ma as ma

from datetime import date
import datetime

import calendar

from netCDF4 import Dataset

from scipy import signal, linalg, stats

from pycurrents.codas import to_day, to_date
from pycurrents.plot.mpltools import dday_to_mpl

from pycurrents.system import Bunch
from pycurrents.num import eof
from pycurrents.num import rangeslice

import pickle

from scipy.special import comb
#import metpy.calc

In [6]:
datadir = '/home/erin/machine_learning_635/Final_Project/netcdf_data_files/'
exceldatadir = '/home/erin/machine_learning_635/Final_Project/'

In [7]:
# NCEP UWND, VWND ahd HGT all go from Jan 1948 to present and are on a 2.5 deg grid
# the grid goes from 0.0E to 357.5E and 90N to 90S
uwnd_data = Dataset(datadir + 'uwnd.mon.mean.nc')
vwnd_data = Dataset(datadir + 'vwnd.mon.mean.nc')
hgt_data = Dataset(datadir + 'hgt.mon.mean.nc')

# read lats,lons
ncep_latitudes = uwnd_data.variables['lat'][:]
ncep_longitudes = uwnd_data.variables['lon'][:]

#read in ncep levels
# these levels are
#1000,925,850,700,600,500,400,300,250,200,150,100,70,50,30,20,10
ncep_levels = uwnd_data.variables['level'][:]

#Choose pressure level now, to save obn memory
pressure_selection = ncep_levels[:] == 850


#time in hours since 1800-01-01 00:00:0.0

ncep_time = uwnd_data.variables['time'][:]

# get  wind data.
uin = np.squeeze(uwnd_data.variables['uwnd'][:, pressure_selection, :, :])
vin = np.squeeze(vwnd_data.variables['vwnd'][:, pressure_selection, :, :])
hgtin = np.squeeze(hgt_data.variables['hgt'][:, pressure_selection, :, :])

uwnd_data.close()
vwnd_data.close()
hgt_data.close()

# Let's just convert it to days:
ncep_dday_1800 = ncep_time / 24

#----------------------------------------------------------------
#GPCP data goes from Jan 1979 to present and is on a 2.5 deg grid
#same grid as NCEP
precip_data = Dataset(datadir + 'precip.mon.mean.nc')

#time in hours since 1800-01-01 00:00:0.0
precip_time = precip_data.variables['time'][:]

# read lats,lons
precip_latitudes = precip_data.variables['lat'][:]
precip_longitudes = precip_data.variables['lon'][:]

# get precip data.
precip_in = precip_data.variables['precip'][:, :, :]

precip_data.close()

# GPCP time is in days since jan 1 1800
precip_dday_1800 = precip_time

#----------------------------------------------------------------
#ERSST v4 data goes from Jan 1854 to present and is on a 2 deg grid
#88.0N - 88.0S, 0.0E - 358.0E.
ersst_data = Dataset(datadir + 'sst.mnmean.v4.nc')

# read lats,lons
ersst_latitudes = ersst_data.variables['lat'][:]
ersst_longitudes = ersst_data.variables['lon'][:]

#time in hours since 1800-01-01 00:00:0.0
ersst_time = ersst_data.variables['time'][:]

# get  wind data.
ersst_in = ersst_data.variables['sst'][:, :, :]

ersst_data.close()

# ERSST time is in days since jan 1 1800
ersst_dday_1800 = ersst_time

In [8]:
def extend(dday_1800, dat):
    """
    Return dday, dat extended to fill out the last year.
    
    This function was modifyed to take in only one data variable.
    """
    n_orig = len(dday_1800)
    ymdhms = to_date(1800, dday_1800)
    nmissing = 12 - ymdhms[-1, 1]
    nmonths = n_orig + nmissing
    ymdhms_new = np.zeros((nmonths, 6), dtype=np.uint16)
    ymdhms_new[:n_orig, :] = ymdhms
    ymdhms_new[n_orig:, 0] = ymdhms[-1, 0]
    # Fill in the remaining months:
    ymdhms_new[n_orig:, 1] = np.arange(ymdhms[-1, 1] + 1, 13)
    ymdhms_new[n_orig:, 2] = 1
    dday_new = to_day(1800, ymdhms_new)
    
    shape_new = (nmonths, dat.shape[1], dat.shape[2])
    ## EF: modify to work with masked array or ndarray
    if np.ma.isMA(dat):
        dat_new = np.ma.masked_all(shape_new, float)
    else:    
        dat_new = np.nan + np.zeros(shape_new, float)
    
    dat_new[:n_orig, :, :] = dat
    
    #This Function has been modifyed to also return the date of the last valid data and its ymdhms index
    
    last_valid_date = to_date(1800,dday_1800[-1])
    last_valid_index = n_orig -1
   
    return Bunch(dday_1800=dday_new,
                 ymdhms=ymdhms_new,
                 dat=dat_new,
                 last_valid_date=last_valid_date,
                 last_valid_index=last_valid_index)

def cal_climatology_and_anomaly(data,ymdhms,last_valid_date,last_valid_index,latitudes,longitudes,start_year, end_year):
                           
    #Select the years for climatology from the new ymdhms_padded
    # All the monthly data for the years you want to calculate the climatology
    
    clim_selection = ((ymdhms[:, 0] >= start_year) & (ymdhms[:, 0] <= end_year))
    
    # EF: Trim everything right at the start.
    data = data[clim_selection]
    ymdhms = ymdhms[clim_selection]
    
    #Calculate the total numbers of years in the climatology
    length_in_years = ymdhms[-1, 0] - ymdhms[0, 0] + 1
    
    # EF: ensure we have masked arrays and no nans
    data = np.ma.masked_invalid(data)
    
    #reshape the matrix so that is has dimensions of [years, months, lat, lon]
    reshaped_data = np.reshape(data, (length_in_years, 12, 
                                      len(latitudes), 
                                      len(longitudes)))
    #Calculate the climatology by using nanmeans, along the yeas axis
    #climatology = np.nanmean(reshaped_data, axis=0)
    
    climatology = reshaped_data.mean(axis=0)
    anomaly = reshaped_data - climatology
    anomaly = anomaly.reshape(data.shape)
    
    if end_year < last_valid_date[0]:
        last_valid_date_new = ymdhms[-1,:]
        last_valid_index_new = -1
    else:
        last_valid_date_new = last_valid_date
        last_valid_index_new = np.where((ymdhms[:,0]== last_valid_date[0]) &
                                        (ymdhms[:,1]== last_valid_date[1]) &
                                        (ymdhms[:,2]== last_valid_date[2]))[0][0]
    
    return Bunch(climatology=climatology,
                 anomaly=anomaly, 
                 ymdhms=ymdhms,
                 last_valid_date = last_valid_date_new,
                 last_valid_index = last_valid_index_new)

def seasonal_anomaly(m_anom,last_valid_index):
    s_anom = np.ma.zeros(m_anom.shape, float)
    orig_mask = np.ma.getmaskarray(m_anom)
    
    if last_valid_index == -1:
        s_anom[1:-1] = (m_anom[:-2] + m_anom[1:-1] + m_anom[2:]) / 3
    
        s_anom[0] = (m_anom[0] + m_anom[1]) / 2
        s_anom[-1] = (m_anom[-2] + m_anom[-1]) / 2
    
    else:
        #s_anom[1:last_valid_index-1,:,:] = (m_anom[:last_valid_index-2,:,:] + 
        #                                    m_anom[1:last_valid_index-1,:,:] + 
        #                                    m_anom[2:last_valid_index,:,:]) / 3
        s_anom[1:last_valid_index,:,:] = (m_anom[:last_valid_index-1,:,:] + 
                                          m_anom[1:last_valid_index,:,:] + 
                                          m_anom[2:last_valid_index+1,:,:]) / 3
    
        s_anom[0,:,:] = (m_anom[0,:,:] + m_anom[1,:,:]) / 2
        s_anom[last_valid_index,:,:] = (m_anom[last_valid_index-1,:,:] + 
                                        m_anom[last_valid_index,:,:]) / 2
        
    s_anom_out = np.ma.array(s_anom, mask=orig_mask)
    
    return Bunch(s_anom = s_anom_out, 
                 last_valid_index = last_valid_index)

def seasonal_anomaly2(m_anom, nmin=2):
    newshape = [3] + list(m_anom.shape)
    newshape[1] += 2
    accum = np.ma.zeros(newshape, dtype=m_anom.dtype)
    accum[:] = np.ma.masked
    accum[0, :-2] = m_anom[:]
    accum[1, 1:-1] = m_anom[:]
    accum[2, 2:] = m_anom[:]
    out = accum.mean(axis=0)[1:-1]
    out = np.ma.masked_where(accum[:, 1:-1].count(axis=0) < nmin, out, copy=False)
    return Bunch(s_anom = out)

def running_sum(m_anom, window, nmin = 2):
    nmin = (window+1)/2
    chop = (window -1)/2

    newshape = [window] + list(m_anom.shape)
    newshape[1] += window -1
    accum = np.ma.zeros(newshape, dtype=m_anom.dtype)
    accum[:] = np.ma.masked
    for i in range(window):
        end = -window+i+1
        if end == 0:
            accum[i, i:] = m_anom[:]
        else:
            accum[i, i:end] = m_anom[:]
    out = accum.mean(axis=0)[chop:-chop]
    out = np.ma.masked_where(accum[:, chop:-chop].count(axis=0) < nmin, out, copy=False)
    return Bunch(s_anom = out)

def running_sum_2(m_anom, window, nmin = 2):
    
    if window % 2 == 0:
        nmin = (window)/2
        #nmin = 1
        chop = (window-1)

        newshape = [window] + list(m_anom.shape)
        newshape[1] += window -1
        accum = np.ma.zeros(newshape, dtype=m_anom.dtype)
        accum[:] = np.ma.masked
        for i in range(window):
            end = -window+i+1
            if end == 0:
                accum[i, i:] = m_anom[:]
            else:
                accum[i, i:end] = m_anom[:]
        if window != 2:
            start=window/2
            stop = (window/2) -1
            out = accum.mean(axis=0)[start:-stop]
            out = np.ma.masked_where(accum[:,start:-stop].count(axis=0) < nmin, out, copy=False)
        else:
            out = accum.mean(axis=0)[1:]
            out = np.ma.masked_where(accum[:,1:].count(axis=0) < nmin, out, copy=False)
        
    else:
        nmin = (window+1)/2
        chop = (window -1)/2

        newshape = [window] + list(m_anom.shape)
        newshape[1] += window -1
        accum = np.ma.zeros(newshape, dtype=m_anom.dtype)
        accum[:] = np.ma.masked
        for i in range(window):
            end = -window+i+1
            if end == 0:
                accum[i, i:] = m_anom[:]
            else:
                accum[i, i:end] = m_anom[:]
        out = accum.mean(axis=0)[chop:-chop]
        out = np.ma.masked_where(accum[:, chop:-chop].count(axis=0) < nmin, out, copy=False)
        
    return Bunch(s_anom = out)

In [9]:
#fill out the data for the missing months at the end of the last year of the dataset
precip = extend(precip_dday_1800,precip_in)

uwnd = extend(ncep_dday_1800,uin)
vwnd = extend(ncep_dday_1800,vin)
hgt = extend(ncep_dday_1800,hgtin)

sst = extend(ersst_dday_1800,ersst_in)

In [10]:
mpldays_precip = dday_to_mpl(1800, precip_dday_1800)
mpldays_ncep = dday_to_mpl(1800, ncep_dday_1800)

In [11]:
#Here we calculate the anomaly
precip_ca = cal_climatology_and_anomaly(precip.dat,precip.ymdhms,
                                        precip.last_valid_date, precip.last_valid_index,
                                        precip_latitudes,
                                        precip_longitudes,1979,2016)

uwnd_ca = cal_climatology_and_anomaly(uwnd.dat,uwnd.ymdhms,
                                      uwnd.last_valid_date,uwnd.last_valid_index,
                                      ncep_latitudes,
                                      ncep_longitudes,1965,2016)

vwnd_ca = cal_climatology_and_anomaly(vwnd.dat,vwnd.ymdhms,
                                      vwnd.last_valid_date,vwnd.last_valid_index,
                                      ncep_latitudes,
                                      ncep_longitudes,1965,2016)

hgt_ca = cal_climatology_and_anomaly(hgt.dat,hgt.ymdhms,
                                     hgt.last_valid_date,hgt.last_valid_index,
                                     ncep_latitudes,
                                     ncep_longitudes,1965,2016)

sst_ca = cal_climatology_and_anomaly(sst.dat,sst.ymdhms,
                                     sst.last_valid_date,sst.last_valid_index,
                                     ersst_latitudes,
                                     ersst_longitudes,1965,2016)

precip_seasonal_anom = running_sum_2(precip_ca.anomaly,6, nmin=4)

uwnd_seasonal_anom = running_sum_2(uwnd_ca.anomaly,3, nmin=2)
vwnd_seasonal_anom = running_sum_2(vwnd_ca.anomaly,3, nmin=2)
hgt_seasonal_anom = running_sum_2(hgt_ca.anomaly,3, nmin=2)

sst_seasonal_anom = running_sum_2(sst_ca.anomaly,3, nmin=2)

  dout = self.data[indx]
  dout._mask = _mask[indx]


In [12]:
peac_station_rain = pd.ExcelFile(exceldatadir +'PEAC_station_rainfall_database.xlsx')
print(peac_station_rain.sheet_names)

ONI_file = pd.ExcelFile(exceldatadir +'CPC_ONI.xlsx')
print(ONI_file.sheet_names)

['Guam', 'Saipan', 'Koror', 'Yap', 'Chuuk', 'Pohnpei', 'Majuro', 'Kwajalein', 'PagoPago', 'Saipan_old', 'JFM', 'FMA', 'MAM', 'AMJ', 'MJJ', 'JJA', 'JAS', 'ASO', 'SON', 'OND', 'NDJ', 'DJF']
['ONI']


In [13]:
def read_USAPI_data(file_name, island_name):
    
    #Here we read the data into python from the excel file
    raw_data= pd.read_excel(file_name, sheetname = island_name, 
                            skiprows = 1, parse_cols = "A:O")
    
    #
    raw_matrix = raw_data.as_matrix(columns=None)
    print(type(raw_matrix), raw_matrix.dtype)
    
    years =  raw_matrix[:51,1]
    station_id = raw_matrix[0,0]

    rainfall = raw_matrix[:51,3:]

    rainfall[rainfall == '9999'] = np.nan
    rainfall[rainfall == "nan"] = np.nan

    rainfall = rainfall * 0.1
    
    # We have to convert from an object array to floating point.
    rainfall = np.ma.masked_invalid(rainfall.astype(float))
    print('rainfall: ', rainfall.dtype, rainfall[5, 5])
 
    #Initiate the ymdhms array as an array of int values filler with zeros
    #that has the length og the total number of time steps years*months
    ymdhms = np.zeros([51*12,6],dtype = np.int)

    #here we will the first column with the year values from the excel sheet. 
    #repeated each one 12 consecutive times
    ymdhms[:,0] = np.repeat(years,12)

    #Here we create a list of the month indices
    m = np.arange(1,13)
    #we tile the list so that the entire list 1..12 repeats as many times as the number of years
    mm = np.tile(m,51)

    ymdhms[:,1] = mm

    #the day column is filled with 15
    ymdhms[:,2] = 15
    
    dday = to_day(1800, ymdhms)
    mpldays = dday_to_mpl(1800, dday)
    mpldaysformated = mpl.dates.num2date(mpldays)

    out = Bunch(island_name = island_name, station_id = station_id, rainfall=rainfall,
               ymdhms = ymdhms, mpldaysformated = mpldaysformated)
    
    return out
    #return station_id, rainfall, ymdhms, mpldaysformated
    
def read_index_data(file_name, index_name):
    
    #Here we read the data into python from the excel file
    raw_data= pd.read_excel(file_name, sheetname = index_name, skiprows = 1, parse_cols = "A:M")
    
    #
    raw_matrix = raw_data.as_matrix(columns=None)
    print(type(raw_matrix), raw_matrix.dtype)
    
    years =  raw_matrix[:,0]
    index = raw_matrix[:,1:]
    

    # We have to convert from an object array to floating point.
    index = np.ma.masked_invalid(index.astype(float))
    print('index: ', index.dtype, index[5, 5])
 
    #Initiate the ymdhms array as an array of int values filler with zeros
    #that has the length og the total number of time steps years*months
    ymdhms = np.zeros([67*12,6],dtype = np.int)

    #here we will the first column with the year values from the excel sheet. 
    #repeated each one 12 consecutive times
    ymdhms[:,0] = np.repeat(years,12)

    #Here we create a list of the month indices
    m = np.arange(1,13)
    #we tile the list so that the entire list 1..12 repeats as many times as the number of years
    mm = np.tile(m,67)

    ymdhms[:,1] = mm

    #the day column is filled with 15
    ymdhms[:,2] = 15
    
    dday = to_day(1800, ymdhms)
    mpldays = dday_to_mpl(1800, dday)
    mpldaysformated = mpl.dates.num2date(mpldays)

    out = Bunch(index_name = index_name, index = index,
               ymdhms = ymdhms, mpldaysformated = mpldaysformated)
    
    return out
    #return station_id, rainfall, ymdhms, mpldaysformated

def seasonal_anomaly_old(m_anom):
    s_anom = np.ma.zeros(m_anom.shape, float)
    s_anom[1:-1] = (m_anom[:-2] + m_anom[1:-1] + m_anom[2:]) / 3
    s_anom[0] = (m_anom[0] + m_anom[1]) / 2
    s_anom[-1] = (m_anom[-2] + m_anom[-1]) / 2
    return s_anom

def seasonal_sum(m_anom):
    s_anom = np.ma.zeros(m_anom.shape, float)
    s_anom[1:-1] = (m_anom[:-2] + m_anom[1:-1] + m_anom[2:])
    s_anom[0] = (m_anom[0] + m_anom[1])
    s_anom[-1] = (m_anom[-2] + m_anom[-1])
    return s_anom

def running_sum(m_anom, window, nmin = 2):
    
    if window % 2 == 0:
        nmin = (window)/2
        #nmin = 1
        chop = (window-1)

        newshape = [window] + list(m_anom.shape)
        newshape[1] += window -1
        accum = np.ma.zeros(newshape, dtype=m_anom.dtype)
        accum[:] = np.ma.masked
        for i in range(window):
            end = -window+i+1
            if end == 0:
                accum[i, i:] = m_anom[:]
            else:
                accum[i, i:end] = m_anom[:]
        if window != 2:
            start=window/2
            stop = (window/2) -1
            out = accum.mean(axis=0)[start:-stop]
            out = np.ma.masked_where(accum[:,start:-stop].count(axis=0) < nmin, out, copy=False)
        else:
            out = accum.mean(axis=0)[1:]
            out = np.ma.masked_where(accum[:,1:].count(axis=0) < nmin, out, copy=False)
        
    else:
        nmin = (window+1)/2
        chop = (window -1)/2

        newshape = [window] + list(m_anom.shape)
        newshape[1] += window -1
        accum = np.ma.zeros(newshape, dtype=m_anom.dtype)
        accum[:] = np.ma.masked
        for i in range(window):
            end = -window+i+1
            if end == 0:
                accum[i, i:] = m_anom[:]
            else:
                accum[i, i:end] = m_anom[:]
        out = accum.mean(axis=0)[chop:-chop]
        out = np.ma.masked_where(accum[:, chop:-chop].count(axis=0) < nmin, out, copy=False)
        
    return Bunch(s_anom = out)

In [14]:
station_name_list = ["Koror", "Yap", "Guam", "Chuuk", "Pohnpei", "Kwajalein", "Majuro", "Saipan"]
#variable_name_list = [koror, yap, guam, chuuk, phonpei, kwajalein, majuro]

stations = Bunch()
for name in station_name_list:
    stations[name] = read_USAPI_data(peac_station_rain, name)
    
for raindata in stations.values():
    #print(raindata.island_name)
    #print(np.shape(raindata.rainfall))
    raindata.monmean = raindata.rainfall.mean(axis=0)
    raindata.monstd = raindata.rainfall.std(axis=0)
    raindata.monanom = raindata.rainfall - raindata.monmean

<class 'numpy.ndarray'> object
rainfall:  float64 498.1
<class 'numpy.ndarray'> object
rainfall:  float64 354.3
<class 'numpy.ndarray'> object
rainfall:  float64 128.6
<class 'numpy.ndarray'> object
rainfall:  float64 353.9
<class 'numpy.ndarray'> object
rainfall:  float64 599.9
<class 'numpy.ndarray'> object
rainfall:  float64 292.9
<class 'numpy.ndarray'> object
rainfall:  float64 340.5
<class 'numpy.ndarray'> object
rainfall:  float64 --


In [15]:
print(np.shape(stations.Kwajalein.monanom))
kwajalein_seasonal_anom = seasonal_anomaly_old(stations.Kwajalein.monanom.ravel())
kwajalein_seasonal_total = seasonal_sum(stations.Kwajalein.rainfall.ravel())
kwajalein_dry_season_total = running_sum(stations.Kwajalein.rainfall.ravel(),6)
kwajalein_dry_season_anom = running_sum(stations.Kwajalein.monanom.ravel(),6)

guam_seasonal_anom = seasonal_anomaly_old(stations.Guam.monanom.ravel())
guam_seasonal_total = seasonal_sum(stations.Guam.rainfall.ravel())
guam_dry_season_total = running_sum(stations.Guam.rainfall.ravel(),6)
guam_dry_season_anom = running_sum(stations.Guam.monanom.ravel(),6)


yap_seasonal_anom = seasonal_anomaly_old(stations.Yap.monanom.ravel())
yap_seasonal_total = seasonal_sum(stations.Yap.rainfall.ravel())
yap_dry_season_total = running_sum(stations.Yap.rainfall.ravel(),6)
yap_dry_season_anom = running_sum(stations.Yap.monanom.ravel(),6)

majuro_seasonal_anom = seasonal_anomaly_old(stations.Majuro.monanom.ravel())
majuro_seasonal_total = seasonal_sum(stations.Majuro.rainfall.ravel())
majuro_dry_season_total = running_sum(stations.Majuro.rainfall.ravel(),6)
majuro_dry_season_anom = running_sum(stations.Majuro.monanom.ravel(),6)

chuuk_seasonal_anom = seasonal_anomaly_old(stations.Chuuk.monanom.ravel())
chuuk_seasonal_total = seasonal_sum(stations.Chuuk.rainfall.ravel())
chuuk_dry_season_total = running_sum(stations.Chuuk.rainfall.ravel(),6)
chuuk_dry_season_anom = running_sum(stations.Chuuk.monanom.ravel(),6)

koror_seasonal_anom = seasonal_anomaly_old(stations.Koror.monanom.ravel())
koror_seasonal_total = seasonal_sum(stations.Koror.rainfall.ravel())
koror_dry_season_total = running_sum(stations.Koror.rainfall.ravel(),6)
koror_dry_season_anom = running_sum(stations.Koror.monanom.ravel(),6)

pohnpei_seasonal_anom = seasonal_anomaly_old(stations.Pohnpei.monanom.ravel())
pohnpei_seasonal_total = seasonal_sum(stations.Pohnpei.rainfall.ravel())
pohnpei_dry_season_total = running_sum(stations.Pohnpei.rainfall.ravel(),6)
pohnpei_dry_season_anom = running_sum(stations.Pohnpei.monanom.ravel(),6)

saipan_seasonal_anom = seasonal_anomaly_old(stations.Saipan.monanom.ravel())
saipan_seasonal_total = seasonal_sum(stations.Saipan.rainfall.ravel())
saipan_dry_season_total = running_sum(stations.Saipan.rainfall.ravel(),6)
saipan_dry_season_anom = running_sum(stations.Saipan.monanom.ravel(),6)

(51, 12)


  dout = self.data[indx]
  dout._mask = _mask[indx]


In [16]:
def spi_calculation(data, climatology_series):
    
    good_data = data[~data.mask]
    good_climatology = climatology_series[~climatology_series.mask]
    
    spi = np.empty_like(good_data)
    spi[:] = np.NAN

    #fit gamma distribution to climatology series
    fit_alpha, fit_loc, fit_beta = stats.gamma.fit(good_climatology)

    #find cumulative probabilities of data from fitted distribution
    data_cdf = stats.gamma.cdf(good_data, fit_alpha, loc=fit_loc, scale=fit_beta)

    # find the percent points from the random normal dist

    spi[:] = stats.norm.ppf(data_cdf, loc=0, scale=1)
    
    return spi

In [17]:
kwajalein_dry_season_total_matrix = kwajalein_dry_season_total.s_anom.reshape(51, 12)
kwajalein_dry_season_anom_matrix = kwajalein_dry_season_anom.s_anom.reshape(51, 12)
majuro_dry_season_anom_matrix = majuro_dry_season_anom.s_anom.reshape(51, 12)
guam_dry_season_anom_matrix = guam_dry_season_anom.s_anom.reshape(51, 12)
yap_dry_season_anom_matrix = yap_dry_season_anom.s_anom.reshape(51, 12)
koror_dry_season_anom_matrix = koror_dry_season_anom.s_anom.reshape(51, 12)
chuuk_dry_season_anom_matrix = chuuk_dry_season_anom.s_anom.reshape(51, 12)
pohnpei_dry_season_anom_matrix = pohnpei_dry_season_anom.s_anom.reshape(51, 12)
saipan_dry_season_anom_matrix = saipan_dry_season_anom.s_anom.reshape(51, 12)

#Put ONI data into a pandas table
season_list = ['DJF' , 'JFM' , 'FMA',
               'MAM' , 'AMJ' , 'MJJ',
               'JJA' , 'JAS' , 'ASO',
               'SON' , 'OND' , 'NDJ']

seven_month_list = ['Oct-Jan-Apr' , 'Nov-Feb-May' , 'Dec-Mar-Jun',
                    'Jan-Apr-Jul' , 'Feb-May-Aug' , 'Mar-Jun-Sep',
                    'Apr-Jul-Oct' , 'May-Aug-Nov' , 'Jun-Sep-Dec',
                    'Jul-Oct-Jan' , 'Aug-Nov-Feb' , 'Sep-Dec-Mar']   

six_month_list = ['Nov-Apr' , 'Dec-May' , 'Jan-Jun',
                  'Feb-Jul' , 'Mar-Aug' , 'Apr-Sep',
                  'May-Oct' , 'Jun-Nov' , 'Jul-Dec',
                  'Aug-Jan' , 'Sep-Feb' , 'Oct-Mar']    

kawj_dry_season_total_df = pd.DataFrame(kwajalein_dry_season_total_matrix, columns = six_month_list, index = range(1966,2017))
kawj_dry_season_anom_df = pd.DataFrame(kwajalein_dry_season_anom_matrix, columns = six_month_list, index = range(1966,2017))
#kawj_df

In [18]:
kwajalein_dry_season_total_matrix = kwajalein_dry_season_total.s_anom.reshape(51, 12)

kwajalein_spi_matrix = np.empty_like(kwajalein_dry_season_total_matrix)
kwajalein_spi_matrix[:] = np.NAN

for col in range(kwajalein_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(kwajalein_dry_season_total_matrix[:,col]),
                          np.squeeze(kwajalein_dry_season_total_matrix[:,col]))

    kwajalein_spi_matrix[:len(spi),col] = spi
    

    
guam_dry_season_total_matrix = guam_dry_season_total.s_anom.reshape(51, 12)
guam_spi_matrix = np.empty_like(guam_dry_season_total_matrix)
guam_spi_matrix[:] = np.NAN

for col in range(guam_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(guam_dry_season_total_matrix[:,col]),
                          np.squeeze(guam_dry_season_total_matrix[:,col]))

    guam_spi_matrix[:len(spi),col] = spi
    
    
yap_dry_season_total_matrix = yap_dry_season_total.s_anom.reshape(51, 12)
yap_spi_matrix = np.empty_like(yap_dry_season_total_matrix)
yap_spi_matrix[:] = np.NAN

for col in range(yap_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(yap_dry_season_total_matrix[:,col]),
                          np.squeeze(yap_dry_season_total_matrix[:,col]))

    yap_spi_matrix[:len(spi),col] = spi
    
majuro_dry_season_total_matrix = majuro_dry_season_total.s_anom.reshape(51, 12)
majuro_spi_matrix = np.empty_like(majuro_dry_season_total_matrix)
majuro_spi_matrix[:] = np.NAN

for col in range(majuro_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(majuro_dry_season_total_matrix[:,col]),
                          np.squeeze(majuro_dry_season_total_matrix[:,col]))

    majuro_spi_matrix[:len(spi),col] = spi
    
    
koror_dry_season_total_matrix = koror_dry_season_total.s_anom.reshape(51, 12)
koror_spi_matrix = np.empty_like(koror_dry_season_total_matrix)
koror_spi_matrix[:] = np.NAN

for col in range(koror_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(koror_dry_season_total_matrix[:,col]),
                          np.squeeze(koror_dry_season_total_matrix[:,col]))

    koror_spi_matrix[:len(spi),col] = spi
    
    
chuuk_dry_season_total_matrix = chuuk_dry_season_total.s_anom.reshape(51, 12)
chuuk_spi_matrix = np.empty_like(chuuk_dry_season_total_matrix)
chuuk_spi_matrix[:] = np.NAN

for col in range(chuuk_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(chuuk_dry_season_total_matrix[:,col]),
                          np.squeeze(chuuk_dry_season_total_matrix[:,col]))

    chuuk_spi_matrix[:len(spi),col] = spi
    
    
pohnpei_dry_season_total_matrix = pohnpei_dry_season_total.s_anom.reshape(51, 12)
pohnpei_spi_matrix = np.empty_like(pohnpei_dry_season_total_matrix)
pohnpei_spi_matrix[:] = np.NAN

for col in range(pohnpei_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(pohnpei_dry_season_total_matrix[:,col]),
                          np.squeeze(pohnpei_dry_season_total_matrix[:,col]))

    pohnpei_spi_matrix[:len(spi),col] = spi
    
saipan_dry_season_total_matrix = saipan_dry_season_total.s_anom.reshape(51, 12)
saipan_spi_matrix = np.empty_like(saipan_dry_season_total_matrix)
saipan_spi_matrix[:] = np.NAN
saipan_spi_matrix_temp = np.empty_like(saipan_dry_season_total_matrix)
saipan_spi_matrix_temp[:] = np.NAN

for col in range(saipan_dry_season_total_matrix.shape[1]):
    #print(kwajalein_seasonal_total_matrix[:,col])
    spi = spi_calculation(np.squeeze(saipan_dry_season_total_matrix[:,col]),
                          np.squeeze(saipan_dry_season_total_matrix[:,col]))

    saipan_spi_matrix_temp[-len(spi):,col] = spi

saipan_spi_matrix_temp[saipan_spi_matrix_temp == np.inf] = 7
saipan_mask = np.ma.getmask(saipan_dry_season_total_matrix)
saipan_spi_matrix = np.ma.masked_invalid(np.ma.array(saipan_spi_matrix_temp,mask = saipan_mask))
    
#Put ONI data into a pandas table
seven_month_list = ['Oct-Jan-Apr' , 'Nov-Feb-May' , 'Dec-Mar-Jun',
                    'Jan-Apr-Jul' , 'Feb-May-Aug' , 'Mar-Jun-Sep',
                    'Apr-Jul-Oct' , 'May-Aug-Nov' , 'Jun-Sep-Dec',
                    'Jul-Oct-Jan' , 'Aug-Nov-Feb' , 'Sep-Dec-Mar']    
    
six_month_list = ['Nov-Apr' , 'Dec-May' , 'Jan-Jun',
                  'Feb-Jul' , 'Mar-Aug' , 'Apr-Sep',
                  'May-Oct' , 'Jun-Nov' , 'Jul-Dec',
                  'Aug-Jan' , 'Sep-Feb' , 'Oct-Mar']    

In [19]:
average_all_station_spi_matrix = np.mean(np.array([kwajalein_spi_matrix, 
                                                   majuro_spi_matrix, 
                                                   guam_spi_matrix, 
                                                   yap_spi_matrix,
                                                   chuuk_spi_matrix,
                                                   koror_spi_matrix,
                                                   pohnpei_spi_matrix]), axis=0)


average_all_station_dry_season_anom_matrix = np.mean(np.array([kwajalein_dry_season_anom_matrix, 
                                                               majuro_dry_season_anom_matrix, 
                                                               guam_dry_season_anom_matrix, 
                                                               yap_dry_season_anom_matrix,
                                                               chuuk_dry_season_anom_matrix, 
                                                               koror_dry_season_anom_matrix, 
                                                               pohnpei_dry_season_anom_matrix,]), axis=0)

southern_station_spi_matrix = np.mean(np.array([majuro_spi_matrix, 
                                                   chuuk_spi_matrix,
                                                   koror_spi_matrix,
                                                   pohnpei_spi_matrix]), axis=0)


southern_station_dry_season_anom_matrix = np.mean(np.array([majuro_dry_season_anom_matrix, 
                                                               chuuk_dry_season_anom_matrix, 
                                                               koror_dry_season_anom_matrix, 
                                                               pohnpei_dry_season_anom_matrix,]), axis=0)

average_2_station_spi_matrix = np.mean(np.array([kwajalein_spi_matrix, 
                                                   guam_spi_matrix]), axis=0)


average_2_station_dry_season_anom_matrix = np.mean(np.array([kwajalein_dry_season_anom_matrix, 
                                                               guam_dry_season_anom_matrix]), axis=0)

average_all_station_SPI_df = pd.DataFrame(average_all_station_spi_matrix, columns = six_month_list, index = range(1966,2017))
average_all_station_dry_season_anom_df = pd.DataFrame(average_all_station_dry_season_anom_matrix, columns = six_month_list, index = range(1966,2017))

average_2_station_SPI_df = pd.DataFrame(average_2_station_spi_matrix, columns = six_month_list, index = range(1966,2017))
average_2_station_dry_season_anom_df = pd.DataFrame(average_2_station_dry_season_anom_matrix, columns = six_month_list, index = range(1966,2017))

southern_station_SPI_df = pd.DataFrame(southern_station_spi_matrix, columns = six_month_list, index = range(1966,2017))
southern_station_dry_season_anom_df = pd.DataFrame(southern_station_dry_season_anom_matrix, columns = six_month_list, index = range(1966,2017))



oni = read_index_data(ONI_file, "ONI")
oni_selection = ((oni.ymdhms[:, 0] >= 1965))
oni_time_series = oni.index.ravel()
oni_period = oni_time_series[oni_selection]
oni_period_matrix = oni_period.reshape(52,12)
ONI_df = pd.DataFrame(oni_period_matrix, columns = season_list, index = range(1965,2017))

<class 'numpy.ndarray'> float64
index:  float64 -0.6


In [20]:
hgt_2_lat_sl = rangeslice(ncep_latitudes, (10, 20.001))
hgt_2_lon_sl = rangeslice(ncep_longitudes, (140, 180.001))

hgt_2_avg = np.nanmean(hgt_seasonal_anom.s_anom[:,:,hgt_2_lon_sl], axis=2)
hgt_2_avg = np.nanmean(hgt_2_avg[:,hgt_2_lat_sl], axis=1)

hgt_2_avg_matrix = hgt_2_avg.reshape(52, 12)

hgt_2_avg_matrix_df = pd.DataFrame(hgt_2_avg_matrix, columns = season_list, index = range(1965,2017))

In [21]:
uwnd_2_avg = np.nanmean(uwnd_seasonal_anom.s_anom[:,:,hgt_2_lon_sl], axis=2)
uwnd_2_avg = np.nanmean(uwnd_2_avg[:,hgt_2_lat_sl], axis=1)

uwnd_2_avg_matrix = uwnd_2_avg.reshape(52, 12)

uwnd_2_avg_matrix_df = pd.DataFrame(uwnd_2_avg_matrix, columns = season_list, index = range(1965,2017))

In [22]:
uwnd_3_avg = np.nanmean(uwnd_seasonal_anom.s_anom[:,:,hgt_2_lon_sl], axis=2)
uwnd_3_avg = np.nanmean(uwnd_3_avg[:,hgt_2_lat_sl], axis=1)

uwnd_3_avg_matrix = uwnd_3_avg.reshape(52, 12)

new_wind_index = uwnd_2_avg_matrix - uwnd_3_avg_matrix

new_wind_index_df = pd.DataFrame(new_wind_index, columns = season_list, index = range(1965,2017))

In [23]:
predictor_train_df = pd.concat([ONI_df.loc[1965:2004]['SON'],ONI_df.loc[1965:2004]['JJA'],
                         uwnd_2_avg_matrix_df.loc[1965:2004]['SON'], uwnd_2_avg_matrix_df.loc[1965:2004]['JJA']],
                         axis = 1, keys = ['SON ONI', 'JJA ONI', 'SON uwnd', 'JJA uwnd'])

predictor_test_df = pd.concat([ONI_df.loc[2005:2015]['SON'],ONI_df.loc[2005:2015]['JJA'],
                         uwnd_2_avg_matrix_df.loc[2005:2015]['SON'], uwnd_2_avg_matrix_df.loc[2005:2015]['JJA']],
                         axis = 1, keys = ['SON ONI', 'JJA ONI', 'SON uwnd', 'JJA uwnd'])

predictand_train_df = average_2_station_SPI_df.loc[1966:2005]['Dec-May']
predictand_test_df = average_2_station_SPI_df.loc[2006:2016]['Dec-May']

##  Large data (small grid) set

In [25]:
sst_area_lats = rangeslice(ersst_latitudes, (-30, 30.001))
sst_area_lons = rangeslice(ersst_longitudes, (110, 300.001))

sst_test = sst_seasonal_anom.s_anom[:,sst_area_lats,sst_area_lons]

sst_selection = (sst_ca.ymdhms[:, 1] == 10)
sst_selection = np.nonzero(sst_selection)[0]
sst_test_region_son = sst_test[sst_selection]
sst_flat = np.reshape(sst_test_region_son,(52,2976))

sst_predictor_train = sst_flat[:40,:]
sst_predictor_test = sst_flat[40:-1,:]

uwnd_area_lats = rangeslice(ncep_latitudes, (-30, 50.001))
uwnd_area_lons = rangeslice(ncep_longitudes, (110, 300.001))

uwnd_test = uwnd_seasonal_anom.s_anom[:,uwnd_area_lats, uwnd_area_lons]

uwnd_selection = (uwnd_ca.ymdhms[:, 1] == 10)
uwnd_selection = np.nonzero(uwnd_selection)[0]
uwnd_test_region_son = uwnd_test[uwnd_selection]
print(np.shape(uwnd_test_region_son))
uwnd_flat = np.reshape(uwnd_test_region_son,(52,2541))

uwnd_predictor_train = uwnd_flat[:40,:]
uwnd_predictor_test = uwnd_flat[40:-1,:]

predictor_train_matrix = np.hstack((sst_predictor_train, uwnd_predictor_train))
predictor_test_matrix = np.hstack((sst_predictor_test, uwnd_predictor_test))

print(len(predictand_train_df))
print(np.shape(predictor_train_matrix))

print(len(predictand_test_df))
print(np.shape(predictor_test_matrix))

(52, 33, 77)
40
(40, 5517)
11
(11, 5517)


# Random Forest Regressor Here

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
big_forest = RandomForestRegressor(n_estimators=100)

In [28]:
big_forest.fit(uwnd_predictor_train, predictand_train_df)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [29]:
big_prediction_result = big_forest.predict(uwnd_predictor_test)

In [30]:
dummy = np.arange(2006,2017)
plt.plot(dummy,predictand_test_df,dummy, big_prediction_result)
plt.legend(['data', 'prediction'], loc='upper left')

<matplotlib.legend.Legend at 0x7fe5e63d2550>