In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the comma-separated file 'combs_guadalquivir.csv' into a NumPy array of strings.
# NOTE(review): `combs` is not referenced again in the visible part of this notebook.
combs = np.loadtxt('combs_guadalquivir.csv', delimiter = ',', dtype = str)

In [5]:
combs.shape

(105, 2)

In [6]:
# Load the survey table and keep rows whose 'Date5.1' value lies strictly between
# '1970-01-01' and '2017-11-19'. The comparison is done on the raw column values
# against string literals, exactly as written.
guada = pd.read_csv('GuadaAB.csv').loc[
    lambda frame: (frame['Date5.1'] < '2017-11-19') & (frame['Date5.1'] > '1970-01-01')
]
guada

Unnamed: 0.1,Unnamed: 0,Code,Sample,Site,Tide,Date,Date5.1,Specie,A,B
0,1,111,1,1,1,1997-06-03,1997-06-19,Alburnus.alburnus,0.000000,0.000000
1,2,111,1,1,1,1997-06-03,1997-06-19,Alosa.fallax,0.000000,0.000000
2,3,111,1,1,1,1997-06-03,1997-06-19,Ameiurus.melas,0.000000,0.000000
3,4,111,1,1,1,1997-06-03,1997-06-19,Ammodytes.tobianus,0.000000,0.000000
4,5,111,1,1,1,1997-06-03,1997-06-19,Anchialina.agilis,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
374776,374777,21832,218,3,2,2015-09-15,2015-09-14,Umbrina.cirrosa,0.000000,0.000000
374777,374778,21832,218,3,2,2015-09-15,2015-09-14,Upogebia.deltaura,0.000000,0.000000
374778,374779,21832,218,3,2,2015-09-15,2015-09-14,Upogebia.pusilla,0.000000,0.000000
374779,374780,21832,218,3,2,2015-09-15,2015-09-14,Upogebia.tipica,0.000000,0.000000


In [7]:
def get_tide_list(tide="ebb"):
    """
    Helper function returning the tide codes that belong to a tidal phase.

    :param tide: "flood" or "ebb"
    :return: list of tide numbers ([1, 3] for flood, [2, 4] for ebb)
    :raises ValueError: if ``tide`` is neither "flood" nor "ebb"
    """
    if tide == "flood":
        return [1, 3]
    if tide == "ebb":
        return [2, 4]
    # The exception used to be constructed but never raised, so an invalid value
    # silently returned None and failed later in an unrelated place.
    raise ValueError("Invalid tide entered. Must be ebb or flood.")
    

def initialise_dataset(site=5, tide="flood", AB="A", scale=True, data_path="../../data/GuadaAB.csv"):
    """
    Create dataframe of the required form for pyEDM applications, for a given site-tide pair

    :param site: filter dataframe to data from a single site
    :param tide: filter dataframe to data from a tidal pair ("flood" or "ebb")
    :param AB: enter 'A' to consider variable abundance, or 'B' to consider variable biomass
    :param scale: if true, all abundances/biomasses scaled to zero mean unit variance
    :param data_path: location of the GuadaAB csv file; defaults to the path that was
        previously hard-coded in this function
    :return: pandas dataframe indexed by 'Date5.1' (as datetimes), with a leading
        integer "Time" column followed by one column per selected species
    :raises ValueError: if ``AB`` is neither 'A' nor 'B'
    """
    if AB not in ("A", "B"):
        raise ValueError("AB must be 'A' (abundance) or 'B' (biomass).")
    # pd.read_csv accepts a plain string path, so the (never imported) Path wrapper
    # that used to raise NameError here is not needed.
    spec_df = pd.read_csv(data_path)
    sig_species = [
        "Mesopodopsis.slabberi",
        "Neomysis.integer",
        "Rhopalophthalmus.tartessicus",
        "Pomatoschistus",
        "Engraulis.encrasicolus",
        "Crangon.crangon",
        "Palaemon.longirostris",
        "Sardina.pilchardus",
        "Palaemon.macrodactylus",
        "Dicentrarchus.punctatus",
        "Liza.ramada",
        "Liza.aurata",
        "Argyrosomus.regius",
        "Chelon.labrosus",
        "Cyprinus.carpio",
    ]
    # Drop the measure that was not requested together with unused bookkeeping columns.
    other_measure = "B" if AB == "A" else "A"
    sig_df = spec_df.drop([other_measure, "Date", "Code", "Sample"], axis=1)
    # The conversion result used to be discarded; store it so the index is a real
    # datetime and rows are ordered chronologically by the pivot below.
    sig_df["Date5.1"] = pd.to_datetime(sig_df["Date5.1"])
    sig_df = sig_df.set_index("Date5.1")
    sig_df = sig_df[sig_df.Specie.isin(sig_species)]
    sig_df = sig_df[sig_df.Site.eq(site)]
    sig_df = sig_df.loc[:, ~sig_df.columns.str.contains("^Unnamed")]
    sig_df = sig_df[sig_df.Tide.isin(get_tide_list(tide))]
    sig_df = sig_df.drop(["Site", "Tide"], axis=1)
    # One row per date, one column per species; pivot_table averages duplicates.
    df = sig_df.pivot_table(values=AB, index=["Date5.1"], columns=["Specie"])
    df = df.fillna(0)
    if scale:
        scaler = StandardScaler()
        df[df.columns] = scaler.fit_transform(df[df.columns])
    # Integer time axis placed first, ahead of the species columns.
    df.insert(loc=0, column="Time", value=range(len(df)))
    return df


def initialise_allsp(df, AB="A", scale=True):
    """
    Combine all data from different site-tide pairs to create a dataframe of the required form for pyEDM applications.

    :param df: long-format dataframe with (at least) the columns 'Date', 'Code', 'Sample',
        'Site', 'Tide', 'Date5.1', 'Specie', 'A' and 'B'; it is not modified
    :param AB: enter 'A' to consider variable abundance, or 'B' to consider variable biomass
    :param scale: if True, scale all data to 0 mean and unit variance. Scaling before applying pyEDM techniques advised
    :return: pandas dataframe indexed by 'Date5.1' (as datetimes), with a leading
        integer "Time" column followed by one column per selected species
    :raises ValueError: if ``AB`` is neither 'A' nor 'B'
    """
    if AB not in ("A", "B"):
        raise ValueError("AB must be 'A' (abundance) or 'B' (biomass).")
    sig_species = [
        "Mesopodopsis.slabberi",
        "Neomysis.integer",
        "Rhopalophthalmus.tartessicus",
        "Pomatoschistus",
        "Engraulis.encrasicolus",
        "Crangon.crangon",
        "Palaemon.longirostris",
        "Sardina.pilchardus",
        "Palaemon.macrodactylus",
        "Dicentrarchus.punctatus",
        "Liza.ramada",
        "Liza.aurata",
        "Argyrosomus.regius",
        "Chelon.labrosus",
        "Cyprinus.carpio",
    ]
    # Drop the measure that was not requested together with unused bookkeeping columns.
    # drop() returns a new frame, so the caller's dataframe is left untouched.
    other_measure = "B" if AB == "A" else "A"
    sig_df = df.drop([other_measure, "Date", "Code", "Sample"], axis=1)
    # The conversion result used to be discarded; store it so grouping and the final
    # index use real datetimes.
    sig_df["Date5.1"] = pd.to_datetime(sig_df["Date5.1"])
    sig_df = sig_df[sig_df.Specie.isin(sig_species)]
    sig_df = sig_df.loc[:, ~sig_df.columns.str.contains("^Unnamed")]
    # Sum over every site and tide sharing the same date and species.
    grouped_df = sig_df.groupby(["Date5.1", "Specie"], as_index=False).sum()
    grouped_df = grouped_df.drop(["Site", "Tide"], axis=1)
    grouped_df = grouped_df.set_index("Date5.1")
    wide_df = grouped_df.pivot_table(values=AB, index=["Date5.1"], columns=["Specie"])
    # Back-fill gaps from the next available date. DataFrame.bfill() replaces the
    # deprecated fillna(method="bfill"). Gaps after the last observation stay NaN.
    # TODO: decide whether filling with zeros would be more appropriate.
    wide_df = wide_df.bfill()
    if scale:
        scaler = StandardScaler()
        wide_df[wide_df.columns] = scaler.fit_transform(wide_df[wide_df.columns])
    # Integer time axis placed first, ahead of the species columns.
    wide_df.insert(loc=0, column="Time", value=range(len(wide_df)))
    return wide_df

In [22]:
# Build the wide, scaled species table from the date-filtered `guada` frame.
# NOTE(review): despite the `_B` suffix this call passes AB="A", i.e. it selects
# column 'A' (abundance per the function docstring), not 'B'.
guada_B = initialise_allsp(guada, AB="A", scale=True)

In [23]:
#guada_B.drop(['Time'], inplace = True, axis = 1)

In [24]:
#guada_B.drop(guada_B.tail(1).index,inplace=True)
guada_B

Specie,Time,Argyrosomus.regius,Chelon.labrosus,Crangon.crangon,Cyprinus.carpio,Dicentrarchus.punctatus,Engraulis.encrasicolus,Liza.aurata,Liza.ramada,Mesopodopsis.slabberi,Neomysis.integer,Palaemon.longirostris,Palaemon.macrodactylus,Pomatoschistus,Rhopalophthalmus.tartessicus,Sardina.pilchardus
Date5.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1997-06-19,0,-0.425090,-0.362363,-0.474677,0.181297,2.229152,-0.095824,-0.195175,-0.131541,-0.162186,2.511632,0.196230,-0.492144,-0.409040,-0.800610,-0.322784
1997-07-06,1,-0.392634,-0.362363,-0.335964,5.222183,-0.126221,0.898129,-0.198570,-0.138089,1.771438,3.339553,1.647588,-0.492144,0.608170,2.509766,-0.325143
1997-08-05,2,-0.392070,-0.362363,0.130194,0.243777,-0.238643,-0.255207,-0.198570,-0.138042,-0.021724,4.059527,3.392149,-0.492144,-0.082529,-0.640341,-0.333708
1997-09-03,3,-0.318527,-0.362363,-0.163550,0.037267,-0.248766,-0.003479,-0.198570,-0.138077,-0.111936,0.566429,1.848393,-0.492144,0.134840,-0.690145,-0.333708
1997-10-03,4,-0.404380,-0.362363,-0.227443,-0.187904,-0.211633,0.299113,-0.198570,-0.137687,-0.366056,1.084495,2.200784,-0.492144,-0.125129,-0.628725,-0.333708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-05-18,213,-0.383908,0.497892,0.046068,-0.283337,-0.243735,-0.224993,-0.057263,-0.120917,0.016211,-0.249766,-0.280876,-0.452561,1.072897,0.362587,-0.284337
2015-06-17,214,-0.412325,0.145240,-0.214714,-0.283337,-0.253631,0.273354,-0.138610,-0.133656,-0.536457,-0.279892,0.312501,-0.162311,0.358377,-0.460635,-0.318443
2015-07-14,215,-0.395331,-0.171793,-0.385025,-0.274246,-0.256235,0.061829,-0.147329,-0.132884,-0.410012,-0.282707,0.246012,0.184396,-0.276624,-0.686943,-0.333708
2015-08-12,216,-0.428887,-0.259323,-0.373187,-0.283337,-0.256815,-0.369143,-0.176775,-0.137675,-0.630384,-0.287497,0.625447,-0.363764,0.208999,-0.845351,-0.333708


In [25]:
# Column labels of guada_B as a plain list (includes the leading 'Time' column).
names = guada_B.columns.values.tolist()
names

['Time',
 'Argyrosomus.regius',
 'Chelon.labrosus',
 'Crangon.crangon',
 'Cyprinus.carpio',
 'Dicentrarchus.punctatus',
 'Engraulis.encrasicolus',
 'Liza.aurata',
 'Liza.ramada',
 'Mesopodopsis.slabberi',
 'Neomysis.integer',
 'Palaemon.longirostris',
 'Palaemon.macrodactylus',
 'Pomatoschistus',
 'Rhopalophthalmus.tartessicus',
 'Sardina.pilchardus']

In [26]:
# for name in names:
#     temp = np.array(guada_B[name], dtype = float)
#     np.save(name, temp)

In [27]:
import numpy as np
import pandas as pd
from itertools import combinations
import pyEDM
import seaborn as sns
from joblib import Parallel, delayed
from numpy.random import default_rng
from scipy.spatial.distance import squareform, pdist
from sklearn.preprocessing import StandardScaler



def recurrence_matrix(x_e, quantile_per=12.5):
    """
    Build a binary recurrence matrix from an embedded dataframe.

    Pairwise Chebyshev distances between the rows of ``x_e`` are computed, and every
    entry that does not exceed the ``quantile_per``-th percentile of the full square
    distance matrix (diagonal included) is set to 1; all other entries are 0.
    :param x_e: embedded dataframe of the required form such as that generated through ``pyEDM.Embed()``
    :param quantile_per: percentile of distances to set equal to 1
    :return: square integer numpy array of zeros and ones
    """
    condensed = pdist(x_e.values, metric="chebyshev")
    distances = squareform(condensed)
    cutoff = np.percentile(distances, quantile_per)
    return np.less_equal(distances, cutoff).astype(int)

def twins_list(len_df, Ng, r_mtx, obs_per_year=12):
    """
    Collect, for every time index, the indices of its twin points.

    Two embedded points i < j are twins when their columns of the recurrence matrix
    are identical and ``j - i`` is a multiple of ``obs_per_year``. Twin indices are
    stored shifted by ``len_df - Ng`` so that they refer to positions in the original
    series.
    :param len_df: length of original dataframe (i.e. length of time series)
    :param Ng: length of embedded dataframe
    :param r_mtx: recurrence matrix as a 2-D array (indexed by column)
    :param obs_per_year: seasonality of the data
    :return: tuple ``(A_arr, Q)``. Row k of ``A_arr`` starts with k itself followed by
        its twins and is padded with -1000; ``Q[k]`` is the number of valid entries in row k
    """
    offset = len_df - Ng
    neighbours = [[idx] for idx in range(len_df)]
    # Nested loops visit the pairs (first < second) in the same order as
    # itertools.combinations(range(Ng), 2).
    for first in range(Ng):
        for second in range(first + 1, Ng):
            if (second - first) % obs_per_year != 0:
                continue
            if not np.array_equal(r_mtx[:, first], r_mtx[:, second]):
                continue
            neighbours[first + offset].append(second + offset)
            neighbours[second + offset].append(first + offset)
    Q = np.array([len(group) for group in neighbours])
    # Rectangular container; -1000 marks unused slots.
    A_arr = np.full((len_df, np.max(Q)), -1000, dtype=int)
    for row, group in enumerate(neighbours):
        A_arr[row, : Q[row]] = group
    return A_arr, Q



def network_surrogates(len_df, twins_arr, Q, Ng, M=100, obs_per_year=12):
    """
    Generate surrogate index sequences with random walkers on the twin network.

    M walkers are started on every start node. At each step a walker may jump to a
    randomly chosen entry of its current row in ``twins_arr`` and then advances by one.
    Only walkers whose final position is still below ``Ng - 1`` are kept. Each kept walk
    is then extended backwards by ``len_df - Ng`` consecutive indices so the result has
    ``len_df`` rows.

    The random generator is created without a seed, so results differ between calls.

    :param len_df: length of original dataframe (i.e. length of time series)
    :param twins_arr: array of the twin points of every point (e.g. output of ``twins_list``)
    :param Q: array of degrees of each node (number of usable entries per row of ``twins_arr``)
    :param Ng: length of embedded dataframe/ number of nodes
    :param M: number of walkers starting on each node
    :param obs_per_year: sampling observations per year of data
    :return: integer array with ``len_df`` rows in which each column is one surrogate
        (a sequence of node indices, not data values)
    """
    nodes = np.arange(Ng - 2)
    Eminus1 = len_df - Ng
    # Start nodes are the multiples of obs_per_year, excluding 0 (must be >= obs_per_year).
    start_nodes = nodes[nodes % obs_per_year == 0]
    start_nodes = start_nodes[start_nodes >= obs_per_year]
    num_nodes = len(start_nodes)
    # X[step, walker, start_node] holds the node index of each walker at each step.
    X = np.zeros((Ng, M, num_nodes), dtype=int)
    X[0, :, :] = np.tile(start_nodes, (M, 1))

    rng = default_rng()  # unseeded numpy Generator: output is not reproducible across runs
    rand1 = rng.random((Ng, M, num_nodes))

    for i in range(1, Ng):
        # Begin each step from the previous position.
        X[i, :, :] = X[i - 1, :, :]
        QX = Q[X[i, :, :]]  # degrees of nodes where each walker is currently
        Ri = (
            rand1[i, :, :] * QX
        )  # uniform draw in [0, QX) for each walker
        Ri = Ri.astype(int)  # truncate to an integer column index into twins_arr
        # NOTE(review): walker positions index Q and twins_arr directly, while twins_list
        # stores twins shifted by len_df - Ng; confirm both use the same index space.
        mask = X[i, :, :] < (Ng - 1)  # relax this condition to let points jump off
        X[i, :, :][mask] = twins_arr[X[i, :, :], Ri][mask]  # update walker locations
        # Advance every walker that has not reached Ng - 1; the rest stay where they are.
        X[i, :, :][X[i, :, :] < (Ng - 1)] += 1
    # Keep only walkers whose last position is below Ng - 1; result has shape (Ng, n_kept).
    surrogates = X[:, X[-1, :, :] < (Ng - 1)]
    # back propagate initial times to get to full length of time series:
    # row k of initial_times is (first surrogate index - Eminus1) + k, for k = 0..Eminus1-1
    initial_times = np.add(
        np.tile(surrogates[0, :] - Eminus1, (Eminus1, 1)),
        np.mgrid[:Eminus1, : len(surrogates[0, :])][0],
    )
    return np.concatenate((initial_times, surrogates))  # each column is a surrogate!!


def run_surrogates(df, mae_dict, sp, n_surrogates=5, obs_per_year=12, thresholds=-1):
    """
    Generate surrogates for one species, trying embedding dimensions in ranked order.

    For each candidate embedding dimension (taken in order from ``mae_dict[sp]``, at most
    the first 25) the series is embedded, recurrence thresholds are tried in turn until
    the twin count heuristic is met (the last threshold is used if none meets it), and
    the network walk is run. The first candidate yielding at least ``n_surrogates``
    surrogates is used, and ``n_surrogates`` of them are drawn at random without replacement.

    :param df: input dataframe
    :param mae_dict: dictionary of lists; generate with :meth:`src.processing.embedding_dimension.mae_dictionary`
    :param sp: species for which to generate surrogates
    :param n_surrogates: number of surrogates to generate
    :param obs_per_year: period of seasonality of data
    :param thresholds: list of thresholds to try. Numbers between 5 and 20 recommended.
        The default (-1, or None) selects the built-in list.
    :return: embedded dataframe, surrogates (one per column), optimal embedding dimension
    :raises ValueError: if there are no candidate dimensions, no thresholds, or no
        candidate produced at least ``n_surrogates`` surrogates
    """
    if thresholds is None or (np.isscalar(thresholds) and thresholds == -1):
        thresholds = [12.5, 12, 11, 10, 9, 8, 7, 6, 5, 15, 16, 17, 18, 19, 20, 4]
    thresholds = list(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one value.")

    # Previously a fixed range(25) indexed mae_dict[sp]; with fewer than 25 ranked
    # dimensions (23 for the default max_dim=24) this ran off the end with IndexError.
    candidates = mae_dict[sp][:25]
    if not candidates:
        raise ValueError("No embedding dimensions available for {}".format(sp))

    len_df = len(df)
    for candidate in candidates:
        optE = candidate[0]
        x_e = pyEDM.Embed(dataFrame=df, E=optE, columns=sp)
        Ng = len(x_e)
        for thres in thresholds:
            r_mtx = recurrence_matrix(x_e, thres)
            twins_arr, Q = twins_list(
                len_df, Ng=Ng, r_mtx=r_mtx, obs_per_year=obs_per_year
            )
            # stop at the first threshold that produces enough twin links
            if np.sum(Q) - Ng > 10:
                break
        surrogate_slice = network_surrogates(
            len_df, twins_arr, Q, Ng, M=100 * n_surrogates, obs_per_year=obs_per_year
        )
        if surrogate_slice.shape[1] >= n_surrogates:
            print("E = {}".format(optE))
            print("Threshold = {}".format(thres))
            break
    else:
        # Every candidate was tried without success; fail with an explicit message
        # instead of letting rng.choice complain about the sample size.
        raise ValueError(
            "Could not generate {} surrogates for {} with any candidate "
            "embedding dimension.".format(n_surrogates, sp)
        )
    rng = default_rng()  # unseeded: the selected subset differs between runs
    return (
        x_e,
        rng.choice(a=surrogate_slice, size=n_surrogates, replace=False, axis=1),
        optE,
    )


def generate_all_surrogates(df, mae_dict, n_surrogates=5, obs_per_year=12):
    """
    Run :meth:`src.PLTS.networkccm3.run_surrogates` for every species column of ``df``.

    Every column after the first is treated as a species.
    :param df: input dataframe
    :param mae_dict: dictionary of lists; generate with :meth:`src.processing.embedding_dimension.mae_dictionary`
    :param n_surrogates: number of surrogates to use with phase-lock twin surrogate method
    :param obs_per_year: periodicity of time series data (12 for Guadalquivir data, 24 for Maizuru data)
    :return: embedded dataframe of the last species processed, array of surrogates with
        shape (species, len(df), n_surrogates), integer array of the chosen embedding dimensions
    """
    species_list = list(df.columns[1:])
    n_species = len(species_list)
    n_rows = len(df)
    surr_array = np.zeros((n_species, n_rows, n_surrogates))
    Es = np.zeros(n_species)
    for position in range(n_species):
        x_e, species_surrogates, species_E = run_surrogates(
            df,
            mae_dict,
            species_list[position],
            n_surrogates=n_surrogates,
            obs_per_year=obs_per_year,
        )
        surr_array[position] = species_surrogates
        Es[position] = species_E
    return x_e, surr_array, Es.astype(int)



def mae_dictionary(df, max_dim=24):
    """
    Creates dictionary with species as keys and, for each, a list of ``[E, MAE]`` pairs for
    embedding dimensions from 2 to max_dim, ordered from lowest to highest mean absolute
    error of a simplex forecast. See section 3.2 of report.

    Dimensions whose error is NaN (no usable predictions) are ranked last, so a NaN can
    never be picked as the best dimension.
    :param df: input dataframe of usual form (e.g. that given by :meth:`src.abundance_tools.initialise_dataset`);
        every column after the first is treated as a species
    :param max_dim: maximum embedding dimension to include in dictionary. Default is 24, copying Ushio paper
    :return: dictionary of optimal embedding dimension information
    """
    mae_dict = {}
    for sp in df.columns[1:]:
        ranked = []
        for E in range(2, max_dim + 1):
            # Library and prediction sets both span rows 1 .. len(df) - E.
            library_string = "1 {}".format(len(df) - E)
            preds = pyEDM.Simplex(
                dataFrame=df,
                columns=sp,
                target=sp,
                E=E,
                Tp=1,
                lib=library_string,
                pred=library_string,
                exclusionRadius=1,
            )
            errors = (preds["Predictions"] - preds["Observations"]).values
            ranked.append([E, np.nanmean(np.abs(errors))])
        # NaN compares False against everything, which makes a plain sort unreliable;
        # map NaN to +inf so such dimensions sink to the end. Stable for ties.
        ranked.sort(key=lambda pair: np.inf if np.isnan(pair[1]) else pair[1])
        mae_dict[sp] = ranked
    return mae_dict

def takeSecond(elem):
    """
    Sort-key helper for :meth:`src.processing.embedding_dimension.mae_dictionary`.
    :param elem: indexable object with an item at position 1
    :return: the item stored at index 1
    """
    second = elem[1]
    return second


def twin_surrogate_esn(X, Y, Nsurr, period):
    """
    Generate twin surrogates for a pair of series.

    The two series are placed in a dataframe with columns "Time", "X" and "Y", embedding
    dimensions are ranked with ``mae_dictionary`` (max_dim=24), and surrogates are
    produced with ``generate_all_surrogates``.
    :param X: first series
    :param Y: second series
    :param Nsurr: number of surrogates to generate per series
    :param period: passed on as ``obs_per_year``
    :return: tuple of two arrays (surrogates for X, surrogates for Y), each transposed
        so that every row is one surrogate
    """
    pair_df = pd.DataFrame()
    pair_df['X'] = X
    pair_df['Y'] = Y
    time_axis = range(len(pair_df))
    pair_df.insert(loc=0, column="Time", value=time_axis)

    ranked_dims = mae_dictionary(pair_df, max_dim=24)
    _embedded, surr_array, _dims = generate_all_surrogates(
        pair_df, ranked_dims, n_surrogates = Nsurr, obs_per_year = period
    )

    x_surrogates = surr_array[0,:,:].T
    y_surrogates = surr_array[1,:,:].T
    return x_surrogates, y_surrogates

def DeepESN_real(X, Y, n_data = 50, period = 1, verbose = False):
    """
    Score the observed pair (X, Y) and its twin surrogates.

    NOTE(review): ``compute_confidence`` and ``compute_lags`` are not defined in the
    visible part of this notebook; they must be available in the namespace when this
    function is called.
    :param X: first series (must expose ``.shape``)
    :param Y: second series
    :param n_data: number of surrogates to generate; also passed to ``compute_lags``
    :param period: seasonality passed to the surrogate generator
    :param verbose: forwarded to ``compute_confidence`` and ``compute_lags``
    :return: (xmapy_surr, ymapx_surr, xmapy, ymapx)
    """
    # Length of the series; read here but not used further down.
    N = X.shape[0]

    # Surrogates for both series.
    surrogate_X, surrogate_Y = twin_surrogate_esn(X, Y, Nsurr = n_data, period = period)

    # Scores on the surrogates first, then on the actual data.
    surrogate_scores = compute_confidence(surrogate_X, surrogate_Y, verbose)
    xmapy_surr, ymapx_surr = surrogate_scores

    observed_scores = compute_lags(X, Y, n_data, verbose)
    xmapy, ymapx = observed_scores

    return xmapy_surr, ymapx_surr, xmapy, ymapx

In [28]:
# Rank embedding dimensions 2..24 by simplex forecast error for every species column.
mae_dict = mae_dictionary(guada_B, max_dim=24)

In [29]:
# Generate 100 surrogates per species (12 observations per year). Only the surrogate
# array is kept. The generators inside are unseeded, so results vary between runs.
_, surrogates, _ = generate_all_surrogates(guada_B, mae_dict = mae_dict, n_surrogates=100, obs_per_year=12)

E = 13
Threshold = 15
E = 24
Threshold = 15
E = 3
Threshold = 17
E = 16
Threshold = 12.5
E = 9
Threshold = 12.5
E = 16
Threshold = 4
E = 12
Threshold = 4
E = 11
Threshold = 4
E = 5
Threshold = 4
E = 20
Threshold = 19
E = 5
Threshold = 4
E = 4
Threshold = 12.5
E = 4
Threshold = 4
E = 7
Threshold = 4
E = 13
Threshold = 9


In [31]:
# Species names only: column labels of guada_B with the 'Time' entry removed.
names = guada_B.columns.values.tolist()
names.remove('Time')
names

['Argyrosomus.regius',
 'Chelon.labrosus',
 'Crangon.crangon',
 'Cyprinus.carpio',
 'Dicentrarchus.punctatus',
 'Engraulis.encrasicolus',
 'Liza.aurata',
 'Liza.ramada',
 'Mesopodopsis.slabberi',
 'Neomysis.integer',
 'Palaemon.longirostris',
 'Palaemon.macrodactylus',
 'Pomatoschistus',
 'Rhopalophthalmus.tartessicus',
 'Sardina.pilchardus']

In [32]:
len(names)

15

In [33]:
# Write one '<species>_surr.npy' file per species; each saved array is the transposed
# slice for that species.
for species_idx, species_name in enumerate(names):
    out_name = species_name + '_surr'
    np.save(out_name, surrogates[species_idx].T)

In [34]:
surrogates.shape

(15, 218, 100)