In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import torch
import random
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.nn.utils import clip_grad_norm
from tqdm import tqdm

from config import TRAIN_INFO, AC_DATA, MU, STD, CV_SIZE
from utils import T, scale

In [None]:
# Draw every subsequent matplotlib figure with the built-in dark theme.
plt.style.use('dark_background')

In [None]:
# Per-period metadata table; the columns index_start, chunk_length, t_start and
# t_end are read from it further down.
train_info = pd.read_csv(TRAIN_INFO)
# np.load on an .npz archive returns a lazy NpzFile that keeps the file handle
# open; the context manager closes it once the array has been read into memory.
# NOTE(review): assumes AC_DATA is an .npz archive (string-key access suggests so) — confirm.
with np.load(AC_DATA) as npz_archive:
    ac_data = npz_archive['acoustic_data']

In [None]:
# ac_data = scale(ac_data)

In [None]:
# Number of entries in the loaded acoustic array.
len(ac_data)

In [None]:
# d = np.load(AC_DATA)['acoustic_data']

In [None]:
# d.std()

In [None]:
# Display the metadata table read from TRAIN_INFO.
train_info

In [None]:
def get_quake_period(i):
    """Return the acoustic samples and time-to-failure ramp of period ``i``.

    ``i`` is used as a label into the ``train_info`` columns. The acoustic slice
    is taken from the module-level ``ac_data``; the time-to-failure values are a
    float32 ``np.linspace`` from the period's ``t_start`` to its ``t_end`` with
    ``chunk_length`` points.

    Returns:
        ``(ac_data_period, ttf_data_period)``.
    """
    start = train_info['index_start'][i]
    n_samples = train_info['chunk_length'][i]
    ttf_first = train_info['t_start'][i]
    ttf_last = train_info['t_end'][i]

    stop = start + n_samples
    signal = ac_data[start:stop]
    ttf_ramp = np.linspace(ttf_first, ttf_last, n_samples, dtype=np.float32)
    return signal, ttf_ramp

In [None]:
# Cut every period listed in train_info into consecutive, non-overlapping
# windows of `split_length` samples. Each window is labelled with the
# time-to-failure at its last sample. Samples left over at the end of a period
# (fewer than one full window) are dropped.
seqs = []       # acoustic windows, one entry per training sample
ttfs = []       # time-to-failure at the last sample of each window
y_periods = []  # not filled by this cell; kept so the name stays defined
split_length = 150000  # samples per window (hoisted: it never changes between periods)

for index, period in train_info.iterrows():
    # The row values are cast to int before being used as slice bounds and counts.
    ix_start = int(period['index_start'])
    chunk_length = int(period['chunk_length'])
    period_data = ac_data[ix_start: ix_start + chunk_length]
    # Time-to-failure is interpolated linearly between the period's end points.
    period_ttf = np.linspace(period['t_start'], period['t_end'], chunk_length, dtype=np.float32)

    period_splits = []
    period_ys = []
    for i in range(chunk_length // split_length):
        window_end = (i + 1) * split_length
        period_splits.append(period_data[window_end - split_length: window_end])
        # Index the label directly instead of slicing a whole window to take [-1].
        period_ys.append(period_ttf[window_end - 1])

    # np.mean([]) emits a RuntimeWarning; report nan explicitly for periods
    # shorter than one window.
    mean_ttf = np.mean(period_ys) if period_ys else float('nan')
    print('    ', len(period_splits), mean_ttf)
    seqs.extend(period_splits)
    ttfs.extend(period_ys)

#### Scale x

In [None]:
# Apply utils.scale to every window; tqdm shows progress over the windows.
scaled_seqs = list(map(scale, tqdm(seqs)))

In [None]:
# scaled_seqs = []
# for x in tqdm(seqs):
#     max_val = max(x)
#     min_val = min(x)
#     scaled = (x-min_val)/(max_val-min_val)
#     scaled_seqs.append(scaled)

In [None]:
# seqs = scaled_seqs

In [None]:
# plt.hist(seqs[0])

#### Rescale y

In [None]:
# Compress the time-to-failure targets with log(1 + t).
ys = [np.log(1+x) for x in ttfs] # to undo : [np.exp(x)-1 for x in ys]

In [None]:
# Histogram of the log-scaled targets.
plt.hist(ys)

In [None]:
# From here on `ttfs` refers to the log-scaled targets. Because the name is
# rebound, re-running the log cell above after this one would apply the
# transform a second time.
ttfs = ys

#### Bucket

In [None]:
# Left edges of the target buckets: 0, 0.5, 1.0, ... up to (but excluding) the
# largest target value.
BUCKET_WIDTH = 0.5
buckets = np.arange(0, max(ttfs), BUCKET_WIDTH)

In [None]:
# Show the bucket edges.
buckets

In [None]:
# Map every target to the index of its bucket. With the default right=False,
# np.digitize returns i such that buckets[i-1] <= value < buckets[i], so values
# at or above the first edge get indices starting from 1.
classification = np.digitize(ttfs, bins=buckets)

In [None]:
# Shift the 1-based np.digitize indices to 0-based class labels. The labels are
# recomputed from `ttfs` and `buckets` rather than decremented in place, so
# re-running this cell cannot shift them a second time.
classification = np.digitize(ttfs, buckets) - 1

In [None]:
# Histogram of the class labels, to eyeball the class imbalance.
plt.hist(classification)

In [None]:
# Number of labelled samples.
len(classification)

In [None]:
# Inverse-frequency class weights, expressed relative to class 1 (so class 1
# gets weight 1.0). Gotcha: if no sample falls in class 1, the reference
# frequency is 0 and every weight becomes 0.
ref_freq = (classification == 1).mean()
wgts = {
    label: ref_freq / (classification == label).mean()
    for label in np.unique(classification)
}

In [None]:
# Show the class -> weight mapping.
wgts

In [None]:
# Persist the class -> weight dict to the working directory.
pd.to_pickle(wgts, 'class_weights.pkl')

In [None]:
# Sorted array of the distinct class labels present in `classification`.
classes=np.unique(classification)

In [None]:
# Spot-check: print the weight assigned to the first sample's class, then stop.
for x in classification:
    print(wgts[x])
    break

In [None]:
# l1_wgts = [wgts[i] for i in classification]

In [None]:
# pd.to_pickle(l1_wgts, 'l1_wgts.pkl')

In [None]:
# import random

In [None]:
# xs = []
# ys = []
# s = 250
# for i in list(classes):
#     s = int(0.5*s)
#     print(s)
#     ixs=np.where(classification==int(i))[0]
#     ixs = random.sample(list(ixs), s)
#     ixs = list(ixs)
#     x = [seqs[a] for a in ixs]
#     y = [ttfs[a] for a in ixs]
#     xs.extend(x)
#     ys.extend(y)

In [None]:
# seqs = xs
# ttfs = ys

In [None]:
# Histogram of the targets currently bound to `ttfs`.
plt.hist(ttfs)

#### Train/CV split

In [None]:
# Sanity check: these lengths should agree, because samples are paired by
# position when the train/CV sets are built below.
len(seqs), len(scaled_seqs), len(ttfs)

In [None]:
def get_idxs(n_samples=None, cv_size=None, seed=None):
    """Randomly split sample indices into train and cross-validation sets.

    Args:
        n_samples: number of samples to split; defaults to ``len(seqs)``.
        cv_size: fraction of samples held out for CV; defaults to ``CV_SIZE``.
        seed: optional seed for a private RNG, making the split reproducible.
            When ``None`` the module-level ``random`` state is used, exactly
            as before.

    Returns:
        ``(train_idxs, cv_idxs)``: two disjoint lists that together cover
        ``range(n_samples)``. ``train_idxs`` is in ascending order; ``cv_idxs``
        is in sampling order.
    """
    if n_samples is None:
        n_samples = len(seqs)
    if cv_size is None:
        cv_size = CV_SIZE

    idxs = list(np.arange(n_samples))
    n_cv = int(len(idxs) * cv_size)
    sampler = random if seed is None else random.Random(seed)
    cv_idxs = sampler.sample(idxs, n_cv)

    # Set membership is O(1); testing against the list made the split quadratic.
    held_out = set(cv_idxs)
    train_idxs = [ix for ix in idxs if ix not in held_out]
    return train_idxs, cv_idxs

In [None]:
from utils import scale

In [None]:
# train_idxs, cv_idxs = get_idxs()
# train_set = [(T(scale(seqs[a])), T(scale(ttfs[a])) for a in train_idxs]
# cv_set = [(T(seqs[a]), T(ttfs[a])) for a in cv_idxs]

In [None]:
# train_idxs, cv_idxs = get_idxs()
# train_set = [(T(seqs[a]), T(ttfs[a])) for a in train_idxs]
# cv_set = [(T(seqs[a]), T(ttfs[a])) for a in cv_idxs]

In [None]:
def _to_examples(idxs):
    """Pair each scaled window with its int64 class label, both wrapped by T."""
    return [
        (T(scaled_seqs[a]), T(classification[a].astype(np.int64)))
        for a in idxs
    ]


# Draw a fresh random split and materialise both sets the same way.
train_idxs, cv_idxs = get_idxs()
train_set = _to_examples(train_idxs)
cv_set = _to_examples(cv_idxs)

In [None]:
# train_idxs, cv_idxs = get_idxs()
# train_set = [(T(seqs[a].astype(np.float32)), T(scaled_seqs[a]), T(ttfs[a])) for a in train_idxs]
# cv_set = [(T(seqs[a].astype(np.float32)), T(scaled_seqs[a]), T(ttfs[a])) for a in cv_idxs]

In [None]:
# Per-sample weights for the training split: look up each training sample's
# class label, then the weight of that class.
train_classes = []
l1_wgts = []
for sample_ix in train_idxs:
    label = classification[sample_ix]
    train_classes.append(label)
    l1_wgts.append(wgts[label])

In [None]:
# Persist the per-sample training weights (same order as train_idxs).
pd.to_pickle(l1_wgts, 'l1_wgts.pkl')

In [None]:
# Number of training examples.
len(train_set)

In [None]:
# Should equal len(train_set): one example is built per training index.
len(train_idxs)

In [None]:
# Persist both splits to the working directory.
pd.to_pickle(train_set, 'train_set.pkl')
pd.to_pickle(cv_set, 'cv_set.pkl')

In [None]:
# Inspect the first training example, an (input, label) pair.
train_set[0]

In [None]:
# Round-trip check: reload the training set that was pickled above.
# Unpickling executes arbitrary code, so only load files this notebook produced.
q = pd.read_pickle('train_set.pkl')

In [None]:
# Collect the second element (the label) of every reloaded example into one tensor.
# Note: this rebinds `ys`, which earlier held the log-scaled targets.
ys = torch.stack(tuple(example[1] for example in q))

In [None]:
# Mean class label of the reloaded training set. torch.mean only accepts
# floating-point (or complex) tensors, so the labels are cast first.
# NOTE(review): assumes T() yields integer tensors for the int64 labels — confirm in utils.T.
ys.float().mean()