In [1]:
from pathlib import Path

In [50]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, DataLoader

In [53]:
# Seed every RNG the notebook draws from, so that Restart & Run All gives the
# same results each time.
seed = 1
np.random.seed(seed)
# DataLoader(shuffle=True) takes its permutation from torch's global RNG,
# which np.random.seed does not touch.
torch.manual_seed(seed)

In [3]:
# Directory holding the competition CSV files.
# Alternative location (an `input` folder beside the working directory),
# presumably for running on a hosted kernel:
# ROOT = Path.cwd().parent/'input'
ROOT = Path.home().joinpath('data', 'careercon2019')

In [4]:
# Input files, all expected directly under ROOT.
SAMPLE, TRAIN, TARGET, TEST = (
    ROOT/'sample_submission.csv',
    ROOT/'X_train.csv',
    ROOT/'y_train.csv',
    ROOT/'X_test.csv',
)

# Key columns of the measurement tables.
ID_COLS = ['series_id', 'measurement_number']

# Columns (and dtypes) to load from the X_* files: unsigned 32-bit ids
# followed by single-precision sensor channels.
x_cols = {name: np.uint32 for name in ID_COLS}
x_cols.update({
    name: np.float32 for name in (
        'orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W',
        'angular_velocity_X', 'angular_velocity_Y', 'angular_velocity_Z',
        'linear_acceleration_X', 'linear_acceleration_Y', 'linear_acceleration_Z',
    )
})

# Columns (and dtypes) to load from the target file.
y_cols = {'series_id': np.uint32, 'group_id': np.uint32, 'surface': str}

In [5]:
# Load only the declared columns, with explicit dtypes, from each CSV.
x_trn, x_tst = (
    pd.read_csv(path, usecols=x_cols.keys(), dtype=x_cols)
    for path in (TRAIN, TEST)
)
y_trn = pd.read_csv(TARGET, usecols=y_cols.keys(), dtype=y_cols)

In [6]:
def add_euler_angles(df):
    """Append `euler_X`, `euler_Y` and `euler_Z` columns to `df`.

    The angles are computed by `quaternion_to_euler` from the
    `orientation_X/Y/Z/W` columns. `df` is modified in place and is also
    returned, so the result can be assigned or chained.
    """
    quat = [df[f'orientation_{s}'] for s in 'XYZW']
    df['euler_X'], df['euler_Y'], df['euler_Z'] = quaternion_to_euler(*quat)
    return df

In [7]:
def quaternion_to_euler(x, y, z, w):
    """Convert quaternion components to Euler angles in radians.

    All arithmetic is element-wise, so the four components may be scalars or
    equally shaped arrays / Series.

    Returns a `(roll, pitch, yaw)` tuple, i.e. the angles named X, Y and Z
    elsewhere in this notebook.
    """
    roll = np.arctan2(2.0*(w*x + y*z), 1.0 - 2.0*(x*x + y*y))

    # Clipping keeps arcsin defined when the argument leaves [-1, 1].
    pitch = np.arcsin(np.clip(2.0*(w*y - z*x), -1, 1))

    yaw = np.arctan2(2.0*(w*z + x*y), 1.0 - 2.0*(y*y + z*z))

    return roll, pitch, yaw

In [8]:
def startswith(df, prefix):
    """Return, as a list, the column names of `df` that begin with `prefix`."""
    columns = df.columns
    matches = columns.str.startswith(prefix)
    return columns[matches].tolist()

In [10]:
# Count the distinct series in each split; later cells use these as sizes.
trn_sz = x_trn['series_id'].nunique()
tst_sz = x_tst['series_id'].nunique()
print(f'Number of series: {trn_sz} train, {tst_sz} test')

Number of series: 3810 train, 3816 test


In [11]:
# Offset the test ids by the number of train rows, presumably so they cannot
# collide with train ids once both frames are stacked and grouped by series_id.
# NOTE(review): this mutates x_tst in place, so running the cell twice shifts
# the ids twice; reload x_tst before re-running.
x_tst['series_id'] += len(x_trn)

In [12]:
# Stack train rows on top of test rows under a fresh 0..n-1 index.
data = pd.concat([x_trn, x_tst], axis=0, ignore_index=True)

In [13]:
# Derive the euler_X/Y/Z columns from the orientation quaternion columns.
data = add_euler_angles(data)

In [14]:
# The measurement counter and the raw quaternion columns are dropped here;
# the quaternion is by now represented by the Euler-angle columns.
unused_cols = ['measurement_number', *startswith(data, 'orient')]
data = data.drop(columns=unused_cols)

In [18]:
# Peek at five random rows, transposed so each feature is shown on its own line.
data.sample(5).T

Unnamed: 0,143195,618716,802766,866678,761708
series_id,1118.0,488703.0,490141.0,490640.0,489820.0
angular_velocity_X,-0.017536,0.25055,0.019738,-0.006506,-0.091019
angular_velocity_Y,-0.056279,0.003368,-0.16634,-0.016665,-0.015642
angular_velocity_Z,0.031492,-0.076702,0.57619,0.045611,0.15972
linear_acceleration_X,-1.5482,-0.086836,0.39684,1.6791,-0.65016
linear_acceleration_Y,1.4046,4.7831,2.9783,3.4683,6.7599
linear_acceleration_Z,-10.587,-10.841,-9.5308,-5.0052,-7.2218
euler_X,2.839165,2.841314,2.84374,2.843757,2.840602
euler_Y,-0.017802,-0.011658,-0.015219,-0.014885,-0.014945
euler_Z,3.046572,2.08197,-0.759491,-0.251047,2.313972


In [19]:
# Column names of each feature family, selected by prefix.
euler_cols, linear_cols, angular_cols = (
    startswith(data, prefix) for prefix in ('euler', 'linear', 'angular')
)

In [20]:
def abs_fft(arr):
    """Return the magnitudes of the real-input FFT (`np.fft.rfft`) of `arr`."""
    spectrum = np.fft.rfft(arr)
    return np.abs(spectrum)

In [21]:
def zero_mean(x):
    """Return `x` shifted so that its mean is zero."""
    centre = x.mean()
    return x - centre

In [22]:
def zscore(x):
    """Standardise `x`: subtract its mean, then divide by its `std()`.

    The spread comes from `x.std()` itself, so whatever default that method
    has for the type of `x` applies.
    """
    centred = x - x.mean()
    return centred/x.std()

In [23]:
# Group rows by series; the following cells derive per-series features from it.
# NOTE(review): `groups` keeps referring to the frame grouped here even after
# `data` is rebound in the next cell. That rebound frame has no `series_id`
# column, so re-running this cell afterwards will fail.
groups = data.groupby('series_id')

In [24]:
# Per-series features: step-to-step change of the Euler angles (the first
# step of every series has no predecessor and becomes 0) and mean-centred
# linear / angular channels.
euler_deltas = groups[euler_cols].diff().fillna(0)
linear_centred = groups[linear_cols].transform(zero_mean)
angular_centred = groups[angular_cols].transform(zero_mean)
data = pd.concat(
    [euler_deltas, linear_centred, angular_centred], axis=1, sort=False)

In [25]:
# FFT magnitudes of every linear / angular channel, computed per series.
sensor_groups = groups[linear_cols + angular_cols]
per_series_fft = sensor_groups.apply(lambda df: df.apply(abs_fft, axis=0))
# Drop the series_id level that the group-wise apply adds to the index.
fft_data = per_series_fft.reset_index('series_id', drop=True)

In [30]:
# Rows per series in the raw data, as assumed by the reshapes that follow.
seq_len = 128
# A real-input FFT over seq_len samples yields seq_len//2 + 1 bins.
fft_seq_len = 1 + seq_len // 2

In [35]:
# One (seq_len, n_features) block per series. This assumes each series
# occupies seq_len consecutive rows of `data`.
raw_shape = (trn_sz + tst_sz, seq_len, data.shape[1])
raw_arr = data.values.reshape(raw_shape)

In [36]:
# One (fft_seq_len, n_features) block per series. This assumes each series
# occupies fft_seq_len consecutive rows of `fft_data`.
fft_shape = (trn_sz + tst_sz, fft_seq_len, fft_data.shape[1])
fft_arr = fft_data.values.reshape(fft_shape)

In [37]:
# Sanity check: both arrays should share the same leading (series) dimension.
print(f'Prepared datasets shapes: {raw_arr.shape} raw, {fft_arr.shape} fft')

Prepared datasets shapes: (7626, 128, 9) raw, (7626, 65, 6) fft


In [55]:
# Encode surfaces as integers shifted up by one, so that 0 is left free as
# the placeholder label for the test series appended after the train ones.
enc = LabelEncoder().fit(y_trn['surface'])
trn_labels = enc.transform(y_trn['surface']) + 1
tst_labels = np.zeros(tst_sz, dtype=trn_labels.dtype)
target = np.concatenate([trn_labels, tst_labels])
assert len(target) == trn_sz + tst_sz

In [51]:
def create_datasets(data, target, train_size, valid_pct=0.1, seed=None):
    """Build train / validation / test `TensorDataset`s.

    `data` is a `(raw, fft)` pair of equally long arrays. The first
    `train_size` rows of `raw`, `fft` and `target` are split at random into
    train and validation parts (`valid_pct` is passed to `train_test_split`
    as `test_size`, `seed` as `random_state`). The remaining rows form the
    test set.

    Returns `(trn_ds, val_ds, tst_ds)`. Every dataset holds three tensors,
    `(raw, fft, target)`, cast to float, float and long respectively.
    """
    raw, fft = data
    assert len(raw) == len(fft)

    def bundle(raw_part, fft_part, target_part):
        # Same tensor conversion for every split.
        return TensorDataset(
            torch.tensor(raw_part).float(),
            torch.tensor(fft_part).float(),
            torch.tensor(target_part).long())

    n_labelled = train_size
    trn_idx, val_idx = train_test_split(
        np.arange(n_labelled), test_size=valid_pct, random_state=seed)

    labelled = raw[:n_labelled], fft[:n_labelled], target[:n_labelled]
    held_out = raw[n_labelled:], fft[n_labelled:], target[n_labelled:]

    trn_ds = bundle(*(part[trn_idx] for part in labelled))
    val_ds = bundle(*(part[val_idx] for part in labelled))
    tst_ds = bundle(*held_out)
    return trn_ds, val_ds, tst_ds

In [52]:
def create_loaders(data, bs=128, jobs=0):
    """Wrap a `(train, valid, test)` dataset triple in `DataLoader`s.

    All loaders use batch size `bs` and `jobs` worker processes. Only the
    training loader shuffles; validation and test keep the dataset order.

    Returns `(trn_dl, val_dl, tst_dl)`.
    """
    trn_ds, val_ds, tst_ds = data
    common = dict(batch_size=bs, num_workers=jobs)
    return (
        DataLoader(trn_ds, shuffle=True, **common),
        DataLoader(val_ds, shuffle=False, **common),
        DataLoader(tst_ds, shuffle=False, **common),
    )

In [56]:
# Split the labelled series into train / validation and keep the rest as test.
datasets = create_datasets(
    data=(raw_arr, fft_arr),
    target=target,
    train_size=trn_sz,
    seed=seed)