In [1]:
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import multiprocessing
from tqdm import tqdm
import pickle as pkl
from copy import deepcopy
from typing import *

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import plotly
import plotly.express as px
import plotly.io as plt_io
import plotly.graph_objects as go


from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from utils import *
from dataloader import *

In [47]:
def sliding_window(data:np.ndarray, window_size:int, overlap:float) -> np.ndarray:
    """Cut `data` into fixed-length windows along its first axis.

    Args:
        data: Sequence/array that supports `len()` and slicing on axis 0.
        window_size: Number of consecutive rows per window.
        overlap: Overlap between successive windows, as a percentage of
            `window_size` (0 = back-to-back windows, 50 = half-overlapping).

    Returns:
        Array whose first axis indexes the windows. Only complete windows are
        kept; if `data` is shorter than `window_size` the result is an empty
        array of shape (0,).

    Raises:
        ValueError: If the overlap leaves a step of zero or less between
            window starts (overlap >= 100 %, or a non-positive window size).
    """
    step = window_size - int(window_size * (overlap / 100))
    if step <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently yields no windows at all; report it instead.
        raise ValueError(
            f"overlap={overlap}% with window_size={window_size} gives a step of "
            f"{step}; the step between windows must be positive"
        )
    # Stop at the last start index that still fits a complete window, so no
    # partial windows are built only to be thrown away.
    last_start = len(data) - window_size
    return np.array([data[start:start + window_size]
                     for start in range(0, last_start + 1, step)])


def make_window(data:dict, window_size:int, overlap:float):
    """Stack the values of `data` into one array and window it.

    The dict values are taken in key-iteration order and become the first
    axis of the stacked array (presumably one entry per time step -- confirm
    against the data loader). That array is then passed to `sliding_window`.

    Args:
        data: Mapping whose values are stacked.
        window_size: Window length forwarded to `sliding_window`.
        overlap: Overlap percentage forwarded to `sliding_window`.

    Returns:
        Whatever `sliding_window` returns for the stacked array.
    """
    stacked = np.array(list(data.values()))
    return sliding_window(stacked, window_size, overlap)

def unison_shuffled_copies(a, b):
    """Return shuffled copies of `a` and `b` that stay aligned with each other.

    One random permutation (drawn from NumPy's global random state) is applied
    to both inputs, so element i of the first result still corresponds to
    element i of the second.

    Args:
        a: Array indexable by an integer index array.
        b: Array of the same length as `a`.

    Returns:
        Tuple `(a_shuffled, b_shuffled)`.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(a) == len(b)
    order = np.random.permutation(len(a))
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b


def create_dataset(data:dict, subjects:List[str], window_size:int=60, overlap:float=0.0):
    """Build a shuffled, windowed tensor dataset from several subjects.

    Each subject's recording is windowed with `make_window`. Every window is
    labelled 0 when the subject id starts with 'I' and 1 otherwise. All
    windows are concatenated, shuffled together with their labels, and
    converted to torch tensors.

    Args:
        data: Mapping subject id -> recording accepted by `make_window`.
        subjects: Subject ids to include.
        window_size: Window length forwarded to `make_window`.
        overlap: Overlap percentage forwarded to `make_window`.

    Returns:
        Tuple `(X, Y)`: `X` is a float tensor of all windows, `Y` a long
        tensor with one label per window, in the same shuffled order.

    Raises:
        ValueError: If no subject yields at least one complete window
            (including when `subjects` is empty).
    """
    # `torch` is not imported in the notebook's visible import cell (it only
    # reaches this function if a star import happens to re-export it), so the
    # dependency is made explicit here.
    import torch

    windows_per_subject = []
    labels = []
    for subject_id in tqdm(subjects, total=len(subjects)):
        label = 0 if subject_id.startswith('I') else 1
        win_data = make_window(data[subject_id], window_size, overlap)
        if win_data.shape[0] == 0:
            # Recording shorter than one window: nothing to stack. Stacking
            # the empty placeholder array would fail on mismatched shapes.
            tqdm.write(f"Skipping {subject_id}: no complete window of size {window_size}")
            continue
        windows_per_subject.append(win_data)
        labels.extend([label] * win_data.shape[0])

    if not windows_per_subject:
        raise ValueError("create_dataset: no windows were produced for the given subjects")

    # Concatenate once instead of re-copying the growing array per subject.
    all_data = np.concatenate(windows_per_subject, axis=0)

    X, Y = unison_shuffled_copies(all_data, np.array(labels))
    X = torch.Tensor(X)
    Y = torch.Tensor(Y).long()

    return X, Y


def plot_pca(X:pd.DataFrame, labels=None):
    """Show a 3-D scatter plot of the first three principal components of `X`.

    `X` is standardised with `StandardScaler`, projected onto three principal
    components, and plotted with plotly, coloured by label. The explained
    variance ratios and singular values are printed.

    Args:
        X: Feature matrix, one row per sample.
        labels: One label per row of `X`, used for colouring. When omitted,
            the notebook-level variable `y` is used (legacy behaviour); pass
            labels explicitly so the plot cannot pick up a stale global.
    """
    if labels is None:
        # Legacy fallback: the function used to read the global `y` directly.
        labels = y

    X_normalized = StandardScaler().fit_transform(X)

    pca = PCA(n_components=3)
    X_pca = pca.fit_transform(X_normalized)

    print("Explained variance ratio 3 composants:" , pca.explained_variance_ratio_)
    print("Singular values 3 composants: ", pca.singular_values_)

    new_df = pd.DataFrame(X_pca, columns=['pca_1','pca_2','pca_3'])
    # Assign positionally; a pandas Series would otherwise be aligned on its
    # index rather than on row order.
    new_df['labels'] = np.asarray(labels)

    fig = px.scatter_3d(new_df, x='pca_1', y='pca_2', z='pca_3', color='labels', size_max=10)
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.show()


def plot_tsne(X:pd.DataFrame, labels=None, random_state=None):
    """Show a 3-D scatter plot of a t-SNE embedding of `X`.

    `X` is standardised with `StandardScaler`, embedded into three dimensions
    with `TSNE` (random initialisation), and plotted with plotly, coloured by
    label.

    Args:
        X: Feature matrix, one row per sample.
        labels: One label per row of `X`, used for colouring. When omitted,
            the notebook-level variable `y` is used (legacy behaviour).
        random_state: Seed forwarded to `TSNE`. The default `None` keeps the
            previous unseeded behaviour; pass an int for a reproducible plot.
    """
    if labels is None:
        # Legacy fallback: the function used to read the global `y` directly.
        labels = y

    X_normalized = StandardScaler().fit_transform(X)

    X_tsne = TSNE(n_components=3, learning_rate='auto',
                  init='random', random_state=random_state).fit_transform(X_normalized)

    new_df = pd.DataFrame(X_tsne, columns=['tsne_1','tsne_2','tsne_3'])
    # Assign positionally; a pandas Series would otherwise be aligned on its
    # index rather than on row order.
    new_df['labels'] = np.asarray(labels)

    fig = px.scatter_3d(new_df, x='tsne_1', y='tsne_2', z='tsne_3', color='labels')
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.show()

def plot_2d(component1, component2, y):
    """Display a dark-themed 2-D plotly scatter of two components.

    Args:
        component1: Values for the x axis.
        component2: Values for the y axis.
        y: Per-point values used as marker colour (a colour bar is shown).
    """
    marker_style = dict(
        size=10,
        color=y,          # colour each point by its label/value
        showscale=True,
        line_width=1,
    )
    points = go.Scatter(x=component1, y=component2, mode='markers', marker=marker_style)

    fig = go.Figure(data=points)
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100), width=700, height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

def plot_3d(component1, component2, component3, y, image_path:Optional[str]="test.png"):
    """Display a 3-D plotly scatter of three components and optionally save it.

    Args:
        component1: Values for the x axis.
        component2: Values for the y axis.
        component3: Values for the z axis.
        y: Per-point values used as marker colour ('Plotly3' colour scale).
        image_path: Where to write a static image of the figure. Defaults to
            "test.png" (the previously hard-coded location); pass `None` to
            skip the export. Static export goes through plotly's
            `write_image`, which needs an image-export backend installed
            (see the plotly static image export docs).
    """
    marker_style = dict(
        size=5,
        color=y,                # one colour value per point
        colorscale='Plotly3',
        opacity=1,
        line_width=1,
    )
    fig = go.Figure(data=[go.Scatter3d(
        x=component1,
        y=component2,
        z=component3,
        mode='markers',
        marker=marker_style,
    )])
    fig.update_layout(margin=dict(l=100,r=100,b=100,t=100),width=1000,height=700)

    if image_path is not None:
        fig.write_image(image_path)

    fig.show()

In [2]:
# Load the raw recordings from disk.
# NOTE(review): `load_pickle` is not defined in this notebook -- presumably it
# comes from the `utils` star import. If it wraps pickle.load, only point it at
# trusted files (unpickling can execute arbitrary code).
raw_data = load_pickle("../data/learning/raw_data.pkl")

In [69]:
# Subject ids present in the raw data ('Chir_*' and 'Intern_*' entries).
list(raw_data.keys())

['Chir_01',
 'Chir_03',
 'Chir_04',
 'Chir_05',
 'Intern_02',
 'Intern_03',
 'Intern_04',
 'Intern_05',
 'Intern_06',
 'Intern_07',
 'Intern_08',
 'Intern_09',
 'Intern_10',
 'Intern_11',
 'Intern_12',
 'Intern_13',
 'Intern_14',
 'Intern_15',
 'Intern_16',
 'Intern_17',
 'Intern_19',
 'Intern_21',
 'Intern_22',
 'Intern_23']

In [68]:
# Number of subjects in the raw data.
len(raw_data.keys())

24

In [89]:
def reshape_temporal_landmarks(data:dict, ex_id:str, ws:int, overlap:float,
                               max_subjects:Optional[int]=16):
    """Window one exercise per subject into a long-format frame plus labels.

    For each subject, `data[subject_id][ex_id]` is windowed with `make_window`.
    Every window becomes `ws` rows in the output frame, carrying the feature
    columns `feat_0 ... feat_{n-1}`, a 1-based `time` column, and an `id`
    column that is unique per window across all subjects. This is the
    id/sort-column layout that the notebook later passes to tsfresh's
    `extract_features`.

    Args:
        data: Mapping subject id -> mapping exercise id -> recording.
        ex_id: Exercise to extract for every subject.
        ws: Window size forwarded to `make_window`.
        overlap: Overlap percentage forwarded to `make_window`.
        max_subjects: Only the first `max_subjects` keys of `data` are
            processed. Defaults to 16 (the previously hard-coded cut-off);
            pass `None` to process every subject.

    Returns:
        Tuple `(all_df, y)`: the concatenated long-format DataFrame (empty if
        nothing could be windowed) and an array with one label per window
        (1 when the subject id starts with 'C', else 0), ordered by `id`.
    """
    frames, y = [], []
    start_id = 0
    progress = tqdm(list(data.keys())[:max_subjects])
    for subject_id in progress:
        progress.set_postfix_str(subject_id)
        label = 1 if subject_id.startswith('C') else 0

        # Best-effort per subject (e.g. the exercise may be missing), but the
        # reason is reported instead of being silently swallowed, and
        # KeyboardInterrupt/SystemExit are no longer caught.
        try:
            data_win = make_window(data[subject_id][ex_id], ws, overlap)
        except Exception as err:
            tqdm.write(f"Skipping {subject_id}: {type(err).__name__}: {err}")
            continue

        if data_win.ndim != 3 or data_win.shape[0] == 0:
            tqdm.write(f"Skipping {subject_id}: no complete window of size {ws}")
            continue

        n_windows, _, n_features = data_win.shape
        columns = ['feat_' + str(idx) for idx in range(n_features)]
        for win_idx in range(n_windows):
            df = pd.DataFrame(data_win[win_idx], columns=columns)
            df['time'] = list(range(1, len(df) + 1))
            df['id'] = start_id + win_idx
            frames.append(df)
            y.append(label)
        # Advance by the actual window count rather than by the loop variable,
        # which would be stale or unbound when a subject has no windows.
        start_id += n_windows

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all previously collected rows on every iteration.
    all_df = pd.concat(frames) if frames else pd.DataFrame()
    return all_df, np.array(y)

In [90]:
# Build the long-format frame for exercise 'ex2' with a window size of 300 and
# 0 % overlap. NOTE: `y` is also read as a global by plot_pca / plot_tsne, so
# re-running this cell with other settings changes what those plots colour by.
all_df, y = reshape_temporal_landmarks(raw_data, 'ex2', 300, 0)

 25%|██▌       | 4/16 [00:00<00:00, 32.25it/s]

Chir_01
Chir_03
Chir_04
Chir_05
Intern_02
Intern_03


 50%|█████     | 8/16 [00:00<00:00, 26.53it/s]

Intern_04
Intern_05
Intern_06
Intern_07
Intern_08


 69%|██████▉   | 11/16 [00:00<00:00, 23.91it/s]

Intern_09
Intern_10
Intern_11


 88%|████████▊ | 14/16 [00:00<00:00, 18.16it/s]

Intern_12
Intern_13


100%|██████████| 16/16 [00:00<00:00, 17.16it/s]


In [91]:
# Sanity check: rows = number of windows x window size, and one label per window.
all_df.shape, y.shape

((81000, 65), (270,))

In [92]:
# Preview the long-format frame (feature columns plus the `time` and `id` columns).
all_df

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_55,feat_56,feat_57,feat_58,feat_59,feat_60,feat_61,feat_62,time,id
0,0.000000,0.000000,0.000000e+00,0.045132,-0.005488,0.013965,0.080902,-0.000293,0.013731,0.102230,...,0.051716,-0.033197,0.083343,0.054235,-0.018410,0.075226,0.051311,-0.003629,1,0
1,0.341097,0.482260,1.160808e-07,0.386231,0.476854,0.013634,0.422258,0.482156,0.013251,0.443869,...,0.533569,-0.034260,0.424959,0.535639,-0.019661,0.416924,0.532511,-0.005024,2,0
2,0.142373,0.462454,1.797099e-07,0.175935,0.456277,0.000703,0.212183,0.461059,-0.004905,0.235571,...,0.511893,-0.035350,0.212880,0.513820,-0.027894,0.218572,0.510433,-0.020706,3,0
3,0.140336,0.460639,3.247610e-07,0.177676,0.455315,0.002186,0.213493,0.461221,-0.003543,0.236771,...,0.510568,-0.035087,0.211823,0.512783,-0.025741,0.216675,0.510467,-0.017669,4,0
4,0.140051,0.460697,3.233133e-07,0.177617,0.455348,0.001828,0.213412,0.461075,-0.004040,0.236531,...,0.510675,-0.033335,0.212062,0.512874,-0.023713,0.217424,0.510561,-0.015497,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.192260,0.490726,4.813634e-08,0.227720,0.490355,-0.005709,0.264567,0.493852,-0.006257,0.290772,...,0.504900,0.006445,0.299848,0.503581,0.012227,0.305049,0.501832,0.016824,296,269
296,0.199009,0.492124,4.593207e-08,0.230971,0.489819,-0.004815,0.265222,0.490576,-0.006299,0.291107,...,0.508865,-0.000560,0.297445,0.506564,0.003791,0.297707,0.504447,0.007309,297,269
297,0.199222,0.493813,7.473941e-10,0.233727,0.484288,-0.004743,0.268636,0.485019,-0.005532,0.292582,...,0.520337,0.001303,0.297118,0.513438,0.006669,0.297583,0.508700,0.011238,298,269
298,0.196433,0.494444,-7.417771e-08,0.234847,0.482687,-0.006016,0.272388,0.481969,-0.005945,0.296240,...,0.520565,0.001163,0.296733,0.512557,0.005165,0.294620,0.506465,0.008735,299,269


In [93]:
# NOTE(review): `extraction_settings` is assigned here but never passed to
# extract_features below, which uses MinimalFCParameters() instead. Either pass
# it via default_fc_parameters or drop it, so readers are not led to believe
# the comprehensive feature set was computed.
extraction_settings = ComprehensiveFCParameters()

# Other available presets: MinimalFCParameters, EfficientFCParameters

# One row of features per window id; rows within a window are ordered by `time`,
# and `impute` is applied to the extracted feature matrix.
X = extract_features(all_df, column_id='id', column_sort='time',
                    default_fc_parameters=MinimalFCParameters(),
                    impute_function=impute, n_jobs=10)

# Filter the extracted features against the per-window labels `y`.
X_filtered = select_features(X, y, n_jobs=10)

Feature Extraction: 100%|██████████| 50/50 [00:01<00:00, 41.30it/s]


In [94]:
# Feature matrix shape before and after feature selection.
X.shape, X_filtered.shape

((270, 630), (270, 387))

In [82]:
# Alphabetically sorted names of the selected features.
var = sorted(X_filtered.columns)

In [83]:
# Put the number of selected features in front of the sorted feature names.
# The list is rebuilt from X_filtered rather than mutated in place, so
# re-running this cell cannot prepend a second count.
var = [len(X_filtered.columns)] + sorted(X_filtered.columns)

In [84]:
# Show the feature count followed by the selected feature names.
# NOTE(review): the recorded output for this cell reports 236 features while
# X_filtered.shape reports 387 columns; the execution counts are out of order,
# so this output looks stale -- re-run the notebook top to bottom.
var

[236,
 'feat_0__mean',
 'feat_0__median',
 'feat_0__minimum',
 'feat_0__root_mean_square',
 'feat_0__sum_values',
 'feat_10__absolute_maximum',
 'feat_10__maximum',
 'feat_10__mean',
 'feat_10__median',
 'feat_10__minimum',
 'feat_10__root_mean_square',
 'feat_10__sum_values',
 'feat_11__maximum',
 'feat_12__mean',
 'feat_12__median',
 'feat_12__minimum',
 'feat_12__root_mean_square',
 'feat_12__sum_values',
 'feat_13__absolute_maximum',
 'feat_13__maximum',
 'feat_13__mean',
 'feat_13__median',
 'feat_13__minimum',
 'feat_13__root_mean_square',
 'feat_13__sum_values',
 'feat_15__mean',
 'feat_15__median',
 'feat_15__minimum',
 'feat_15__root_mean_square',
 'feat_15__sum_values',
 'feat_16__mean',
 'feat_16__median',
 'feat_16__root_mean_square',
 'feat_16__sum_values',
 'feat_18__mean',
 'feat_18__median',
 'feat_18__minimum',
 'feat_18__root_mean_square',
 'feat_18__sum_values',
 'feat_19__absolute_maximum',
 'feat_19__maximum',
 'feat_19__mean',
 'feat_19__median',
 'feat_19__root_m

In [85]:
# Write the entries of `var` to a text file, one per line, then confirm.
with open(r'sales.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in var)
    print('Done')

Done


In [95]:
# Standardise the selected features, project them with PCA, and plot the first
# three components coloured by label.
X_ = X_filtered

scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_)

# Keep as many components as the data allows (min of samples and features).
pca = PCA(n_components=min(X_.shape[0], X_.shape[1]))
# fit_transform already fits the model; a separate pca.fit() beforehand only
# repeated the decomposition.
X_pca = pca.fit_transform(X_normalized)

plot_3d(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], y)