# Context
Phase three of this project is data cleanup and dimensionality reduction.\
This notebook drives this process for the data that went through the
transformations in PH2.

In [4]:
# Set project's environment variables
import os
import sys
from dotenv import load_dotenv
load_dotenv(dotenv_path="../project.env")
sys.path.append(os.environ["PYTHONPATH"])

import pickle

In [5]:
# Import project-wide and PH2 specific variables and functions
import superheader as sup
import PH3header as ph3

In [6]:
PH3_IS_W2 = True

if PH3_IS_W2:
  prePH3_ROOT = sup.PH2_DATA_ROOT
  PH3_SUB_DATA_ROOT = os.path.join(sup.PH3_DATA_ROOT, sup.PH3_W2_CODE)
  PH3_SUB_BINGEN_ROOT = os.path.join(sup.PH3_BINGEN_ROOT, sup.PH3_W2_CODE)
  PH3_SUB_MEDIAGEN_ROOT = os.path.join(sup.PH3_MEDIAGEN_ROOT, sup.PH3_W2_CODE)
else:
  prePH3_ROOT = sup.PH1_DATA_ROOT
  PH3_SUB_DATA_ROOT = os.path.join(sup.PH3_DATA_ROOT, sup.PH3_WO2_CODE)
  PH3_SUB_BINGEN_ROOT = os.path.join(sup.PH3_BINGEN_ROOT, sup.PH3_WO2_CODE)
  PH3_SUB_MEDIAGEN_ROOT = os.path.join(sup.PH3_MEDIAGEN_ROOT, sup.PH3_WO2_CODE)

sup.report_dir_if_not_exists(prePH3_ROOT)
sup.create_dir_if_not_exists(PH3_SUB_DATA_ROOT)
sup.create_dir_if_not_exists(PH3_SUB_BINGEN_ROOT)
sup.create_dir_if_not_exists(PH3_SUB_MEDIAGEN_ROOT)

Directory /Users/diego/Desktop/iteso/TOG/data/PH2/all-classes exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/data/PH3/all-classes/w2 exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/bin/gen/PH3/all-classes/w2 exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/media/gen/PH3/all-classes/w2 exists. Continuing with execution


# Source dataframe loading

In [7]:
import pandas as pd

## Active hand detection

In [8]:
prePH3_ah_frame_df = pd.read_csv(os.path.join(prePH3_ROOT, f"{sup.DATA_AH_PF}.csv"))
prePH3_ah_frame_df

Unnamed: 0,fileid,person_id,cycle_num,handedness,class_name,class_numeric,current_frame,active_hand,h_mean_x,h_mean_y,...,cp0z,cp11x,cp11y,cp11z,cp12x,cp12y,cp12z,cp_h_mean_x,cp_h_mean_y,cp_h_mean_z
0,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,0,1,0.402325,0.431794,...,-0.130771,-1.790365,0.758362,0.060317,-1.957494,1.462776,-0.587015,-2.402462,0.948084,-0.124299
1,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,0,0,0.564077,0.991533,...,-0.130771,-3.036731,0.842753,0.349662,-1.957494,1.462776,-0.587015,-2.042705,1.020513,-0.073060
2,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,1,0.398845,0.437636,...,-0.187774,-1.858167,0.846123,0.018568,-1.968054,1.608527,-0.659838,-2.472514,1.049298,-0.177663
3,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,0,0.563614,0.992229,...,-0.187774,-3.137224,0.971932,0.272847,-1.968054,1.608527,-0.659838,-2.092982,1.108208,-0.114549
4,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,2,1,0.396001,0.443303,...,-0.135368,-1.946311,0.816087,0.073966,-2.090933,1.629638,-0.625052,-2.579638,1.040590,-0.133036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94338,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,9,1,0.615409,0.453786,...,-0.209412,-1.597262,1.062073,-0.057946,-1.483173,1.745201,-0.659519,-1.530058,1.183440,-0.091197
94339,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,10,1,0.626146,0.451569,...,-0.259935,-1.516973,1.103281,-0.110097,-1.360762,1.748590,-0.698988,-1.430187,1.181708,-0.109829
94340,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,10,0,0.426564,0.884752,...,-0.259935,-2.441346,1.395153,-0.035105,-1.360762,1.748590,-0.698988,-1.856482,1.201426,-0.245549
94341,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,11,1,0.647923,0.438531,...,-0.200455,-1.535220,1.089146,-0.062901,-1.425573,1.777704,-0.652256,-1.406661,1.201472,-0.065859


# Cleanup

## Identify numeric columns which will go through standardization and dimensionality reduction

In [9]:
if PH3_IS_W2:
  ah_pf_dataCols = sup.pf_wrist_hand_landmark_columns+sup.pf_chest_pose_landmark_columns+sup.pf_h_v123_columns
else:
  ah_pf_dataCols = sup.pf_hand_landmark_columns+sup.pf_pose_landmark_columns

## Standardize data

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
ah_pf_scaler = StandardScaler()

PH3_AH_FRM_std_df = prePH3_ah_frame_df[sup.tag_columns+sup.class_columns+[sup.current_frame_col]]
PH3_AH_FRM_std_df[ah_pf_dataCols] = ah_pf_scaler.fit_transform(prePH3_ah_frame_df[ah_pf_dataCols])

PH3_AH_FRM_std_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PH3_AH_FRM_std_df[ah_pf_dataCols] = ah_pf_scaler.fit_transform(prePH3_ah_frame_df[ah_pf_dataCols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PH3_AH_FRM_std_df[ah_pf_dataCols] = ah_pf_scaler.fit_transform(prePH3_ah_frame_df[ah_pf_dataCols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PH3_AH_

Unnamed: 0,fileid,person_id,cycle_num,handedness,class_name,class_numeric,active_hand,current_frame,wh0x,wh0y,...,cp_h_mean_z,h_v1x,h_v1y,h_v1z,h_v2x,h_v2y,h_v2z,h_v3x,h_v3y,h_v3z
0,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,0,0.764884,-0.869695,...,-0.610671,-0.718217,-1.012030,0.622521,0.172250,-1.204814,0.918248,-0.075909,0.217703,1.429862
1,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,0,0,1.438041,0.608877,...,-0.307176,-0.528814,1.262030,0.320316,0.390479,1.085597,0.747229,0.290752,0.100832,-0.544824
2,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,1,0.754249,-0.842544,...,-0.926753,-0.745743,-1.005569,0.541612,0.160396,-1.202498,0.825669,-0.080316,0.175252,1.445045
3,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,0,1,1.454831,0.677052,...,-0.552918,-0.548609,1.257528,0.274740,0.369654,1.084366,0.681460,0.278871,0.077592,-0.543173
4,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,2,0.721207,-0.860864,...,-0.662418,-0.790832,-1.000761,0.558894,0.139945,-1.202872,0.817269,-0.053439,0.168285,1.473003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94338,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,1,9,0.983778,-1.610890,...,-0.414600,-0.701825,-0.994490,0.249044,0.035133,-1.181446,0.314437,-0.058324,0.791610,-0.375360
94339,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,1,10,0.930203,-1.254603,...,-0.524962,-0.362571,-1.026824,0.313129,0.484862,-1.163868,0.378834,-0.112987,0.783633,-0.458311
94340,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,0,10,-0.168283,1.394977,...,-1.328846,1.238634,0.464985,-2.230619,-0.456329,0.794221,-1.668068,-1.990362,-1.928283,-0.663990
94341,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,1,11,1.069942,-0.828993,...,-0.264520,0.244367,-1.038705,0.477251,0.919816,-1.130570,0.684420,-0.014337,0.630783,-0.275856


## Save standardizers

In [12]:
with open(os.path.join(PH3_SUB_BINGEN_ROOT, f"scaler_{sup.DATA_AH_PF}.pkl"), 'wb') as f:
  pickle.dump(ah_pf_scaler, f)

# Dimensionality Reduction

In [13]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA, KernelPCA
import umap.umap_ as umap

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def visualize(origin_df, data_cols, label_col, reducer_name, kernel_name=""):    
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')

    labels_cat = origin_df[label_col].astype('category')
    categories = labels_cat.cat.categories
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}
    colors = labels_cat.map(cat_to_idx)
    cmap = plt.cm.Spectral

    if reducer_name == sup.PH3_REDUCER_NAME_PCA:
        reducer = PCA(n_components=3, random_state=42)
    elif reducer_name == sup.PH3_REDUCER_NAME_UMAP:
        reducer = umap.UMAP(n_components=3, random_state=42)
    elif reducer_name == sup.PH3_REDUCER_NAME_KPCA:
        reducer = KernelPCA(kernel=kernel_name,n_components=3, random_state=42)

    data = reducer.fit_transform(origin_df[data_cols])
    sc = ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=colors, cmap=cmap, s=20)

    for cat, idx in cat_to_idx.items():
        ax.scatter([], [], [], color=cmap(idx / (len(categories) - 1)), label=cat)
    ax.legend(title=label_col)

    figtitle = f"{reducer_name}-{kernel_name}-{sup.DATA_AH_PF}"
    ax.set_title(figtitle)

    PH3_SUB3_MEDIAGEN_ROOT = os.path.join(PH3_SUB_MEDIAGEN_ROOT, reducer_name, kernel_name)
    sup.create_dir_if_not_exists(PH3_SUB3_MEDIAGEN_ROOT)
    figpath = os.path.join(PH3_SUB3_MEDIAGEN_ROOT, f"{figtitle}.png")
    plt.savefig(figpath, dpi=300, bbox_inches='tight')

    plt.show()

def writeout(origin_df, data_cols, label_cols, reducer_name, kernel_name=""):
    for n in sup.PH3_N_CANDIDATES:
        if reducer_name == sup.PH3_REDUCER_NAME_PCA:
            reducer = PCA(n_components=n, random_state=42)
        elif reducer_name == sup.PH3_REDUCER_NAME_UMAP:
            reducer = umap.UMAP(n_components=n, random_state=42)
        elif reducer_name == sup.PH3_REDUCER_NAME_KPCA:
            reducer = KernelPCA(kernel=kernel_name,n_components=n, random_state=42)

        data = reducer.fit_transform(origin_df[data_cols])

        reduced_df = pd.DataFrame(data)
        reduced_df[sup.tag_columns+label_cols+[sup.current_frame_col]] = origin_df[sup.tag_columns+label_cols+[sup.current_frame_col]]

        PH3_SUB3_DATA_ROOT = os.path.join(PH3_SUB_DATA_ROOT, reducer_name, kernel_name)
        sup.create_dir_if_not_exists(PH3_SUB3_DATA_ROOT)
        PH3_SUB3_BINGEN_ROOT = os.path.join(PH3_SUB_BINGEN_ROOT, reducer_name, kernel_name)
        sup.create_dir_if_not_exists(PH3_SUB3_BINGEN_ROOT)

        reduced_df.to_csv(os.path.join(PH3_SUB3_DATA_ROOT, f"{sup.DATA_AH_PF}_{n}.csv"), index=False)
        with open(os.path.join(PH3_SUB3_BINGEN_ROOT, f"{sup.DATA_AH_PF}{n}.pkl"), 'wb') as f:
            pickle.dump(reducer, f)

## PCA

In [None]:
PH3_AH_FRM_PCA_df = PH3_AH_FRM_std_df.copy()

### Active hand detection

#### Exploration

In [None]:
visualize(origin_df=PH3_AH_FRM_PCA_df, data_cols=ah_pf_dataCols, label_col=sup.active_hand_col, reducer_name=sup.PH3_REDUCER_NAME_PCA, kernel_name='')

In [None]:
ah_frm_big_PCA = PCA(n_components=40)
ah_frm_big_PCA.fit(PH3_AH_FRM_PCA_df[ah_pf_dataCols])
ah_frm_big_PCA.explained_variance_ratio_

#### Writeout

In [None]:
writeout(origin_df=PH3_AH_FRM_PCA_df, data_cols=ah_pf_dataCols, label_cols=sup.class_columns, reducer_name=sup.PH3_REDUCER_NAME_PCA, kernel_name='')

## Kernel PCA

In [15]:
PH3_AH_FRM_KPCA_df = PH3_AH_FRM_std_df.copy()

### Active hand detection

In [16]:
PH3_AH_FRM_KPCA_df

Unnamed: 0,fileid,person_id,cycle_num,handedness,class_name,class_numeric,active_hand,current_frame,wh0x,wh0y,...,cp_h_mean_z,h_v1x,h_v1y,h_v1z,h_v2x,h_v2y,h_v2z,h_v3x,h_v3y,h_v3z
0,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,0,0.764884,-0.869695,...,-0.610671,-0.718217,-1.012030,0.622521,0.172250,-1.204814,0.918248,-0.075909,0.217703,1.429862
1,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,0,0,1.438041,0.608877,...,-0.307176,-0.528814,1.262030,0.320316,0.390479,1.085597,0.747229,0.290752,0.100832,-0.544824
2,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,1,0.754249,-0.842544,...,-0.926753,-0.745743,-1.005569,0.541612,0.160396,-1.202498,0.825669,-0.080316,0.175252,1.445045
3,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,0,1,1.454831,0.677052,...,-0.552918,-0.548609,1.257528,0.274740,0.369654,1.084366,0.681460,0.278871,0.077592,-0.543173
4,p01/Ciclo_1_5_Derecha/Ciclo_1_5_Derecha_1.mp4,p01,1,0,1,29,1,2,0.721207,-0.860864,...,-0.662418,-0.790832,-1.000761,0.558894,0.139945,-1.202872,0.817269,-0.053439,0.168285,1.473003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94338,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,1,9,0.983778,-1.610890,...,-0.414600,-0.701825,-0.994490,0.249044,0.035133,-1.181446,0.314437,-0.058324,0.791610,-0.375360
94339,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,1,10,0.930203,-1.254603,...,-0.524962,-0.362571,-1.026824,0.313129,0.484862,-1.163868,0.378834,-0.112987,0.783633,-0.458311
94340,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,0,10,-0.168283,1.394977,...,-1.328846,1.238634,0.464985,-2.230619,-0.456329,0.794221,-1.668068,-1.990362,-1.928283,-0.663990
94341,p11/Ciclo_5_5_Izquierda/Ciclo_5_5_Izquierda_z.mp4,p11,5,1,z,28,1,11,1.069942,-0.828993,...,-0.264520,0.244367,-1.038705,0.477251,0.919816,-1.130570,0.684420,-0.014337,0.630783,-0.275856


In [17]:
reducer = KernelPCA(kernel=sup.PH3_REDUCER_KERNEL_NAME_COS,n_components=1, random_state=42)

data = reducer.fit_transform(PH3_AH_FRM_KPCA_df[ah_pf_dataCols])

: 

#### Exploration

In [None]:
visualize(origin_df=PH3_AH_FRM_KPCA_df, data_cols=ah_pf_dataCols, label_col=sup.active_hand_col, reducer_name=sup.PH3_REDUCER_NAME_KPCA, kernel_name=sup.PH3_REDUCER_KERNEL_NAME_COS)

In [None]:
for kernel_name in sup.PH3_REDUCER_KERNEL_NAMES:
  visualize(origin_df=PH3_AH_FRM_KPCA_df, data_cols=ah_pf_dataCols, label_col=sup.active_hand_col, reducer_name=sup.PH3_REDUCER_NAME_KPCA, kernel_name=kernel_name)

#### Writeout

In [None]:
for kernel_name in sup.PH3_REDUCER_KERNEL_NAMES:
  writeout(origin_df=PH3_AH_FRM_KPCA_df, data_cols=ah_pf_dataCols, label_cols=sup.class_columns, reducer_name=sup.PH3_REDUCER_NAME_KPCA, kernel_name=kernel_name)

## UMAP

In [None]:
PH3_AH_FRM_UMAP_df = PH3_AH_FRM_std_df.copy()

### Active hand detection

#### Exploration

In [None]:
visualize(origin_df=PH3_AH_FRM_UMAP_df, data_cols=ah_pf_dataCols, label_col=sup.active_hand_col, reducer_name=sup.PH3_REDUCER_NAME_UMAP, kernel_name='')

#### Writeout

In [None]:
writeout(origin_df=PH3_AH_FRM_UMAP_df, data_cols=ah_pf_dataCols, label_cols=sup.class_columns, reducer_name=sup.PH3_REDUCER_NAME_UMAP, kernel_name='')