# Setup

In [1]:
import time
import os
from pyspark.sql import SparkSession

# Root directory of the project checkout. Set the ITESO_S3_TOP_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original development path is used, so existing behaviour is unchanged.
top_dir = os.environ.get(
    'ITESO_S3_TOP_DIR',
    '/Users/diego/projects/itesoS3/machine-learning/machine-learning-s3',
)
# Sub-directory name selecting which class split is processed
NUM_CLASSES = 'two-classes'
# Input of this notebook: the CSV read at the start of the pipeline
ph1_labels_path = f'{top_dir}/data/processed/ph1/{NUM_CLASSES}/ph1_data.csv'
# Output of this notebook: directory receiving the partitioned parquet files
ph2_labels_path = f'{top_dir}/data/processed/ph2/{NUM_CLASSES}/ph2_parquet_files'
# Identifier / label columns carried unchanged through every transformation
id_col_names = ['person_id', 'cycle_id', 'handedness', 'class', 'class_numeric']

In [2]:
# Wall-clock reference; the last cell uses it to report the total run time
start = time.time()

## SPARK SETUP
# Local-mode Spark session bound to localhost, with the UI on port 4040.
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("itesoS3-data-prep-ph2")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Grab the underlying Spark context and only log errors
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/14 22:39:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the labels file

In [3]:
# Load the labels CSV: the first row holds the column names and Spark infers
# each column's type from the data
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(ph1_labels_path)
)

# Names of the 63 landmark coordinate columns, grouped per landmark:
# "0x", "0y", "0z", "1x", ..., "20z"
landmark_col_names = [
    f"{point}{axis}"
    for point in range(21)
    for axis in ("x", "y", "z")
]

# Print the column names and the types Spark inferred for the loaded CSV
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- person_id: integer (nullable = true)
 |-- cycle_id: integer (nullable = true)
 |-- handedness: integer (nullable = true)
 |-- frame_id: integer (nullable = true)
 |-- 0x: double (nullable = true)
 |-- 0y: double (nullable = true)
 |-- 0z: double (nullable = true)
 |-- 1x: double (nullable = true)
 |-- 1y: double (nullable = true)
 |-- 1z: double (nullable = true)
 |-- 2x: double (nullable = true)
 |-- 2y: double (nullable = true)
 |-- 2z: double (nullable = true)
 |-- 3x: double (nullable = true)
 |-- 3y: double (nullable = true)
 |-- 3z: double (nullable = true)
 |-- 4x: double (nullable = true)
 |-- 4y: double (nullable = true)
 |-- 4z: double (nullable = true)
 |-- 5x: double (nullable = true)
 |-- 5y: double (nullable = true)
 |-- 5z: double (nullable = true)
 |-- 6x: double (nullable = true)
 |-- 6y: double (nullable = true)
 |-- 6z: double (nullable = true)
 |-- 7x: double (nullable = true)
 |-- 7y: double (nullable = true)
 |-- 7z: 

In [4]:
# Preview the first ten rows of the raw input
df.show(10)

+-----+---------+--------+----------+--------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-----

# Phase 2 of data preparation transformations

## We obtain the center of gravity of the hand

In [5]:
# Keep only the identifier columns followed by the raw landmark coordinates
relevant_columns = id_col_names + landmark_col_names
relevant_df = df.select(relevant_columns)
relevant_df.show(10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

In [6]:
from pyspark.sql.functions import lit

w_mean_df = relevant_df.alias('w_mean_df')

# Centre of gravity of the hand: for each axis, average that coordinate over
# the 21 landmarks. Python's built-in sum() chains the Column additions.
for axis in ('x', 'y', 'z'):
    axis_total = sum(w_mean_df[f'{point}{axis}'] for point in range(21))
    w_mean_df = w_mean_df.withColumn(f'mean_{axis}', axis_total/lit(21))

w_mean_df.show()

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------

## We want the vector normal to the "plane formed by the hand"

### We extract the position of points A (which lies between points 5 and 9) and B (which lies between points 13 and 17). 

In [7]:
AB_df = w_mean_df.alias('AB_df')

# Each helper point is the midpoint of two landmarks:
#   A lies halfway between landmarks #5 and #9
#   B lies halfway between landmarks #13 and #17
midpoints = (('A', 5, 9), ('B', 13, 17))

for label, first, second in midpoints:
    for axis in ('x', 'y', 'z'):
        pair_sum = AB_df[f'{first}{axis}'] + AB_df[f'{second}{axis}']
        AB_df = AB_df.withColumn(f'{label}{axis}', pair_sum/lit(2))

AB_df.show(10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

#### We extract the vectors going from 0 to A  and from 0 to B

In [8]:
zeroA_zeroB_df = AB_df.alias('zeroA_zeroB_df')

# Vectors from landmark 0 to the helper points:
#   0A (later used as v2) goes from landmark 0 to A
#   0B (later used as v1) goes from landmark 0 to B
for target in ('A', 'B'):
    for axis in ('x', 'y', 'z'):
        offset = zeroA_zeroB_df[f'{target}{axis}'] - zeroA_zeroB_df[f'0{axis}']
        zeroA_zeroB_df = zeroA_zeroB_df.withColumn(f'0{target}{axis}', offset)

zeroA_zeroB_df.show(10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

#### We need to normalize the vectors

In [9]:
norms_0A_0B_df = zeroA_zeroB_df.alias('norms_0A_0B_df')

# Euclidean norm of each vector: sqrt(x^2 + y^2 + z^2).
# The previous version squared the x component three times, which yields
# sqrt(3)*|x| rather than the vector length, so v1 and v2 were not unit vectors.
for vector, norm_col in (('0A', 'n0A'), ('0B', 'n0B')):
    squared_length = (
        norms_0A_0B_df[f'{vector}x']**2
        + norms_0A_0B_df[f'{vector}y']**2
        + norms_0A_0B_df[f'{vector}z']**2
    )
    norms_0A_0B_df = norms_0A_0B_df.withColumn(norm_col, squared_length**0.5)

norms_0A_0B_df.show(n=10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

In [10]:
normalized_v1_v2_df = norms_0A_0B_df.alias('normalized_v1_v2_df')

############################### IMPORTANT NOTE ###############################
#
# This is where 0A and 0B are assigned their roles in the future basis:
# v1 comes from 0B and v2 comes from 0A. The choice matters for handedness.
# To keep the coordinate system "hand centric", both vectors stay attached to
# the same hand landmarks whichever hand is observed.
# v3 is likewise chosen to always point out of the hand in the direction the
# hand closes.
# Consequently, for one of the two hands the basis is not "natural" in the
# physics sense: the cross product of v1 and v2 is the opposite of v3.
#
############################### IMPORTANT NOTE ###############################

# Divide every component by the norm of the vector it belongs to
for basis_name, vector, norm_col in (('v1', '0B', 'n0B'), ('v2', '0A', 'n0A')):
    for axis in ('x', 'y', 'z'):
        scaled = normalized_v1_v2_df[f'{vector}{axis}']/normalized_v1_v2_df[norm_col]
        normalized_v1_v2_df = normalized_v1_v2_df.withColumn(f'{basis_name}{axis}', scaled)

normalized_v1_v2_df.show(10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

### We wish to obtain the vector normal to the 'plane of the palm', which we assimilate here to the plane formed by 0B and 0A

In [11]:
from pyspark.sql.functions import when

v3_df = normalized_v1_v2_df.alias('v3_df')

# prev3 is the cross product of v1 and v2.
# prev3_x = v1_y*v2_z - v1_z*v2_y
# prev3_y = v1_z*v2_x - v1_x*v2_z
# prev3_z = v1_x*v2_y - v1_y*v2_x

v3_df = v3_df.withColumn('prev3x', v3_df['v1y']*v3_df['v2z'] - v3_df['v1z']*v3_df['v2y'])
v3_df = v3_df.withColumn('prev3y', v3_df['v1z']*v3_df['v2x'] - v3_df['v1x']*v3_df['v2z'])
v3_df = v3_df.withColumn('prev3z', v3_df['v1x']*v3_df['v2y'] - v3_df['v1y']*v3_df['v2x'])

# The cross product of two unit vectors has length sin(angle between them),
# which equals 1 only when they are perpendicular. 0A and 0B are not
# perpendicular in general, so the cross product is normalized explicitly.
# A zero-length cross product (v1 parallel to v2) leaves no usable normal.
v3_df = v3_df.withColumn(
    'nprev3',
    (v3_df['prev3x']**2 + v3_df['prev3y']**2 + v3_df['prev3z']**2)**0.5,
)

############################### IMPORTANT NOTE ###############################
#
# The coordinate system is kept "hand centric": v3 always points the same way
# relative to the hand. As a consequence, for one of the two hands (the right
# hand) the coordinate system is not "natural" in the physics sense.
# Rows with handedness == 1 keep the cross product; all others get it negated
# (presumably 1 encodes the left hand -- TODO confirm against the labels).
#
############################### IMPORTANT NOTE ###############################
for axis in ('x', 'y', 'z'):
    unit_component = v3_df[f'prev3{axis}']/v3_df['nprev3']
    v3_df = v3_df.withColumn(
        f'v3{axis}',
        when(v3_df['handedness'] == 1, unit_component).otherwise(-1 * unit_component),
    )

v3_df.show(n=10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

### We drop the unnecessary columns

In [12]:
# Drop the intermediate columns: keep identifiers, raw landmarks, the three
# basis vectors and the centre of gravity
basis_cols = ['v1x', 'v1y', 'v1z', 'v2x', 'v2y', 'v2z'] + ['v3x', 'v3y', 'v3z']
centroid_cols = ['mean_x', 'mean_y', 'mean_z']
vectors_df = v3_df.select(id_col_names + landmark_col_names + basis_cols + centroid_cols)
vectors_df.show(10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

## We want a position and orientation independent coordinate system

### First we move the origin of the frame of reference to the wrist

In [13]:
wrist_df = vectors_df.alias('wrist_df')

# Landmark 0 is the wrist. Subtract its position from every landmark so the
# wrist becomes the origin; results go into new columns w<i>x / w<i>y / w<i>z
# and the raw coordinates are left untouched.
for point in range(21):
    for axis in ('x', 'y', 'z'):
        relative = wrist_df[f'{point}{axis}'] - wrist_df[f'0{axis}']
        wrist_df = wrist_df.withColumn(f'w{point}{axis}', relative)

wrist_df.show()

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------

### Next we want to change the base of the frame of reference to the one defined by (v1, v2, v3)



If $P$ is the matrix $\left(\begin{array}{ccc} v_{1} & v_{2} & v_{3}\end{array}\right)$, we need to calculate its inverse $P^{-1}$.

$P^{-1}$ is the adjoint matrix of $P$ divided by the determinant of $P$.

if $P$ is
$$
\left(\begin{array}{ccc} 
v_{1x} & v_{2x} & v_{3x}\\
v_{1y} & v_{2y} & v_{3y}\\
v_{1z} & v_{2z} & v_{3z}
\end{array}\right)
$$ 
then the adjoint matrix of $P$ is the transpose of
$$
\left(\begin{array}{ccc} 
A_{11} & A_{12} & A_{13}\\
A_{21} & A_{22} & A_{23}\\
A_{31} & A_{32} & A_{33}
\end{array}\right)
$$ 

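where $A_{ij} = (-1)^{i+j}\det(C_{ij})$ is the cofactor of $P$ at position $(i, j)$,
and $C_{ij}$ is the submatrix of $P$ obtained by deleting row $i$ and column $j$ (its determinant $\det(C_{ij})$ is called the minor)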

and the determinant of $P$ is: $v_{1x} * det(C_{11}) - v_{2x} * det(C_{12}) + v_{3x} * det(C_{13})$

#### Let's calculate the nine minors $\det(C_{ij})$ (the cofactors without their $(-1)^{i+j}$ sign)

In [14]:


cofactors_df = wrist_df.alias('cofactors_df')

# Column 'Aij' holds the 2x2 determinant left after removing row i and
# column j from P = (v1 v2 v3), i.e. the unsigned minor det(C_ij).
# The (-1)^(i+j) cofactor sign is NOT applied in this cell.
# Each entry is (name, a, b, c, d) and evaluates to a*b - c*d.
minor_terms = (
    ('A11', 'v2y', 'v3z', 'v2z', 'v3y'),
    ('A12', 'v1y', 'v3z', 'v1z', 'v3y'),
    ('A13', 'v1y', 'v2z', 'v1z', 'v2y'),
    ('A21', 'v2x', 'v3z', 'v2z', 'v3x'),
    ('A22', 'v1x', 'v3z', 'v1z', 'v3x'),
    ('A23', 'v1x', 'v2z', 'v1z', 'v2x'),
    ('A31', 'v2x', 'v3y', 'v2y', 'v3x'),
    ('A32', 'v1x', 'v3y', 'v1y', 'v3x'),
    ('A33', 'v1x', 'v2y', 'v1y', 'v2x'),
)

for name, a, b, c, d in minor_terms:
    minor = cofactors_df[a] * cofactors_df[b] - cofactors_df[c] * cofactors_df[d]
    cofactors_df = cofactors_df.withColumn(name, minor)

cofactors_df.show(10)



+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

#### Now let's calculate the determinant

In [15]:
determinant_df = cofactors_df.alias('determinant_df')

# Expansion along the first row of P, with the alternating signs written out:
# det(P) = v1x * A11 - v2x * A12 + v3x * A13
first_row_expansion = (
    determinant_df['v1x'] * determinant_df['A11']
    - determinant_df['v2x'] * determinant_df['A12']
    + determinant_df['v3x'] * determinant_df['A13']
)
determinant_df = determinant_df.withColumn('det', first_row_expansion)

determinant_df.show(10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

### Now we can get the inverse matrix of P

In [16]:
inv_P_df = determinant_df.alias('inv_P_df')

# P^{-1} = adj(P) / det(P), where adj(P) is the TRANSPOSE of the cofactor
# matrix and cofactor_ij = (-1)^(i+j) * minor_ij.
# The Aij columns hold unsigned minors, so the sign has to be applied here:
#   I_rc = (-1)^(r+c) * A_cr / det
# Without it the entries I12, I21, I23 and I32 have the wrong sign and the
# result is not the inverse of P.
for row in (1, 2, 3):
    for col in (1, 2, 3):
        sign = -1 if (row + col) % 2 else 1
        inv_P_df = inv_P_df.withColumn(
            f'I{row}{col}',
            sign * inv_P_df[f'A{col}{row}']/inv_P_df['det'],
        )

inv_P_df.show(n=10)

+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

### To change basis we must perform the following operation: $P^{-1} v_i$, where $v_i$ is the position vector of point $i$

In [17]:
rebased_df = inv_P_df.alias('left_mult_df')

# Left-multiply each wrist-relative landmark (w<i>x, w<i>y, w<i>z) by the
# 3x3 matrix I: row 1 yields b<i>x, row 2 yields b<i>y, row 3 yields b<i>z
for point in range(21):
    wx, wy, wz = (f'w{point}{axis}' for axis in ('x', 'y', 'z'))
    for row, axis in enumerate(('x', 'y', 'z'), start=1):
        component = (
            rebased_df[f'I{row}1'] * rebased_df[wx]
            + rebased_df[f'I{row}2'] * rebased_df[wy]
            + rebased_df[f'I{row}3'] * rebased_df[wz]
        )
        rebased_df = rebased_df.withColumn(f'b{point}{axis}', component)

rebased_df.show(10)


+---------+--------+----------+-----+-------------+-------------------+-------------------+--------------------+-------------------+------------------+--------------------+------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+

### We keep only the relevant columns

In [18]:
# Names of the 63 re-based landmark columns: "b0x", "b0y", "b0z", ..., "b20z"
final_landmark_col_names = [
    f"b{point}{axis}"
    for point in range(21)
    for axis in ("x", "y", "z")
]


# Final feature set: identifiers, re-based landmarks, the v3 vector and the
# centre of gravity
extra_feature_cols = ['v3x', 'v3y', 'v3z'] + ['mean_x', 'mean_y', 'mean_z']
final_df = rebased_df.select(id_col_names + final_landmark_col_names + extra_feature_cols)
final_df.show(10)

+---------+--------+----------+-----+-------------+----+---+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------

In [19]:
# Print the column names and types of the frame that is about to be written
final_df.printSchema()

root
 |-- person_id: integer (nullable = true)
 |-- cycle_id: integer (nullable = true)
 |-- handedness: integer (nullable = true)
 |-- class: string (nullable = true)
 |-- class_numeric: integer (nullable = true)
 |-- b0x: double (nullable = true)
 |-- b0y: double (nullable = true)
 |-- b0z: double (nullable = true)
 |-- b1x: double (nullable = true)
 |-- b1y: double (nullable = true)
 |-- b1z: double (nullable = true)
 |-- b2x: double (nullable = true)
 |-- b2y: double (nullable = true)
 |-- b2z: double (nullable = true)
 |-- b3x: double (nullable = true)
 |-- b3y: double (nullable = true)
 |-- b3z: double (nullable = true)
 |-- b4x: double (nullable = true)
 |-- b4y: double (nullable = true)
 |-- b4z: double (nullable = true)
 |-- b5x: double (nullable = true)
 |-- b5y: double (nullable = true)
 |-- b5z: double (nullable = true)
 |-- b6x: double (nullable = true)
 |-- b6y: double (nullable = true)
 |-- b6z: double (nullable = true)
 |-- b7x: double (nullable = true)
 |-- b7y: double

# We write to the processed data directory. Data preparation is over

In [20]:
# Write the result as parquet, partitioned on the listed columns; whatever
# already exists at the output path is overwritten
partition_cols = ['person_id', 'cycle_id', 'handedness', 'class']
(
    final_df.write
    .partitionBy(*partition_cols)
    .mode("overwrite")
    .parquet(ph2_labels_path)
)





CodeCache: size=131072Kb used=42224Kb max_used=42225Kb free=88847Kb
 bounds [0x00000001081e8000, 0x000000010ab78000, 0x00000001101e8000]
 total_blobs=14648 nmethods=13670 adapters=889
 compilation: disabled (not enough contiguous free space left)


                                                                                

In [21]:
# Shape of the input frame as a (rows, columns) tuple
input_shape = (df.count(), len(df.columns))
print(input_shape)

(3235, 69)


In [22]:
# Shut down the Spark context now that all Spark work is done
sc.stop()

In [23]:
# Total run time in minutes, measured from the `start` timestamp
elapsed_seconds = time.time() - start
print(elapsed_seconds/60)

0.3802570859591166
