# Portfolio Assessment-2: “Systematic approach to develop ML model”

## 1 - Data collection

In [33]:
import pandas as pd
# One CSV per class: index 0 is read as boning, index 1 as slicing
file_paths = ['ampc2/Boning.csv', 'ampc2/Slicing.csv']

# Columns to load: x/y/z for each hand (6 columns) plus the 'Frame' column
cols_to_read = [
    'Right Hand x', 'Right Hand y', 'Right Hand z',
    'Left Hand x', 'Left Hand y', 'Left Hand z',
    'Frame',
]

boning_path, slicing_path = file_paths

# class 0 - boning
boning_df = pd.read_csv(boning_path, usecols=cols_to_read)
boning_df['class'] = 0

# class 1 - slicing
slicing_df = pd.read_csv(slicing_path, usecols=cols_to_read)
slicing_df['class'] = 1

# Stack both labelled frames (boning first) and renumber the rows from 0
labelled_frames = [boning_df, slicing_df]
df = pd.concat(labelled_frames, ignore_index=True)

# Persist the combined, labelled data set
df.to_csv('ampc2/combined_df.csv', index=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72060 entries, 0 to 72059
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Frame         72060 non-null  int64  
 1   Right Hand x  72060 non-null  float64
 2   Right Hand y  72060 non-null  float64
 3   Right Hand z  72060 non-null  float64
 4   Left Hand x   72060 non-null  float64
 5   Left Hand y   72060 non-null  float64
 6   Left Hand z   72060 non-null  float64
 7   class         72060 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 4.4 MB


## 2 - Create composite columns

### Column set 1 - Right Hand

In [34]:
import numpy as np

# ------- Column set 1 - Right hand -------

# Root mean square over each combination of right-hand axes:
# output column -> input columns that are squared, averaged per row and rooted
right_rms_inputs = {
    'rms_right_xy': ['Right Hand x', 'Right Hand y'],
    'rms_right_yz': ['Right Hand y', 'Right Hand z'],
    'rms_right_xz': ['Right Hand x', 'Right Hand z'],
    'rms_right_xyz': ['Right Hand x', 'Right Hand y', 'Right Hand z'],
}
for out_col, in_cols in right_rms_inputs.items():
    df[out_col] = np.sqrt(np.mean(df[in_cols] ** 2, axis=1))

# Right hand roll value, in degrees: angle of y against the magnitude of (x, z)
right_xz_norm = np.sqrt(df['Right Hand x']**2 + df['Right Hand z']**2)
df['right_hand_roll'] = 180 * np.arctan2(df['Right Hand y'], right_xz_norm) / np.pi

# Right hand pitch value, in degrees: angle of x against the magnitude of (y, z)
right_yz_norm = np.sqrt(df['Right Hand y']**2 + df['Right Hand z']**2)
df['right_hand_pitch'] = 180 * np.arctan2(df['Right Hand x'], right_yz_norm) / np.pi


### Column set 2 - Left Hand

In [35]:
# ------- Column set 2 - Left hand -------

# Root mean square over each combination of left-hand axes:
# output column -> input columns that are squared, averaged per row and rooted
left_rms_inputs = {
    'rms_left_xy': ['Left Hand x', 'Left Hand y'],
    'rms_left_yz': ['Left Hand y', 'Left Hand z'],
    'rms_left_xz': ['Left Hand x', 'Left Hand z'],
    'rms_left_xyz': ['Left Hand x', 'Left Hand y', 'Left Hand z'],
}
for out_col, in_cols in left_rms_inputs.items():
    df[out_col] = np.sqrt(np.mean(df[in_cols] ** 2, axis=1))

# Left hand roll value, in degrees: angle of y against the magnitude of (x, z)
left_xz_norm = np.sqrt(df['Left Hand x']**2 + df['Left Hand z']**2)
df['left_hand_roll'] = 180 * np.arctan2(df['Left Hand y'], left_xz_norm) / np.pi

# Left hand pitch value, in degrees: angle of x against the magnitude of (y, z)
left_yz_norm = np.sqrt(df['Left Hand y']**2 + df['Left Hand z']**2)
df['left_hand_pitch'] = 180 * np.arctan2(df['Left Hand x'], left_yz_norm) / np.pi

In [36]:
# Re-inspect the schema to confirm the composite columns were added without nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72060 entries, 0 to 72059
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Frame             72060 non-null  int64  
 1   Right Hand x      72060 non-null  float64
 2   Right Hand y      72060 non-null  float64
 3   Right Hand z      72060 non-null  float64
 4   Left Hand x       72060 non-null  float64
 5   Left Hand y       72060 non-null  float64
 6   Left Hand z       72060 non-null  float64
 7   class             72060 non-null  int64  
 8   rms_right_xy      72060 non-null  float64
 9   rms_right_yz      72060 non-null  float64
 10  rms_right_xz      72060 non-null  float64
 11  rms_right_xyz     72060 non-null  float64
 12  right_hand_roll   72060 non-null  float64
 13  right_hand_pitch  72060 non-null  float64
 14  rms_left_xy       72060 non-null  float64
 15  rms_left_yz       72060 non-null  float64
 16  rms_left_xz       72060 non-null  float64

In [37]:
# Preview the first rows, including the newly derived composite columns
df.head()

Unnamed: 0,Frame,Right Hand x,Right Hand y,Right Hand z,Left Hand x,Left Hand y,Left Hand z,class,rms_right_xy,rms_right_yz,rms_right_xz,rms_right_xyz,right_hand_roll,right_hand_pitch,rms_left_xy,rms_left_yz,rms_left_xz,rms_left_xyz,left_hand_roll,left_hand_pitch
0,0,0.311465,-0.329472,0.750763,0.906499,-0.024053,0.779686,0,0.320595,0.57974,0.574742,0.506362,-22.065225,20.801413,0.641217,0.551584,0.845473,0.690466,-1.152438,49.287457
1,1,0.563723,-0.088187,1.033415,0.917992,0.172597,0.860215,0,0.40346,0.733391,0.832385,0.681544,-4.284295,28.524936,0.660492,0.620387,0.889573,0.733137,7.811881,46.296499
2,2,0.474087,-0.922834,0.802289,0.813233,0.054823,0.709743,0,0.733615,0.864665,0.658948,0.757199,-44.720146,21.191281,0.576348,0.503359,0.763244,0.62399,2.907617,48.80299
3,3,0.690891,-1.622115,0.393867,0.557506,0.006721,0.910102,0,1.246713,1.180336,0.562344,1.043027,-63.882655,22.484355,0.394245,0.643557,0.754685,0.61621,0.360782,31.489924
4,4,0.179927,-1.985673,1.390812,0.409958,-0.113903,0.978904,0,1.409835,1.714243,0.991648,1.403523,-54.767962,4.244606,0.300865,0.69686,0.750439,0.61625,-6.125888,22.586656


## 3 - Data pre-processing and Feature computation

In [None]:
import pandas as pd
import numpy as np

from scipy.signal import find_peaks
from scipy import integrate

def calculate_auc(y):
    """Return the area under the curve described by the samples in `y`.

    The samples are integrated with the trapezoidal rule through
    ``scipy.integrate.trapezoid`` using its default sample spacing.
    """
    area = integrate.trapezoid(y)
    return area
    
def calculate_peak(y):
    """Return how many peaks ``scipy.signal.find_peaks`` detects in `y`.

    `find_peaks` is called with its default settings, so no height,
    distance or prominence condition is applied.
    """
    peak_indices = find_peaks(y)[0]
    return len(peak_indices)


# Number of consecutive frames that are summarised into one feature row.
# NOTE(review): the name implies 60 frames == 1 minute; the capture rate is not
# visible in this notebook - confirm against the recording setup.
frames_per_minute = 60

# Number of COMPLETE chunks in the data. Using len(df) here (one iteration per
# frame instead of one per chunk) turns every chunk past the end of the data
# into an empty slice, producing tens of thousands of NaN/zero rows and a
# 'class' column that is mostly missing. Trailing frames that do not fill a
# whole chunk are dropped.
num_minutes = len(df) // frames_per_minute

new_cols = {}

for column in df.columns:
    # 'Frame' and 'class' are not signals to summarise
    if column in ('Frame', 'class'):
        continue

    signal = df[column]

    # Per-chunk feature values for this column
    mean_values = []
    max_values = []
    min_values = []
    std_values = []
    auc_values = []
    peak_values = []

    # Walk over the non-overlapping chunks of `frames_per_minute` frames
    for i in range(num_minutes):
        start = i * frames_per_minute
        end = start + frames_per_minute

        # Slice once (explicitly by position) and reuse for every statistic
        chunk = signal.iloc[start:end]

        # Basic descriptive statistics of the chunk
        mean_values.append(np.mean(chunk))
        max_values.append(np.max(chunk))
        min_values.append(np.min(chunk))
        std_values.append(np.std(chunk))

        # Area under the curve (trapezoidal rule)
        auc_values.append(calculate_auc(chunk))

        # Number of peaks found in the chunk
        peak_values.append(calculate_peak(chunk))

    # Collect the per-chunk features under '<column>_<statistic>' names
    new_cols[f'{column}_mean'] = mean_values
    new_cols[f'{column}_max'] = max_values
    new_cols[f'{column}_min'] = min_values
    new_cols[f'{column}_std'] = std_values
    new_cols[f'{column}_auc'] = auc_values
    new_cols[f'{column}_peak'] = peak_values

# Convert the new columns to a dataframe to avoid fragmentation issues
new_features_df = pd.DataFrame(new_cols)

# Label each chunk with the class of its first frame.
# NOTE(review): if the boning/slicing boundary does not fall on a multiple of
# `frames_per_minute`, one chunk mixes frames of both classes - verify.
new_features_df['class'] = (
    df['class'].iloc[::frames_per_minute].iloc[:num_minutes].reset_index(drop=True)
)

# 1-based running chunk number across the whole concatenated data set
new_features_df['Minute'] = range(1, num_minutes + 1)

In [None]:
# Persist the per-chunk feature table (without the index), then preview it
features_csv_path = 'ampc2/new_features_per_min.csv'
new_features_df.to_csv(features_csv_path, index=False)

new_features_df.head()