#### **Week 3: Portfolio Assessment**

This notebook serves Porfolio Assessment in Week 3 of Unit `COS40007 - AI for Engineering`. All necessary inputs and data are configured automatically through downloading from google drive, so we don't need to change anything, just start and verify the results. My student ID ends with 1, so I will be using Shoulder - Left, Right, x, y, z

------------------------------------------------------------------------

**Minh Hieu Tran** - 104850021

### Step 1: Data Collection

In [None]:
import pandas as pd
import numpy as np

# Read the CSV files
df_boning = pd.read_csv('https://raw.githubusercontent.com/defurl/COS40007---AIEngineer/refs/heads/main/Week3/Boning.csv')
df_slicing = pd.read_csv('https://raw.githubusercontent.com/defurl/COS40007---AIEngineer/refs/heads/main/Week3/Slicing.csv')

# Select only Right Shoulder and Left Shoulder
boning_data = df_boning[['Frame', 'Right Shoulder x', 'Right Shoulder y', 'Right Shoulder z',
                        'Left Shoulder x', 'Left Shoulder y', 'Left Shoulder z']].copy()
slicing_data = df_slicing[['Frame', 'Right Shoulder x', 'Right Shoulder y', 'Right Shoulder z',
                          'Left Shoulder x', 'Left Shoulder y', 'Left Shoulder z']].copy()

# Add class labels
boning_data['class'] = 0
slicing_data['class'] = 1

print("Boning data shape:", boning_data.shape)
print("Slicing data shape:", slicing_data.shape)

Boning data shape: (54180, 8)
Slicing data shape: (17880, 8)


### Step 2: Create Composite Columns

In [5]:
def create_composite_features(data):
  result = data.copy()

  # For Right Shoulder
  rs_x, rs_y, rs_z = data['Right Shoulder x'], data['Right Shoulder y'], data['Right Shoulder z']

  # RMS calculations for Right Shoulder
  result['Right_Shoulder_RMS_xy'] = np.sqrt((rs_x**2 + rs_y**2) / 2)
  result['Right_Shoulder_RMS_yz'] = np.sqrt((rs_y**2 + rs_z**2) / 2)
  result['Right_Shoulder_RMS_zx'] = np.sqrt((rs_z**2 + rs_x**2) / 2)
  result['Right_Shoulder_RMS_xyz'] = np.sqrt((rs_x**2 + rs_y**2 + rs_z**2) / 3)

  # Roll and Pitch for Right Shoulder
  result['Right_Shoulder_Roll'] = 180 * np.arctan2(rs_y, np.sqrt(rs_x**2 + rs_z**2)) / np.pi
  result['Right_Shoulder_Pitch'] = 180 * np.arctan2(rs_x, np.sqrt(rs_y**2 + rs_z**2)) / np.pi

  # For Left Shoulder
  rs_x, ls_y, ls_z = data['Left Shoulder x'], data['Left Shoulder y'], data['Left Shoulder z']

  # RMS calculations for Left Shoulder
  result['Left_Shoulder_RMS_xy'] = np.sqrt((rs_x**2 + ls_y**2) / 2)
  result['Left_Shoulder_RMS_yz'] = np.sqrt((ls_y**2 + ls_z**2) / 2)
  result['Left_Shoulder_RMS_zx'] = np.sqrt((ls_z**2 + rs_x**2) / 2)
  result['Left_Shoulder_RMS_xyz'] = np.sqrt((rs_x**2 + ls_y**2 + ls_z**2) / 3)

  # Roll and Pitch for Left Shoulder
  result['Left_Shoulder_Roll'] = 180 * np.arctan2(ls_y, np.sqrt(rs_x**2 + ls_z**2)) / np.pi
  result['Left_Shoulder_Pitch'] = 180 * np.arctan2(rs_x, np.sqrt(ls_y**2 + ls_z**2)) / np.pi

  return result

# Apply composite feature creation to both datasets
boning_composite = create_composite_features(boning_data)
slicing_composite = create_composite_features(slicing_data)

print("After adding composite features:")
print("Boning data shape:", boning_composite.shape)
print("Slicing data shape:", slicing_composite.shape)

After adding composite features:
Boning data shape: (54180, 20)
Slicing data shape: (17880, 20)


### Step 3: Data Pre-processing & Feature Computation

In [6]:
from scipy.signal import find_peaks
import numpy as np

def compute_statistical_features(group):
  features = {}

  # Get all columns except Frame, class, and minute
  feature_cols = [col for col in group.columns if col not in ['Frame', 'class', 'minute']]

  for col in feature_cols:
    values = group[col].values

    # Mean
    features[f'{col}_mean'] = np.mean(values)
    # Standard deviation
    features[f'{col}_std'] = np.std(values)
    # Min
    features[f'{col}_min'] = np.min(values)
    # Max
    features[f'{col}_max'] = np.max(values)
    # Area under curve (AUC) using trapezoidal rule
    features[f'{col}_auc'] = np.trapz(np.abs(values))
    # Number of peaks
    peaks, _ = find_peaks(values, height=np.mean(values))
    features[f'{col}_num_peaks'] = len(peaks)

  return pd.Series(features)

def group_by_minute(data):
    data['minute'] = data['Frame'] // 60
    return data

# Group by minutes
boning_grouped = group_by_minute(boning_composite)
slicing_grouped = group_by_minute(slicing_composite)

print(f"Boning data: {len(boning_grouped['minute'].unique())} minutes")
print(f"Slicing data: {len(slicing_grouped['minute'].unique())} minutes")

Boning data: 903 minutes
Slicing data: 298 minutes


In [7]:
# Compute statistical features for each minute
boning_features = boning_grouped.groupby('minute').apply(compute_statistical_features).reset_index()
slicing_features = slicing_grouped.groupby('minute').apply(compute_statistical_features).reset_index()

# Add class labels
boning_features['class'] = 0
slicing_features['class'] = 1

# Combine datasets
final_dataset = pd.concat([boning_features, slicing_features], ignore_index=True)

# Remove the minute column
final_dataset = final_dataset.drop('minute', axis=1)

print("Final dataset shape:", final_dataset.shape)
print("Number of features (excluding class):", final_dataset.shape[1] - 1)
print("\nClass distribution:")
print(final_dataset['class'].value_counts())

  features[f'{col}_auc'] = np.trapz(np.abs(values))
  boning_features = boning_grouped.groupby('minute').apply(compute_statistical_features).reset_index()
  features[f'{col}_auc'] = np.trapz(np.abs(values))


Final dataset shape: (1201, 109)
Number of features (excluding class): 108

Class distribution:
class
0    903
1    298
Name: count, dtype: int64


  slicing_features = slicing_grouped.groupby('minute').apply(compute_statistical_features).reset_index()


### Step 4: Training - SVM Classifier

In [8]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [9]:

# Data preparation
X = final_dataset.drop('class', axis=1)
y = final_dataset['class']

print("===== Dataset for training: =====")
print(f"Features: {X.shape}")
print(f"Labels: {y.shape}")

# Split data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27, stratify=y)
print(f"\nTrain set: {X_train.shape}\nTest set: {X_test.shape}")

===== Dataset for training: =====
Features: (1201, 108)
Labels: (1201,)

Train set: (840, 108)
Test set: (361, 108)


#### 1 - 2: Basic SVM with train-test split (70/30) and 10-fold CV

In [27]:
# SVM
svm_test = SVC(random_state=27)
svm_test.fit(X_train, y_train)
y_pred_test = svm_test.predict(X_test)
accuracy_1_train_test = accuracy_score(y_test, y_pred_test)
print(f"Train-Test Split Accuracy: {accuracy_1_train_test:.4f}")

Train-Test Split Accuracy: 0.7507


In [11]:
# SVM 10-fold
svm_basic_cv = SVC(random_state=27)
cv_scores_basic = cross_val_score(svm_basic_cv, X, y, cv=10)
accuracy_2_cv = cv_scores_basic.mean()
std_2_cv = cv_scores_basic.std()
print(f"10-Fold CV Accuracy: {accuracy_2_cv:.4f} ± {std_2_cv:.4f}")

10-Fold CV Accuracy: 0.7519 ± 0.0033


#### 3: Hyperparameter tuning for BOTH train-test split AND 10-fold CV

In [12]:
# Param Grid for Grid Search
param_grid = {
  'C': [0.1, 1, 10, 100],
  'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
  'kernel': ['rbf', 'linear']
}

In [13]:
# Find best hyperparameters using GridSearch
grid_search = GridSearchCV(SVC(random_state=27), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters found: {grid_search.best_params_}")

Best parameters found: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [14]:
# Apply hyperparameter tuning
print("Train-Test Split with Hyperparameter Tuning:")
y_pred_3a = grid_search.predict(X_test)
accuracy_3a_train_test = accuracy_score(y_test, y_pred_3a)
print(f"Train-Test Split Accuracy: {accuracy_3a_train_test:.4f}")

Train-Test Split with Hyperparameter Tuning:
Train-Test Split Accuracy: 0.7867


In [15]:
# Apply hyperparameter tuning to 10-fold Cross-Validation
print("10-Fold CV with Hyperparameter Tuning:")
# Use the best estimator for cross-validation
best_svm = grid_search.best_estimator_
cv_scores_3b = cross_val_score(best_svm, X, y, cv=10)
accuracy_3b_cv = cv_scores_3b.mean()
std_3b_cv = cv_scores_3b.std()
print(f"10-Fold CV Accuracy: {accuracy_3b_cv:.4f} ± {std_3b_cv:.4f}")

10-Fold CV with Hyperparameter Tuning:
10-Fold CV Accuracy: 0.7943 ± 0.0480


#### Step 4: Hyperparameter tuning + 10 best features

In [16]:
# Select 10 best features
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

In [17]:
# Split the selected features for train-test
X_train_selected, X_test_selected, _, _ = train_test_split(X_selected, y, test_size=0.3, random_state=27, stratify=y)

In [18]:
# 4a) Train-Test Split with Hyperparameter Tuning + 10 Best Features
print("Train-Test Split + Tuning + 10 Best Features:")
grid_search_4a = GridSearchCV(SVC(random_state=27), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_4a.fit(X_train_selected, y_train)
y_pred_4a = grid_search_4a.predict(X_test_selected)
accuracy_4a_train_test = accuracy_score(y_test, y_pred_4a)
print(f"Train-Test Split Accuracy: {accuracy_4a_train_test:.4f}")

Train-Test Split + Tuning + 10 Best Features:
Train-Test Split Accuracy: 0.7784


In [19]:
# 4b) 10-Fold CV with Hyperparameter Tuning + 10 Best Features
print("10-Fold CV + Tuning + 10 Best Features:")
best_svm_4b = grid_search_4a.best_estimator_
cv_scores_4b = cross_val_score(best_svm_4b, X_selected, y, cv=10)
accuracy_4b_cv = cv_scores_4b.mean()
std_4b_cv = cv_scores_4b.std()
print(f"10-Fold CV Accuracy: {accuracy_4b_cv:.4f} ± {std_4b_cv:.4f}")

10-Fold CV + Tuning + 10 Best Features:
10-Fold CV Accuracy: 0.8010 ± 0.0369


#### Step 5: Hyperparameter tuning + PCA (n_components=10)

In [20]:
# Apply PCA
print("Applying PCA to extract 10 principal components...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=10, random_state=27)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

# Split the PCA features for train-test
X_train_pca, X_test_pca, _, _ = train_test_split(X_pca, y, test_size=0.3, random_state=27, stratify=y)

Applying PCA to extract 10 principal components...
PCA explained variance ratio: 0.7937


In [21]:
# Train-Test Split with Hyperparameter Tuning + 10 PCA Components
print("Train-Test Split + Tuning + 10 PCA Components:")
grid_search_5a = GridSearchCV(SVC(random_state=27), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_5a.fit(X_train_pca, y_train)
y_pred_5a = grid_search_5a.predict(X_test_pca)
accuracy_5a_train_test = accuracy_score(y_test, y_pred_5a)
print(f"Train-Test Split Accuracy: {accuracy_5a_train_test:.4f}")

Train-Test Split + Tuning + 10 PCA Components:
Train-Test Split Accuracy: 0.8033


In [22]:
# 10-Fold CV with Hyperparameter Tuning + 10 PCA Components
print("10-Fold CV + Tuning + 10 PCA Components:")
best_svm_5b = grid_search_5a.best_estimator_
cv_scores_5b = cross_val_score(best_svm_5b, X_pca, y, cv=10)
accuracy_5b_cv = cv_scores_5b.mean()
std_5b_cv = cv_scores_5b.std()
print(f"10-Fold CV Accuracy: {accuracy_5b_cv:.4f} ± {std_5b_cv:.4f}")

10-Fold CV + Tuning + 10 PCA Components:
10-Fold CV Accuracy: 0.8127 ± 0.0525


In [28]:
# Create Complete SVM Summary Table
print("COMPLETE SVM SUMMARY TABLE")

svm_results = pd.DataFrame({
  'Step': [
    'Step 1: Basic SVM',
    'Step 2: Basic SVM',
    'Step 3a: Hyperparameter Tuning',
    'Step 3b: Hyperparameter Tuning',
    'Step 4a: Tuning + 10 Best Features',
    'Step 4b: Tuning + 10 Best Features',
    'Step 5a: Tuning + 10 PCA Components',
    'Step 5b: Tuning + 10 PCA Components'
  ],
  'Method': [
    'Train-Test Split (70/30)',
    '10-Fold Cross-Validation',
    'Train-Test Split (70/30)',
    '10-Fold Cross-Validation',
    'Train-Test Split (70/30)',
    '10-Fold Cross-Validation',
    'Train-Test Split (70/30)',
    '10-Fold Cross-Validation'
  ],
  'Accuracy': [
    f"{accuracy_1_train_test:.4f}",
    f"{accuracy_2_cv:.4f} ± {std_2_cv:.4f}",
    f"{accuracy_3a_train_test:.4f}",
    f"{accuracy_3b_cv:.4f} ± {std_3b_cv:.4f}",
    f"{accuracy_4a_train_test:.4f}",
    f"{accuracy_4b_cv:.4f} ± {std_4b_cv:.4f}",
    f"{accuracy_5a_train_test:.4f}",
    f"{accuracy_5b_cv:.4f} ± {std_5b_cv:.4f}"
  ]
})

print(svm_results.to_string(index=False))

# Find best performing methods
train_test_scores = [
  ("Step 1 (Basic)", accuracy_1_train_test),
  ("Step 3a (+ Tuning)", accuracy_3a_train_test),
  ("Step 4a (+ Features)", accuracy_4a_train_test),
  ("Step 5a (+ PCA)", accuracy_5a_train_test)
]

cv_scores = [
  ("Step 2 (Basic)", accuracy_2_cv),
  ("Step 3b (+ Tuning)", accuracy_3b_cv),
  ("Step 4b (+ Features)", accuracy_4b_cv),
  ("Step 5b (+ PCA)", accuracy_5b_cv)
]

best_train_test = max(train_test_scores, key=lambda x: x[1])
best_cv = max(cv_scores, key=lambda x: x[1])

print(f"\nBEST RESULTS:")
print(f"   Best Train-Test Split: {best_train_test[0]} = {best_train_test[1]:.4f}")
print(f"   Best Cross-Validation: {best_cv[0]} = {best_cv[1]:.4f}")

COMPLETE SVM SUMMARY TABLE
                               Step                   Method        Accuracy
                  Step 1: Basic SVM Train-Test Split (70/30)          0.7507
                  Step 2: Basic SVM 10-Fold Cross-Validation 0.7519 ± 0.0033
     Step 3a: Hyperparameter Tuning Train-Test Split (70/30)          0.7867
     Step 3b: Hyperparameter Tuning 10-Fold Cross-Validation 0.7943 ± 0.0480
 Step 4a: Tuning + 10 Best Features Train-Test Split (70/30)          0.7784
 Step 4b: Tuning + 10 Best Features 10-Fold Cross-Validation 0.8010 ± 0.0369
Step 5a: Tuning + 10 PCA Components Train-Test Split (70/30)          0.8033
Step 5b: Tuning + 10 PCA Components 10-Fold Cross-Validation 0.8127 ± 0.0525

BEST RESULTS:
   Best Train-Test Split: Step 5a (+ PCA) = 0.8033
   Best Cross-Validation: Step 5b (+ PCA) = 0.8127


### Step 5: Training - Other Models

In [29]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

# Define models with preprocessing pipelines
models = {
  "SGD": Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SGDClassifier(loss="log_loss", alpha=1e-4, max_iter=2000, random_state=42))
  ]),
  "Random Forest": Pipeline([
    ("clf", RandomForestClassifier(
      n_estimators=100, max_depth=None, n_jobs=-1, random_state=42
    ))
  ]),
  "MLP": Pipeline([
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(
      hidden_layer_sizes=(100, 50),
      activation="relu",
      solver="adam",
      max_iter=500,
      random_state=42
    ))
  ]),
  "SVM (Best)": Pipeline([
    ("scaler", StandardScaler()),
    ("clf", grid_search.best_estimator_)
  ])
}

# Set up cross-validation
cv = KFold(n_splits=10, shuffle=True)

In [30]:
# Train models and collect results
model_results = []

for name, pipeline in models.items():
  print(f"\nTraining {name}:")

  # Train-test split accuracy
  pipeline.fit(X_train, y_train)
  y_pred = pipeline.predict(X_test)
  train_test_acc = accuracy_score(y_test, y_pred)

  # 10-fold cross-validation accuracy
  cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
  cv_mean = cv_scores.mean()
  cv_std = cv_scores.std()

  model_results.append({
    "Model": name,
    "Train-Test Split": f"{train_test_acc:.4f}",
    "10-Fold Cross-Validation": f"{cv_mean:.4f} ± {cv_std:.4f}"
  })

  print(f"  Train-test accuracy: {train_test_acc:.4f}")
  print(f"  10-fold CV accuracy: {cv_mean:.4f} ± {cv_std:.4f}")


Training SGD:
  Train-test accuracy: 0.7895
  10-fold CV accuracy: 0.7877 ± 0.0299

Training Random Forest:
  Train-test accuracy: 0.8033
  10-fold CV accuracy: 0.8126 ± 0.0253

Training MLP:
  Train-test accuracy: 0.7867
  10-fold CV accuracy: 0.8185 ± 0.0295

Training SVM (Best):
  Train-test accuracy: 0.7756
  10-fold CV accuracy: 0.8201 ± 0.0281


In [31]:
# Create results table
results_df = pd.DataFrame(model_results)
print("\nMODEL COMPARISON TABLE")
print(results_df.to_string(index=False))


MODEL COMPARISON TABLE
        Model Train-Test Split 10-Fold Cross-Validation
          SGD           0.7895          0.7877 ± 0.0299
Random Forest           0.8033          0.8126 ± 0.0253
          MLP           0.7867          0.8185 ± 0.0295
   SVM (Best)           0.7756          0.8201 ± 0.0281
