# Using classic machine learning models for movement pattern recognition

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import seaborn as sns

import numpy as np

import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE

In [None]:
# Load the labels-and-coordinates table; the path is relative to the notebook's working directory
data = pd.read_csv('../../data/dataframes/labels_and_coordinates.csv')

# 1. Training models on all boulder data

## Preprocessing and split data

In [None]:
# Discard incomplete rows in place before deriving features and target
data.dropna(inplace=True)

# Target is the 'label' column; features are every column except 'frame' and 'label'
y = data['label']
X = data.drop(columns=['frame', 'label'])

# Replace each categorical column with integer codes. The single encoder is
# refit for every column, so afterwards it only holds the classes of the
# last column processed ('repetition').
encoder = LabelEncoder()
for column in ('boulder', 'camera', 'participant', 'repetition'):
    X[column] = encoder.fit_transform(X[column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Show the first rows of the training features
X_train.head()

## Models used: Logistic Regression, Decision Tree, KNN, Random Forest

In [None]:
# Initialize baseline models with default hyperparameters.
# The tree-based models are randomised, so they get the same fixed seed as the
# train/test split; otherwise every run prints slightly different reports.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Train and evaluate models: fit on the training split, predict the held-out
# split and print per-class precision / recall / F1.
# `name`, `model` and `y_pred` keep the values of the last model after the loop.
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Evaluating {name}...")
    print(classification_report(y_test, y_pred))



## Plotting evaluation

In [None]:
# Bar categories shared by every subplot
metric_labels = ["Accuracy", "Precision", "Recall", "F1-score"]

# Hard-coded scores per model, ordered like metric_labels
# (presumably transcribed from the reports printed above; re-check after re-running)
metrics = {
    "Logistic Regression": [0.50, 0.29, 0.50, 0.36],
    "Decision Tree": [0.92, 0.92, 0.92, 0.92],
    "KNN": [0.94, 0.94, 0.94, 0.94],
    "Random Forest": [0.97, 0.97, 0.97, 0.97]
}

# One shade of teal per metric bar
teal_colors = ['#008080', '#009090', '#00A0A0', '#00B0B0']

# One subplot per model, side by side
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))
for ax, (name, metric_values) in zip(axes, metrics.items()):
    ax.bar(metric_labels, metric_values, color=teal_colors)
    ax.set_title(name)
    ax.set_ylim(0, 1)  # all scores lie in [0, 1], so pin the y-axis to that range
    ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
# Echo the hard-coded scores, one model per line
for name in metrics:
    print(f"{name}: {metrics[name]}")

# 2. Training models with more preprocessing
- using only boulders W3 and W4
- dropping the labels "no_movement_of_interest", "before_start_position" and "start_position"

In [None]:
# -------- basic filtering of boulders and labels that are not of interest -----------
# NOTE: this rebinds `data`, so the cells of section 1 need the CSV to be
# reloaded before they can be re-run on all boulders.

# Labels that are excluded from training
excluded_labels = ['no_movement_of_interest', 'before_start_position', 'start_position']

# Keep only boulders W3 and W4 and drop rows with missing values.
# dropna() is used without inplace=True: the boolean filter returns a new
# frame, and modifying it in place can trigger pandas' SettingWithCopyWarning.
data = data[data['boulder'].isin(['W3', 'W4'])].dropna()

# Drop irrelevant labels; .copy() makes the result an independent frame
data = data[~data['label'].isin(excluded_labels)].copy()

In [None]:
# Collect the distinct boulders and labels that remain in `data`
unique_boulder_values = data['boulder'].unique()
unique_label_values = data['label'].unique()

# Report them, boulders first
print("Unique values in 'boulder' column:", unique_boulder_values)
print("Unique values in 'label' column:", unique_label_values)

In [None]:
#---------------------- some preprocessing -----------------------
# Separate features and target variable
X = data.drop(columns=['frame', 'label'])
y = data['label']

# Apply a rolling average (5 rows, shorter at the start) to smooth sensor data.
# The sensor columns are selected by position: everything from the 8th feature
# column onwards. Adjust the index if the CSV layout changes.
# NOTE(review): the window runs over consecutive rows of the whole frame and is
# applied before the train/test split, so it is not restricted to a single
# recording and smoothed training rows can include values from rows that end
# up in the test split. Confirm whether this is intended.
sensor_columns = X.columns[7:]
X[sensor_columns] = X[sensor_columns].rolling(window=5, min_periods=1).mean()

# Define categorical and numeric features
categorical_features = ['boulder', 'camera', 'participant', 'repetition']
numeric_features = [col for col in X.columns if col not in categorical_features]

# Preprocessor: one-hot encode categorical features and pass numeric features
# through unchanged. handle_unknown='ignore' encodes a category that was not
# seen during fit as all zeros instead of raising; the encoder is fitted on the
# training split only, so the test split may contain such categories.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Split the dataset into training and testing sets before SMOTE so that
# synthetic samples are generated from training rows only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessor on the training data and encode it
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Oversample the minority classes of the encoded training data with SMOTE.
# NOTE(review): SMOTE also interpolates the one-hot columns; imblearn's SMOTENC
# is designed for mixed numeric/categorical data. Consider switching.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_preprocessed, y_train)

In [None]:
# Initialize models.
# The tree-based models are randomised, so they get the same fixed seed as the
# split and SMOTE; otherwise every run prints slightly different reports.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Hyperparameter grids for an optional GridSearchCV run, keyed by model name.
# The 'classifier__' prefix addresses a pipeline step named 'classifier'.
# Defined as a real dict (it was a bare string literal before, which is echoed
# as cell output and cannot be used); defining it does not trigger any search.
param_grid = {
    "Logistic Regression": {'classifier__C': [0.01, 0.1, 1, 10, 100]},
    "Decision Tree": {'classifier__max_depth': [None, 10, 20, 30, 40, 50]},
    "KNN": {'classifier__n_neighbors': [3, 5, 7, 9]},
    "Random Forest": {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20, 30]}
}

In [None]:
# Encode the held-out features once with the preprocessor that was fitted on
# the training split. The result does not depend on the model, so it is
# computed before the loop instead of once per model.
X_test_preprocessed = preprocessor.transform(X_test)

# Train every model on the SMOTE-resampled training data and report
# per-class precision / recall / F1 on the untouched test split.
# `name`, `pipeline` and `y_pred` keep the values of the last model after the loop.
for name, model in models.items():
    # Single-step pipeline; the step name 'classifier' is what 'classifier__*'
    # grid-search parameter names refer to.
    pipeline = Pipeline(steps=[('classifier', model)])

    # To tune hyperparameters later, replace the fit/predict below with a grid
    # search over a parameter-grid dict keyed by model name:
    #   grid_search = GridSearchCV(pipeline, param_grid[name], cv=5, scoring='f1_weighted', n_jobs=-1)
    #   grid_search.fit(X_res, y_res)
    #   y_pred = grid_search.predict(X_test_preprocessed)

    print(f"Training {name}...")
    pipeline.fit(X_res, y_res)
    y_pred = pipeline.predict(X_test_preprocessed)

    print(f"Evaluating {name}...")
    print(classification_report(y_test, y_pred))

In [None]:
# Bar categories shared by every subplot
metric_labels = ["Accuracy", "Precision", "Recall", "F1-score"]

# Hard-coded scores per model, ordered like metric_labels
# (presumably transcribed from the reports printed above; re-check after re-running)
metrics = {
    "Logistic Regression": [0.83, 0.79, 0.89, 0.83],
    "Decision Tree": [0.93, 0.93, 0.93, 0.93],
    "KNN": [0.94, 0.94, 0.98, 0.95],
    "Random Forest": [0.98, 0.97, 0.96, 0.97]
}

# One shade of teal per metric bar
teal_colors = ['#008080', '#009090', '#00A0A0', '#00B0B0']

# One subplot per model, side by side
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))
for ax, (name, metric_values) in zip(axes, metrics.items()):
    ax.bar(metric_labels, metric_values, color=teal_colors)
    ax.set_title(name)
    ax.set_ylim(0, 1)  # all scores lie in [0, 1], so pin the y-axis to that range
    ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Generate and display the confusion matrix.
# `y_pred` and `name` are whatever earlier cells left behind: when the notebook
# is run top to bottom that is the last model evaluated and the last model name
# iterated over.

# Use one explicit, sorted label list for both the matrix and the tick labels.
# confusion_matrix orders rows/columns by the labels found in y_test and
# y_pred together; taking tick labels from y_test alone can mislabel the axes
# when a class appears in only one of the two.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix for {name}')
plt.xlabel('Predicted')  # columns are predicted labels
plt.ylabel('Actual')     # rows are true labels
plt.show()

In [None]:
# Tally how many rows of `data` carry each label
label_counts = data["label"].value_counts()

# Print the tally
print(label_counts)