In [12]:
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
import joblib
import pandas as pd

In [13]:
# Filesystem layout, relative to the notebook's working directory.
landmarks_path = "../output"  # directory scanned for the "*augmented.csv" input files
model_path = "../models"  # root directory for saved models
pca_model_path = "../models/pca"  # PCA transformer and PCA-based classifiers are saved here

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(model_path, exist_ok=True)
os.makedirs(pca_model_path, exist_ok=True)

In [14]:
# Substrings matched against input filenames: any augmented CSV whose
# filename contains one of these is skipped when the data is loaded.
exclude_categories = [
    "planting",
    "harvesting",
]

In [15]:
# Load every "*augmented.csv" file from landmarks_path, label its rows from the
# filename, and build per-class 70/30 train/test splits.
dataframes = []
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded splits below) reproducible across machines.
for file in sorted(os.listdir(landmarks_path)):
    if not file.endswith("augmented.csv"):
        continue
    if any(exclude in file for exclude in exclude_categories):
        continue
    label = file.split("_")[0]  # label is the filename text before the first "_"
    df = pd.read_csv(os.path.join(landmarks_path, file))
    df['label'] = label  # Add label column
    # Drop the first column of the CSV
    # (presumably a saved index / frame id -- TODO confirm against the exporter).
    df = df.drop(df.columns[0], axis=1)
    dataframes.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when no file matched.
if not dataframes:
    raise ValueError(
        f"No '*augmented.csv' files found in {landmarks_path!r} "
        f"after excluding {exclude_categories}"
    )

# Combine all dataframes
data = pd.concat(dataframes, ignore_index=True)

# Split each class separately so every label contributes the same 70/30 ratio.
# Splitting the whole group frame keeps features and label together, so no
# re-concatenation is needed afterwards.
# NOTE(review): the inputs are "augmented" files; if augmentation created several
# variants per original sample, a row-level random split may place variants of
# the same sample in both train and test and inflate accuracy -- verify.
train_dataframes = []
test_dataframes = []
for label, group in data.groupby('label'):
    train_group, test_group = train_test_split(
        group, test_size=0.3, stratify=group['label'], random_state=1
    )
    train_dataframes.append(train_group)
    test_dataframes.append(test_group)

# Combine training and testing data
train_data = pd.concat(train_dataframes, ignore_index=True)
test_data = pd.concat(test_dataframes, ignore_index=True)
train_data.shape, test_data.shape

((81207, 133), (34805, 133))

In [16]:
def _features_and_labels(frame):
    """Return ``(X, y)`` arrays: every column except 'label', and the 'label' column."""
    return frame.drop(columns=['label']).values, frame['label'].values


# Turn the train/test frames into plain arrays for the estimators.
X_train, y_train = _features_and_labels(train_data)
X_test, y_test = _features_and_labels(test_data)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((81207, 132), (81207,), (34805, 132), (34805,))

In [17]:
# Fit PCA on the training data only, then apply the same fitted projection to
# the test data, so the components are learned without looking at the test set.
# A float in (0, 1) with svd_solver="full" asks PCA to keep as many components
# as needed to explain at least that fraction of the variance.
PCA_VARIANCE_RETAINED = 0.97
pca = PCA(n_components=PCA_VARIANCE_RETAINED, svd_solver="full")  # retain 97% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Original feature dim: {X_train.shape[1]}")
print(f"Reduced feature dim: {X_train_pca.shape[1]}")

Original feature dim: 132
Reduced feature dim: 19


In [18]:
# Train each classifier on the PCA-reduced features, report its accuracy on the
# PCA-reduced test set, and save the fitted model under pca_model_path.
classifiers = {
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=1)
}
for name, clf in classifiers.items():
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} PCA Accuracy: {accuracy:.2f}")
    # Build the path with os.path.join, matching how the PCA path is built below.
    joblib.dump(clf, os.path.join(pca_model_path, f"{name}_pca.joblib"))

# Save the fitted PCA as well: the classifiers above were trained on its output,
# so new data has to go through this same projection before prediction.
joblib.dump(pca, os.path.join(pca_model_path, "pca.joblib"))

MLPClassifier PCA Accuracy: 0.99


['../models/pca/pca.joblib']