In [34]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [27]:
# Directory that is scanned for the input .csv files; the relative path is
# resolved against the notebook's working directory.
landmarks_path = "../output"

In [28]:
# Load every "<label>_*.csv" file found in landmarks_path and stack them into
# one DataFrame with an extra 'label' column taken from the filename.
dataframes = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore any seeded split made from `data`) identical across machines.
for file in sorted(os.listdir(landmarks_path)):
    if not file.endswith(".csv"):
        continue
    # Label is the filename stem up to the first underscore; stripping the
    # extension first keeps "digging.csv" from being labelled "digging.csv".
    label = os.path.splitext(file)[0].split("_")[0]
    df = pd.read_csv(os.path.join(landmarks_path, file))
    # Discard the first CSV column (presumably a saved row index -- confirm).
    df = df.drop(columns=df.columns[0])
    df['label'] = label  # Add label column
    dataframes.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory that was searched.
if not dataframes:
    raise FileNotFoundError(f"No .csv files found in {landmarks_path!r}")

# Combine all dataframes
data = pd.concat(dataframes, ignore_index=True)

# Hold out 10% of the rows of every class independently, so that the train and
# test frames each keep (approximately) the class proportions of `data`.
# NOTE(review): rows are split at random within a class. If rows are consecutive
# frames of the same recording, near-duplicate rows may land on both sides of
# the split and inflate the test accuracy -- confirm, and consider splitting by
# source file instead.
train_dataframes, test_dataframes = [], []
for label, class_rows in data.groupby('label'):
    class_features = class_rows.drop(columns=['label'])
    class_targets = class_rows['label']
    features_tr, features_te, targets_tr, targets_te = train_test_split(
        class_features,
        class_targets,
        test_size=0.1,
        random_state=42,
        stratify=class_targets,
    )
    # Re-attach the label column so each part is a complete labelled frame
    train_dataframes.append(pd.concat([features_tr, targets_tr], axis=1))
    test_dataframes.append(pd.concat([features_te, targets_te], axis=1))

# Stitch the per-class parts back together with a fresh 0..n-1 index
train_data = pd.concat(train_dataframes, ignore_index=True)
test_data = pd.concat(test_dataframes, ignore_index=True)
train_data.shape, test_data.shape

  data = pd.concat(dataframes, ignore_index=True)


((3224, 133), (359, 133))

In [29]:
# Peel the 'label' column off each split: y_* holds the targets, X_* keeps
# every remaining column as features.
y_train, y_test = train_data['label'], test_data['label']
X_train, X_test = (
    split.drop(columns=['label']) for split in (train_data, test_data)
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3224, 132), (3224,), (359, 132), (359,))

In [30]:
# Per-class row counts for the training labels, then for the test labels
tuple(labels.value_counts() for labels in (y_train, y_test))

(label
 watering    1737
 digging     1487
 Name: count, dtype: int64,
 label
 watering    193
 digging     166
 Name: count, dtype: int64)

In [31]:
# Display the training label Series as a quick visual check
y_train

0        digging
1        digging
2        digging
3        digging
4        digging
          ...   
3219    watering
3220    watering
3221    watering
3222    watering
3223    watering
Name: label, Length: 3224, dtype: object

In [None]:
# Fit each candidate classifier on the training split, print its accuracy on
# the held-out test split, and persist the fitted estimator to
# ../models/<name>.joblib.
classifiers = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'SVC': SVC(kernel='linear', random_state=42)
}
# joblib.dump does not create missing directories; make sure the target exists
# before training so a fresh checkout does not fail after the models are fitted.
os.makedirs("../models", exist_ok=True)
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    joblib.dump(clf, f"../models/{name}.joblib")

RandomForest Accuracy: 1.00
KNeighbors Accuracy: 1.00
SVC Accuracy: 0.98
