In [11]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

# Function to preprocess pose data
def preprocess_pose_data(file_path, num_bins=40, target_rows=1000):
    col = [f'{i}' for i in range(0, 79)]
    pose_data = pd.read_csv(file_path, names=col, header=None)


    #pose_data.drop(['30', '31', '32', ...], axis=1, inplace=True)

    pose_data.fillna(0, inplace=True)

    rows_to_add = target_rows - len(pose_data)

    if rows_to_add > 0:
        n_iter = math.ceil(rows_to_add / len(pose_data))
        for _ in range(n_iter):
            additional_data = pose_data.iloc[0:rows_to_add, :]
            pose_data = pd.concat([pose_data, additional_data], ignore_index=True)

    elif rows_to_add < 0:
        pose_data = pose_data.iloc[0:target_rows, :]

    histogram_vector = np.array([])

    for joint in pose_data.columns:
        hist, _ = np.histogram(pose_data[joint], bins=num_bins, density=True)
        histogram_vector = np.concatenate((histogram_vector, hist))

    histogram_vector = histogram_vector.reshape(-1)

    return histogram_vector

label_encoder = LabelEncoder()

# Function to load and preprocess training data
def load_and_preprocess_train_data(folder_path):
    data = []
    labels = []

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file_path.endswith('.csv'):
                histogram_vector = preprocess_pose_data(file_path)
                data.append(histogram_vector)

                # Extract label from the filename
                label = os.path.basename(file).split('_')[1]
                labels.append(label)

    X_train = np.array(data)
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(labels)

    return X_train, y_train

# Function to load and preprocess test data with labels
def load_and_preprocess_test_data(folder_path):
    data = []
    labels = []

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file_path.endswith('.csv'):
                histogram_vector = preprocess_pose_data(file_path)
                data.append(histogram_vector)

                # Extract label from the filename for the test set
                file_basename = os.path.basename(file)
                split_result = file_basename.split('_')

                # Check if the split operation produced at least two elements
                if len(split_result) >= 2:
                    label = split_result[1]
                    labels.append(label)
                else:
                    # If there is no underscore, use the entire filename as the label
                    label = os.path.splitext(file_basename)[0]
                    labels.append(label)

    X_test = np.array(data)
    return X_test, labels

# Assuming your training data is in the folder '/content/learning-of-structured-data-fhws-ws2324 (1)/train/train'
folder_path_train = 'D:/MAI/Sem 2/LST/Portfolio 3/train'
X_train, y_train = load_and_preprocess_train_data(folder_path_train)

# Assuming your test data is in the folder '/content/learning-of-structured-data-fhws-ws2324 (1)/test/test'
folder_path_test = 'D:/MAI/Sem 2/LST/Portfolio 3/test'
X_test, test_labels = load_and_preprocess_test_data(folder_path_test)

# Fit the label_encoder on both training and test set labels
all_labels = np.concatenate([y_train, test_labels])
label_encoder.fit(all_labels)

# Print shapes and values for debugging
print("Shapes - X_train:", X_train.shape, " y_train:", y_train.shape)
print("Unique labels:", np.unique(y_train))
print("Shape of X_test:", X_test.shape)



Shapes - X_train: (1167, 3160)  y_train: (1167,)
Unique labels: [0 1 2 3 4]
Shape of X_test: (305, 3160)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=1500, random_state=42)

# Train the classifier on the entire training set
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = rf_classifier.predict(X_test)

# Assuming test_labels is the original class labels for your test data
# # Convert the predictions back to original class labels
y_test_pred_original = label_encoder.inverse_transform(y_test_pred)


X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

y_val_pred = rf_classifier.predict(X_val_split)

# Print the classification report for the validation split
print(classification_report(y_val_split, y_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00        56
           2       1.00      1.00      1.00        55
           3       1.00      1.00      1.00        44
           4       1.00      1.00      1.00        42

    accuracy                           1.00       234
   macro avg       1.00      1.00      1.00       234
weighted avg       1.00      1.00      1.00       234



In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=3, scoring='accuracy')

# Print cross-validation scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.94858612 0.92802057 0.89717224]
Mean cross-validation score: 0.9245929734361611


In [None]:
submission_df = pd.DataFrame({'ID': test_labels, 'action': y_test_pred_original})
submission_df.to_csv('/content/learning-of-structured-data-fhws-ws2324 (1)/submission.csv', index=False)