In [None]:


# Load the dataset
file_path = "path/to/your/dataset.csv"
df = pd.read_csv(file_path)

# Separate features (X) and target variable (y)
X = df.drop('activity', axis=1)
y = df['activity']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (optional, but often recommended)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Encode the target variable (convert activity labels to numerical values)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the DecisionTreeClassifier with the best hyperparameters
best_classifier = DecisionTreeClassifier(random_state=42, **best_params)
best_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred = best_classifier.predict(X_test)

# Evaluate the model on the test data
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy after Hyperparameter Tuning: {test_accuracy:.2f}")
print("Classification Report (Test Data):")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix (Test Data):")
print(confusion_matrix(y_test, y_test_pred))


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the preprocessed data
X_train = pd.read_csv("../Data/X_train_cleaned.csv")
y_train = pd.read_csv("../Data/y_train_cleaned.csv")['activity']
# Load the preprocessed test data
X_test = pd.read_csv("../Data/X_test_cleaned.csv")
y_test = pd.read_csv("../Data/y_test_cleaned.csv")['activity']

In [3]:
# Train the Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)


NameError: name 'RandomForestClassifier' is not defined

In [4]:
# Make predictions on the training data
y_train_pred = classifier.predict(X_train)

In [5]:
# Evaluate the model on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

Training Accuracy: 1.00


In [6]:
print("Classification Report (Training Data):")
print(classification_report(y_train, y_train_pred))

Classification Report (Training Data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4053
           1       1.00      1.00      1.00      9060
           2       1.00      1.00      1.00     11296
           3       1.00      1.00      1.00      4782
           4       1.00      1.00      1.00      7469
           5       1.00      1.00      1.00      8996

    accuracy                           1.00     45656
   macro avg       1.00      1.00      1.00     45656
weighted avg       1.00      1.00      1.00     45656



In [7]:
print("Confusion Matrix (Training Data):")
print(confusion_matrix(y_train, y_train_pred))

Confusion Matrix (Training Data):
[[ 4053     0     0     0     0     0]
 [    0  9060     0     0     0     0]
 [    0     0 11296     0     0     0]
 [    0     0     0  4782     0     0]
 [    0     0     0     0  7469     0]
 [    0     0     0     0     0  8996]]


In [8]:
# Make predictions on the test data
y_test_pred = classifier.predict(X_test)

In [9]:
# Evaluate the model on the test data
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy:.2f}")


Test Accuracy: 0.91


In [10]:
print("Classification Report (Test Data):")
print(classification_report(y_test, y_test_pred))

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.85      0.66      0.74       945
           1       0.90      0.91      0.90      2266
           2       1.00      1.00      1.00      2821
           3       0.99      0.97      0.98      1216
           4       0.86      0.84      0.85      1887
           5       0.82      0.91      0.86      2279

    accuracy                           0.91     11414
   macro avg       0.90      0.88      0.89     11414
weighted avg       0.91      0.91      0.91     11414



In [11]:
print("Confusion Matrix (Test Data):")
print(confusion_matrix(y_test, y_test_pred))

Confusion Matrix (Test Data):
[[ 625   62    1    2  105  150]
 [  35 2067    0    3   58  103]
 [   1    0 2819    1    0    0]
 [   4    4    2 1174    9   23]
 [  44   79    1    3 1588  172]
 [  29   95    0    2   78 2075]]


# Hyper-Parameter Tuned Model

In [18]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [19]:
# Create the Random Forest classifier
classifier = RandomForestClassifier(random_state=42)


In [20]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy')

In [21]:
# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [22]:
# Train the Random Forest classifier with the best hyperparameters
best_classifier = RandomForestClassifier(random_state=42, **best_params)
best_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=42)

In [23]:
# Make predictions on the test data
y_test_pred = best_classifier.predict(X_test)


In [24]:
# Evaluate the model on the test data
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy after Hyperparameter Tuning: {test_accuracy:.2f}")


Test Accuracy after Hyperparameter Tuning: 0.91


In [25]:
print("Classification Report (Test Data):")
print(classification_report(y_test, y_test_pred))

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.85      0.66      0.74       945
           1       0.90      0.91      0.91      2266
           2       1.00      1.00      1.00      2821
           3       0.99      0.97      0.98      1216
           4       0.86      0.84      0.85      1887
           5       0.82      0.91      0.87      2279

    accuracy                           0.91     11414
   macro avg       0.90      0.88      0.89     11414
weighted avg       0.91      0.91      0.91     11414



In [26]:
print("Confusion Matrix (Test Data):")
print(confusion_matrix(y_test, y_test_pred))

Confusion Matrix (Test Data):
[[ 628   62    0    2  108  145]
 [  37 2067    0    3   54  105]
 [   1    0 2820    0    0    0]
 [   4    4    2 1175   11   20]
 [  43   77    0    4 1587  176]
 [  28   90    0    2   75 2084]]
