In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tsfresh

In [None]:
!pip install boruta

In [18]:
#import all libraries
import os
import pandas as pd
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Directory paths for fall and ADL sequences
fall_dir = '/content/drive/MyDrive/Capstone/AccelerometerData/Fall'
adl_dir = '/content/drive/MyDrive/Capstone/AccelerometerData/ADL'

# Function to load CSV files and return a list of dataframes
def load_csv_data(directory,classification):
    data = []
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory, file_name)
            # Specify column names explicitly
            df = pd.read_csv(file_path, header=None, names=['time', 'SV_total', 'Ax', 'Ay', 'Az'])
            df['time'] = pd.to_datetime(df['time'], unit='ms')
            df['classification'] = classification
            data.append(df)
    return data

# Load CSV files for fall and ADL sequences
fall_data = load_csv_data(fall_dir,0)
adl_data = load_csv_data(adl_dir,1)

combined_data = fall_data + adl_data

In [7]:
def add_sequence_id(df, sequence_id):
    df['sequence_id'] = sequence_id
    return df

time_series = [add_sequence_id(df, sequence_id=i) for i, df in enumerate(combined_data)]


In [None]:
time_series

In [None]:
def extract_tsfresh_features(ts_df):
    print(ts_df.head())
    features = extract_features(ts_df, column_id="sequence_id", column_sort="time")
    return features

features = pd.concat([extract_tsfresh_features(df) for df in time_series])

In [None]:
features

In [None]:
y = pd.Series([df['classification'][0] for df in time_series])

features.replace([np.inf, -np.inf], np.nan, inplace=True)

impute(features)
features_filtered = select_features(features, y)

In [None]:
features_filtered

In [13]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Calculate mutual information scores
mi_scores = mutual_info_classif(features_filtered, y)

# Select top 500 features based on mutual information scores
k_best = SelectKBest(mutual_info_classif, k=500)
selected_features = k_best.fit_transform(features_filtered, y)

# Get the indices of the selected features
selected_indices = k_best.get_support(indices=True)

# Filter the original features dataframe to keep only the selected features
selected_features_df = features_filtered.iloc[:, selected_indices]


In [None]:
selected_features_df

In [None]:
# Step 1: Remove Highly Correlated Features Using Pearson Correlation Coefficient
def remove_highly_correlated_features(features_df, threshold=0.85):
    corr_matrix = features_df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    features_df_filtered = features_df.drop(columns=to_drop)
    return features_df_filtered

def apply_boruta(features_df, y):
    # Initialize Random Forest classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    boruta_selector = BorutaPy(rf, n_estimators=100, verbose=2, random_state=42)
    boruta_selector.fit(features_df.values, y.values)
    selected_features = features_df.columns[boruta_selector.support_].tolist()
    return features_df[selected_features]


# Step 3: Selection of Top Features Using Feature Importance
def select_top_features(features_df, y, n_top_features=5):
    selector = SelectKBest(score_func=f_classif, k=n_top_features)
    selector.fit(features_df, y)
    selected_indices = selector.get_support(indices=True)
    selected_features = features_df.columns[selected_indices].tolist()
    return features_df[selected_features]

# Assuming features_df is your DataFrame containing all the features and y is your target variable

# Step 1: Remove Highly Correlated Features
features_df_filtered = remove_highly_correlated_features(selected_features_df)

# Step 2: Boruta Algorithm
#boruta_features = apply_boruta(features_df_filtered, y)

# Step 3: Selection of Top Features Using Feature Importance
top_features = select_top_features(features_df_filtered, y, n_top_features=10)


In [None]:
top_features

In [20]:
# X = top_features
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Step 3: Train an ML classifier
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_classifier.fit(X_train, y_train)

# # Step 4: Evaluate the classifier
# y_pred = rf_classifier.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Step 1: Prepare the data
# Assuming top_features contains the selected features and y is the target variable

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(top_features, y, test_size=0.2, random_state=42)

# Step 3: Train the classifier
# Choose either RandomForestClassifier or GaussianNB
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# classifier = GaussianNB()

classifier.fit(X_train, y_train)

# Step 4: Evaluate the classifier
# Option 1: Evaluate using accuracy on the testing set
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Option 2: Perform k-fold cross-validation to get average accuracy
# k = 3 (as mentioned)
cv_scores = cross_val_score(classifier, top_features, y, cv=3)
average_accuracy = cv_scores.mean()
print("Average accuracy (k-fold cross-validation):", average_accuracy)


Accuracy: 1.0
Average accuracy (k-fold cross-validation): 0.9710144927536232
