In [1]:
import warnings
# Set the warnings to be ignored
warnings.filterwarnings('ignore')

import os
import sys
import logging
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport
import seaborn as sns
import pickle

In [9]:
import os

def output_filler(df, path):
    """Label every row of ``df`` with the folder name that contains ``path``.

    ``path`` is normalised for the current OS and split on ``os.path.sep``;
    the component immediately before the file name is written to an
    "Output" column. ``df`` is modified in place and the same object is
    returned.
    """
    segments = os.path.normpath(path).split(os.path.sep)

    # segments[-1] is the file name, segments[-2] is its parent folder
    df["Output"] = segments[-2]

    return df


In [10]:
# Demonstration of output_filler on a single raw file: the file is read
# tab-separated and the labelled frame is shown as the cell output.
path = "D:/Data-Science-D-drive/Datasets-D-drive/sensor_placement/bending1/dataset1.csv"
df = pd.read_csv(path, sep="\t")
output_filler(df, path)

Unnamed: 0,# Task: bending1,Output
0,# Frequency (Hz): 20,bending1
1,# Clock (millisecond): 250,bending1
2,# Duration (seconds): 120,bending1
3,"# Columns: time,avg_rss12,var_rss12,avg_rss13,...",bending1
4,"0,39.25,0.43,22.75,0.43,33.75,1.30",bending1
...,...,...
479,"118750,43.33,0.47,25.00,0.00,30.00,0.00",bending1
480,"119000,43.50,0.50,25.50,0.50,30.00,0.00",bending1
481,"119250,43.50,0.50,24.75,0.43,30.00,0.00",bending1
482,"119500,43.50,0.50,24.33,0.47,30.00,0.00",bending1


# Filling Output Column and Creating New DataFrames

In [60]:
# Configure the logging module.
# Detach handlers left over from earlier cells so basicConfig() below takes effect.
# Loop over a copy of the list: removing entries from the live list while
# iterating over it skips handlers.
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
    root_logger.removeHandler(handler)
    handler.close()  # release the previous log file handle

# Configure the logging module with a time format
log_filename = "error.log"
log_level = logging.DEBUG
logging.basicConfig(filename=log_filename, level=log_level, format='%(asctime)s [%(levelname)s] %(message)s')  # Save errors+info to a file

# Paths of the per-file outputs (<sub-folder>/output/new_<name>.csv) built below
op_filepath_list = []

try:
    logging.info("Reading Main Folder")
    # Raw string so the backslashes of the Windows path are never read as escape sequences
    main_folder_path = r"D:\Data-Science-D-drive\Datasets-D-drive\sensor_placement"
    main_folder_path = os.path.normpath(main_folder_path)

    try:
        ##### LISTING AND STORING SUB-FOLDERS CONTENTS PRESENT IN MAIN FOLDER PATH I.E. BENDING1, BENDING2, SITTING, ETC ... #####
        contents = os.listdir(main_folder_path)

    except Exception as e:
        logging.error(f"Error listing contents in {main_folder_path}: {e}")

    else:
        for paths in contents:
            ##### STORING SUB-FOLDERS PATHS PRESENT IN MAIN FOLDER PATH #####
            sub_folder_path = os.path.join(main_folder_path, paths)

            # The main folder also holds stray files (e.g. PDFs); only folders carry datasets
            if not os.path.isdir(sub_folder_path):
                logging.info(f"Skipping non-folder entry {sub_folder_path}")
                continue

            try:
                #### LISTING CONTENTS PRESENT IN SUB-FOLDERS PATH I.E. DATASET1, DATASET2, DATASET3, ETC... ####
                sub_folder_contents = os.listdir(sub_folder_path)

                # CREATING "OUTPUT" DIRECTORY IN EACH SUB-FOLDERS TO STORE OUTPUT CSV #
                directory_name = "output"
                output_path = os.path.join(sub_folder_path, directory_name)
                os.makedirs(output_path, exist_ok=True)

            except Exception as e:
                logging.error(f"Error listing contents in {sub_folder_path}: {e}")
                continue

            #### STORING FILE'S PATH PRESENT IN SUB-FOLDERS I.E. DATASET1, DATASET2, DATASET3, ETC... ####
            # Keep regular files only, so the "output" directory created above is
            # not handed to read_csv when the cell is re-run.
            path_list = [
                file_path
                for file_path in (os.path.join(sub_folder_path, item) for item in sub_folder_contents)
                if os.path.isfile(file_path)
            ]

            ### READING FILE PATHS IN SUB-FOLDERS ###
            for path in path_list:
                logging.info("="*100)
                try:
                    # The first four lines of each raw file are metadata; the fifth holds the column names
                    csv_content = pd.read_csv(path, sep=",", skiprows=4, header=0)
                    csv_content = csv_content.reset_index(drop=True)
                    csv_content = output_filler(csv_content, path)

                    logging.info(f"Processing {path}")
                    logging.info(csv_content.head(2).to_string(index=False))

                except Exception as e:
                    logging.error(f"Error Reading contents in {path}: {e}")
                    continue

                ### STORING NEW FILE'S CREATED IN OUTPUT FOLDER I.E. NEW_DATASET1, NEW_DATASET2, NEW_DATASET3, ETC... ####
                try:
                    # splitext strips only the extension, so names containing extra dots stay distinct
                    file_name = os.path.splitext(os.path.basename(path))[0]

                    logging.info("Creating Output File Path")
                    output_file_path = os.path.join(output_path, f"new_{file_name}.csv")
                    op_filepath_list.append(output_file_path)
                    # NOTE(review): saving is disabled here, yet the concatenation
                    # section below reads these files from disk; re-enable to regenerate them.
                    #csv_content.to_csv(output_file_path)

                except Exception as e:
                    # Report the source path: file_name may be unbound if the failure happened early
                    logging.error(f"Error creating Output File Path for {path}: {e}")

except Exception as e:
    # Keep the traceback in the log file and still surface the message in the notebook
    logging.exception("Unexpected error while building the output files")
    print(e)


# Concatenating New DataFrames

In [44]:
# Remove or disable existing logging handlers so basicConfig() below takes effect.
# Loop over a copy of the list: removing entries from the live list while
# iterating over it skips handlers.
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
    root_logger.removeHandler(handler)
    handler.close()  # release the previous log file handle

# Configure the logging module with a time format
log_filename = "concatenation.log"
log_level = logging.DEBUG
logging.basicConfig(filename=log_filename, level=log_level, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)s] %(message)s")

# Raw string so the backslashes of the Windows path are never read as escape sequences
main_folder_path = os.path.normpath(r"D:\Data-Science-D-drive\Datasets-D-drive\sensor_placement")
logging.info(f"Reading Main Folder: {main_folder_path.split(os.path.sep)[-1].upper()}")

# LISTING AND STORING SUB-FOLDERS CONTENTS PRESENT IN MAIN FOLDER PATH I.E. BENDING1, BENDING2, SITTING, ETC ...
try:
    main_folder_contents = os.listdir(main_folder_path)

except Exception as e:
    logging.error(f"Error listing MAIN_FOLDER_CONTENTS of MAIN PATH {main_folder_path}: {e}")
    main_folder_contents = []  # nothing to process; the loop below is skipped

# STORING SUB-FOLDERS PATHS PRESENT IN MAIN FOLDER PATH
sub_folder_path_list = [os.path.join(main_folder_path, item) for item in main_folder_contents]

# LIST ALL FILES AND DIRECTORIES IN THE SUB_FOLDER
for sub_path in sub_folder_path_list:
    print(sub_path.split(os.path.sep)[-1].upper())
    logging.info(f"Listing FILE_CONTENTS of SUB_FOLDER_PATH: {sub_path.split(os.path.sep)[-1].upper()}")
    logging.info("="*100)

    try:
        file_contents = os.listdir(sub_path)

    except Exception as e:
        # Stray files in the main folder (e.g. PDFs) end up here
        logging.error(f"Error Listing FILE_CONTENTS of SUB_FOLDER_PATH {sub_path}: {e}")
        continue

    logging.info("Storing FILE_CONTENTS in SUB_FOLDER_PATH")
    files_path_list = [os.path.join(sub_path, item) for item in file_contents]

    # Selecting output folder in sub_folder_files.
    # Match the folder by name instead of a substring of the whole path, and
    # resolve it afresh for every sub-folder so a missing output folder can
    # never reuse the previous sub-folder's files.
    logging.info("Selecting output folder in sub_folder_files")
    output_path = next(
        (f_path for f_path in files_path_list
         if os.path.basename(f_path) == "output" and os.path.isdir(f_path)),
        None,
    )
    if output_path is None:
        logging.error(f"No output folder found in SUB_FOLDER_PATH {sub_path}")
        continue

    try:
        logging.info("Storing OUTPUT_CONTENTS in OUTPUT_PATH")
        output_contents = os.listdir(output_path)

    except Exception as e:
        logging.error(f"Error Storing OUTPUT_CONTENTS in OUTPUT_PATH {output_path}: {e}")
        continue

    # Store the paths of the per-file CSVs. A final_df.csv left by an earlier
    # run is excluded so it is not concatenated into itself; sorting makes the
    # row order independent of the directory listing order.
    logging.info("Storing OUTPUT_FILE_PATH in OP_FILEPATH_LIST")
    op_filepath_list = [
        os.path.join(output_path, item)
        for item in sorted(output_contents)
        if item.endswith(".csv") and item != "final_df.csv"
    ]

    try:
        logging.info("Concatenating OUTPUT_FILEPATH of OP_FILEPATH_LIST")

        # Collect the frames and concatenate once instead of growing a frame inside the loop
        frames = []
        for output_filepath in op_filepath_list:
            df = pd.read_csv(output_filepath)
            df = df.reset_index(drop=True)
            frames.append(df)

            output_fname = os.path.basename(output_filepath)
            print(f"{output_fname} Shape = {df.shape}")

        final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        final_df_path = os.path.join(output_path, "final_df.csv")

        # Drop the first column by label (presumably the index column written
        # when the per-file CSVs were saved — TODO confirm)
        if final_df.shape[1] > 0:
            final_df = final_df.drop(columns=final_df.columns[0])

        # NOTE(review): saving is disabled here, yet the next section reads
        # final_df.csv from disk; re-enable to regenerate it.
        #final_df.to_csv(final_df_path)

    except Exception as e:
        logging.error(f"Error Concatenating OUTPUT_FILEPATH of OP_FILEPATH_LIST {output_path}: {e}")

BENDING1
new_dataset1.csv Shape = (480, 9)
new_dataset2.csv Shape = (480, 9)
new_dataset3.csv Shape = (480, 9)
new_dataset4.csv Shape = (480, 9)
new_dataset5.csv Shape = (480, 9)
new_dataset6.csv Shape = (480, 9)
new_dataset7.csv Shape = (480, 9)
BENDING2
new_dataset1.csv Shape = (480, 9)
new_dataset2.csv Shape = (480, 9)
new_dataset3.csv Shape = (480, 9)
new_dataset4.csv Shape = (480, 9)
new_dataset5.csv Shape = (480, 9)
new_dataset6.csv Shape = (480, 9)
BENDINGTYPE.PDF
CYCLING
new_dataset1.csv Shape = (480, 9)
new_dataset10.csv Shape = (480, 9)
new_dataset11.csv Shape = (480, 9)
new_dataset12.csv Shape = (480, 9)
new_dataset13.csv Shape = (480, 9)
new_dataset15.csv Shape = (480, 9)
new_dataset2.csv Shape = (480, 9)
new_dataset3.csv Shape = (480, 9)
new_dataset4.csv Shape = (480, 9)
new_dataset5.csv Shape = (480, 9)
new_dataset6.csv Shape = (480, 9)
new_dataset7.csv Shape = (480, 9)
new_dataset8.csv Shape = (480, 9)
LYING
new_dataset1.csv Shape = (480, 9)
new_dataset10.csv Shape = (48

# Concatenating Sub-Folders Final Datasets:

In [21]:
# REMOVE OR DISABLE EXISTING LOGGING HANDLERS so basicConfig() below takes effect.
# Loop over a copy of the list: removing entries from the live list while
# iterating over it skips handlers.
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
    root_logger.removeHandler(handler)
    handler.close()  # release the previous log file handle

# CONFIGURE THE LOGGING MODULE WITH A TIME FORMAT:
log_filename = "concatenation_final_df.log"
log_level = logging.DEBUG
logging.basicConfig(filename=log_filename, level=log_level, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)s] %(message)s")

# Raw string so the backslashes of the Windows path are never read as escape sequences
main_folder_path = os.path.normpath(r"D:\Data-Science-D-drive\Datasets-D-drive\sensor_placement")
logging.info(f"Reading Main Folder: {main_folder_path.split(os.path.sep)[-1].upper()}")

# LISTING AND STORING SUB-FOLDERS CONTENTS PRESENT IN MAIN FOLDER PATH I.E. BENDING1, BENDING2, SITTING, ETC ...
try:
    main_folder_contents = os.listdir(main_folder_path)

except Exception as e:
    logging.error(f"Error listing MAIN_FOLDER_CONTENTS of MAIN PATH {main_folder_path}: {e}")
    main_folder_contents = []  # nothing to process; the loop below is skipped

# STORING SUB-FOLDERS PATHS PRESENT IN MAIN FOLDER PATH
sub_folder_path_list = [os.path.join(main_folder_path, item) for item in main_folder_contents]

# LIST ALL FILES AND DIRECTORIES IN THE SUB_FOLDER
output_df = pd.DataFrame()
final_frames = []  # one frame per sub-folder final_df.csv, concatenated once after the loop

for sub_path in sub_folder_path_list:
    print(sub_path.split(os.path.sep)[-1].upper())
    logging.info(f"Listing FILE_CONTENTS of SUB_FOLDER_PATH: {sub_path.split(os.path.sep)[-1].upper()}")
    logging.info("="*100)

    try:
        file_contents = os.listdir(sub_path)

    except Exception as e:
        # Stray files in the main folder (e.g. PDFs) end up here
        logging.error(f"Error Listing FILE_CONTENTS of SUB_FOLDER_PATH {sub_path}: {e}")
        continue

    logging.info("Storing FILE_CONTENTS in SUB_FOLDER_PATH")
    files_path_list = [os.path.join(sub_path, item) for item in file_contents]

    # Selecting output folder in sub_folder_files.
    # Match the folder by name instead of a substring of the whole path, and
    # resolve it afresh for every sub-folder so a missing output folder can
    # never re-add the previous sub-folder's final_df.csv.
    logging.info("Selecting output folder in sub_folder_files")
    output_path = next(
        (f_path for f_path in files_path_list
         if os.path.basename(f_path) == "output" and os.path.isdir(f_path)),
        None,
    )
    if output_path is None:
        logging.error(f"No output folder found in SUB_FOLDER_PATH {sub_path}")
        continue

    try:
        logging.info("Storing OUTPUT_CONTENTS in OUTPUT_PATH")
        output_contents = os.listdir(output_path)

    except Exception as e:
        logging.error(f"Error Storing OUTPUT_CONTENTS in OUTPUT_PATH {output_path}: {e}")
        continue

    logging.info("Storing OUTPUT_FILE_PATH in OP_FILEPATH_LIST")
    op_filepath_list = [os.path.join(output_path, item) for item in output_contents]

    logging.info("Selecting and Storing final_df.csv from OP_FILEPATH_LIST")
    for output_filepath in op_filepath_list:
        # Test the file name only, not the whole path
        output_fname = os.path.basename(output_filepath)
        if "final_df" not in output_fname:
            continue

        try:
            df = pd.read_csv(output_filepath)
            df = df.reset_index(drop=True)

        except Exception as e:
            logging.error(f"Error Concatenating OUTPUT_FILEPATH of OP_FILEPATH_LIST {output_filepath}: {e}")
            continue

        final_frames.append(df)
        print(f"{output_fname} Shape = {df.shape}")

try:
    logging.info("Creating a path + Saving Output DF")
    if final_frames:
        output_df = pd.concat(final_frames, ignore_index=True)
    output_df_path = os.path.join(main_folder_path, "output_df.csv")

    # Drop the first column by label (presumably the index column written
    # when each final_df.csv was saved — TODO confirm)
    if output_df.shape[1] > 0:
        output_df = output_df.drop(columns=output_df.columns[0])

    # NOTE(review): saving is disabled here, yet the EDA section reads
    # output_df.csv from disk; re-enable to regenerate it.
    #output_df.to_csv(output_df_path)

except Exception as e:
    # f-string prefix added so the actual exception text is logged
    logging.error(f"Error while saving Output DF: {e}")

BENDING1
final_df.csv Shape = (3360, 9)
BENDING2
final_df.csv Shape = (2880, 9)
BENDINGTYPE.PDF
CYCLING
final_df.csv Shape = (6240, 9)
LYING
final_df.csv Shape = (7200, 9)
SENSORSPLACEMENT.PDF
SITTING
final_df.csv Shape = (6719, 9)
STANDING
final_df.csv Shape = (7200, 9)
WALKING
final_df.csv Shape = (7200, 9)


# Verifying Output DF's Classes:

In [20]:
# Show the distinct values of the "Output" column in the concatenated frame.
output_df["Output"].unique()

array(['bending1', 'bending2', 'cycling', 'lying', 'sitting', 'standing',
       'walking'], dtype=object)

# EDA:

In [2]:
# Load the concatenated dataset. Raw string so the backslashes of the Windows
# path (\D, \s, \o) are never read as escape sequences; the path value is unchanged.
df1 = pd.read_csv(r"D:\Data-Science-D-drive\Datasets-D-drive\sensor_placement\output_df.csv")

In [3]:
# Show the distinct values of the "Output" column in the frame loaded from disk.
df1["Output"].unique()

array(['bending1', 'bending2', 'cycling', 'lying', 'sitting', 'standing',
       'walking'], dtype=object)

In [4]:
# Drop the "Unnamed: 0" column (presumably the index written when the CSV was
# saved — TODO confirm). Reassigning with errors="ignore" instead of
# inplace=True keeps the cell idempotent: re-running it no longer raises KeyError.
df1 = df1.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Display df1 as the cell output.
df1

Unnamed: 0,# Columns: time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
0,0,39.25,0.43,22.75,0.43,33.75,1.30,bending1
1,250,39.25,0.43,23.00,0.00,33.00,0.00,bending1
2,500,39.25,0.43,23.25,0.43,33.00,0.00,bending1
3,750,39.50,0.50,23.00,0.71,33.00,0.00,bending1
4,1000,39.50,0.50,24.00,0.00,33.00,0.00,bending1
...,...,...,...,...,...,...,...,...
40794,118750,31.50,1.66,12.50,3.20,14.25,4.44,walking
40795,119000,27.33,1.25,11.33,0.94,20.00,4.00,walking
40796,119250,37.80,7.68,14.20,2.48,17.25,0.83,walking
40797,119500,33.75,1.30,15.75,5.21,16.50,2.69,walking


In [6]:
# Check the Python type of df1.
type(df1)

pandas.core.frame.DataFrame

In [12]:
# Build a ydata-profiling report for df1.
profile = ProfileReport(df1)

# Save the report to an HTML file
profile.to_file("output_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset: 100%|██████████| 54/54 [00:10<00:00,  5.35it/s, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.65s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.76s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 100.03it/s]


In [13]:
# Display the profiling report as the cell output.
profile



In [47]:
# List the column labels of df1.
df1.columns

Index(['# Columns: time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13',
       'avg_rss23', 'var_rss23', 'Output'],
      dtype='object')

In [48]:
col_list = list(df1.columns)

# Fill missing values in the numeric columns with each column's median.
# The assignment writes the result back into df1: a chained
# `df1[col].fillna(..., inplace=True)` acts on a temporary Series and does not
# update df1 under pandas copy-on-write. Non-numeric columns (e.g. "Output")
# are skipped because they have no median.
numeric_cols = df1.select_dtypes(include="number").columns
df1[numeric_cols] = df1[numeric_cols].fillna(df1[numeric_cols].median())

In [49]:
# Rebuild the profiling report from df1 after the missing-value fill in the previous cell.
profile = ProfileReport(df1)

In [50]:
# Display the rebuilt profiling report as the cell output.
profile

Summarize dataset: 100%|██████████| 53/53 [01:57<00:00,  2.22s/it, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.42s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]


