# Data Preprocessing


**Creation date:** 7/6/2022

**Author:** Farzaneh

**Modification history:** No modification

## Importing Libraries

In [9]:
import numpy as np #mathematical operations
import pandas as pd #data manipulation and analysis
import dp_functions as func #importing my data preprocessing functions
from dp_parameters import Options #importing my parameters
from matplotlib import pyplot as plt #data visualization library for 2D and 3D plots
import seaborn as sns #plotting statistical graphics
import os # operating system interfaces
import gzip

In [2]:
# Reading all users' data into one file.
# The merge script below is wrapped in a triple-quoted string, so it is not executed when this cell runs.
# NOTE(review): the script uses `opts` (opts.resultpath), but `opts` is only created in a later cell — confirm before re-enabling.
'''
uuid_list=func.uuid_list('user_data_file')
alldata=pd.DataFrame()
for i, uuid in enumerate(uuid_list):
    X = pd.read_csv('user_data_file/%s.features_labels.csv.gz' % uuid, compression='gzip')
    X['uuid']=uuid
    alldata=pd.concat([alldata,X],axis=0)
alldata.to_csv(opts.resultpath, index = False)    
'''

## Importing Datasets

In [10]:
# Build the parameter object, then load the dataset from the path it provides
opts = Options()
dataset = pd.read_csv(opts.filepath)

In [11]:
# Print an overview of the dataset (dimensions, features, labels, column dtypes)
func.inf_(dataset, opts)

────────────────────────────────────────
Dimensions of the dataset:
(377346, 279)
────────────────────────────────────────
Number of instance: 377346
                                        
────────────────────────────────────────
Features: 217
                                        
────────────────────────────────────────
Feature matrix : (377346 , 217)
                                        
────────────────────────────────────────
Binary labels: 51
                                        
────────────────────────────────────────
Binary label matrix : (377346 , 51)
                                        
────────────────────────────────────────
Categorical label matrix : (377346 , 0)
                                        
────────────────────────────────────────
Datatypes of the columns:
                                        
float64    276
int64        2
object       1
dtype: int64
────────────────────────────────────────
General info of the data:
                          

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source,uuid
0,1444079161,0.996815,0.003529,-0.002786,0.006496,0.995203,0.996825,0.998502,1.748756,6.684605,...,,0.0,,,,1.0,1.0,,2,00EABED2-271D-49D8-B599-1D4A09240601
1,1444079221,0.996864,0.004172,-0.00311,0.00705,0.994957,0.996981,0.998766,1.935573,6.684603,...,,0.0,,,,1.0,1.0,,2,00EABED2-271D-49D8-B599-1D4A09240601
2,1444079281,0.996825,0.003667,0.003094,0.006076,0.994797,0.996614,0.998704,2.03178,6.684605,...,,0.0,,,,1.0,1.0,,2,00EABED2-271D-49D8-B599-1D4A09240601
3,1444079341,0.996874,0.003541,0.000626,0.006059,0.99505,0.996907,0.99869,1.865318,6.684605,...,,0.0,,,,1.0,1.0,,2,00EABED2-271D-49D8-B599-1D4A09240601
4,1444079431,0.997371,0.037653,0.043389,0.102332,0.995548,0.99686,0.998205,0.460806,6.683904,...,,0.0,,,,1.0,1.0,,2,00EABED2-271D-49D8-B599-1D4A09240601


## Checking Summary Statistics of the Data

In [12]:
# Print the summary statistics of the dataset
func.stat(dataset)

────────────────────────────────────────
Summary Statistics for Numerical data:
────────────────────────────────────────
          timestamp  raw_acc:magnitude_stats:mean  \
count  3.773460e+05                 377056.000000   
mean   1.445839e+09                      1.002223   
std    5.907009e+06                      0.079623   
min    1.433537e+09                      0.018148   
25%    1.441438e+09                      0.992556   
50%    1.444974e+09                      1.001258   
75%    1.448696e+09                      1.012745   
max    1.464899e+09                      3.185837   

       raw_acc:magnitude_stats:std  raw_acc:magnitude_stats:moment3  \
count                377056.000000                    377056.000000   
mean                      0.038832                         0.037772   
std                       0.096109                         0.113198   
min                       0.000030                        -0.493806   
25%                       0.001709            

## Handling Duplicates

In [13]:
# Check the dataset for duplicates and keep the frame returned by the helper
dataset = func.duplicate(dataset)

******************** There are not any duplicates in data ********************


## Handling zero entry columns

In [14]:
# Drop all-zero columns before missing-value imputation;
# the helper returns the reduced frame together with the names of the removed columns
dataset, zero_col_nam = func.zer_col(dataset)

******************** The number of columns with all zero entries :  3 ********************
['discrete:battery_plugged:is_wireless'
 'discrete:battery_state:is_unknown'
 'discrete:wifi_status:is_reachable_via_wwan']
******************** New dataframe (no zero columns) dimension: (377346, 276) ********************


### Features and Labels


In [15]:
# Splitting features by type
data_no_label = func.features(dataset, opts)  # all-features dataframe
columns_no_label = list(data_no_label.columns)  # feature names
colnames_numerics = data_no_label.select_dtypes(include=np.number).columns.tolist()  # numeric feature names
colnames_binary_only = list(func.binary_feat(data_no_label))  # binary feature names
# Float features = numeric features that are not binary.
# Filter the ordered list instead of subtracting sets: iteration order of a set of
# strings changes between kernel sessions, so list(set(...) - set(...)) yielded a
# different column order on every run and made downstream output non-reproducible.
colnames_float_only = [name for name in colnames_numerics if name not in colnames_binary_only]

## Encoding the categorical labels

In [16]:
# Pick the label columns according to opts.label_type:
# anything other than 'binary' takes the categorical labels and encodes them
if opts.label_type != 'binary':
    label = dataset.loc[:, opts.category_label_variabels]
    label = func.label_converter(label, opts)
    print('Encoded labels:', label, sep='\n')
else:
    label = dataset.loc[:, opts.binary_label_variables]

## Missing values detection and imputation

In [17]:
# Detect and impute missing values in the binary label columns.
# Assumption from the paper: a label that was not reported (NaN) is a 'negative' example,
# so the label imputer (opts.miss_imputer_lab) is expected to fill missing labels with zero.
dataset.loc[:, opts.binary_label_variables] = func.miss_val(
    dataset.loc[:, opts.binary_label_variables],
    opts.miss_imputer_lab,
    opts,
)

Total # of missing values: 10713244
────────────────────────────────────────
# Of missing values for each column:
────────────────────────────────────────
label:LYING_DOWN                  73623
label:SITTING                     70752
label:FIX_walking                 70752
label:FIX_running                236476
label:BICYCLING                  242163
label:SLEEPING                    92078
label:LAB_WORK                   327951
label:IN_CLASS                   268283
label:IN_A_MEETING               146133
label:LOC_main_workplace         175619
label:OR_indoors                 180540
label:OR_outside                 225848
label:IN_A_CAR                   202650
label:ON_A_BUS                   208314
label:DRIVE_-_I_M_THE_DRIVER     215082
label:DRIVE_-_I_M_A_PASSENGER    246259
label:LOC_home                    22012
label:FIX_restaurant             219764
label:PHONE_IN_POCKET            242982
label:OR_exercise                136509
label:COOKING                    168000
label

In [18]:
# Detect and impute missing values in the feature columns,
# using the feature imputation method configured in opts.miss_imputer_feat
dataset.loc[:, columns_no_label] = func.miss_val(
    dataset.loc[:, columns_no_label],
    opts.miss_imputer_feat,
    opts,
)

Total # of missing values: 13338938
────────────────────────────────────────
# Of missing values for each column:
────────────────────────────────────────
raw_acc:magnitude_stats:mean               290
raw_acc:magnitude_stats:std                290
raw_acc:magnitude_stats:moment3            290
raw_acc:magnitude_stats:moment4            290
raw_acc:magnitude_stats:percentile25       290
                                         ...  
lf_measurements:proximity               156397
lf_measurements:relative_humidity       351407
lf_measurements:battery_level              319
lf_measurements:screen_brightness       156397
lf_measurements:temperature_ambient     350896
Length: 214, dtype: int64
  
******************** We need to impute missing values ********************
******************** Select an imputation method from dp_parameters.py ********************
******************** The missing values imputation method:     Filling in by Zero  ********************


In [19]:
# Imputation can leave columns that contain only zeros; drop those as well
# and remember which column names were removed
dataset, zero_col_nam = func.zer_col(dataset)

******************** The number of columns with all zero entries :  1 ********************
['lf_measurements:proximity']
******************** New dataframe (no zero columns) dimension: (377346, 275) ********************


In [20]:
# Updating feature names after missing-value imputation and removal of the new zero columns.
# errors='ignore' keeps this cell re-runnable (the columns are already gone on a second run)
# and tolerates names that are not in the feature frame: zer_col was run on the whole
# dataset, which also holds the label columns.
data_no_label = data_no_label.drop(columns=zero_col_nam.values, errors='ignore')
columns_no_label = list(data_no_label.columns)  # updated feature names
colnames_numerics = data_no_label.select_dtypes(include=np.number).columns.tolist()  # updated numeric feature names
colnames_binary_only = list(func.binary_feat(data_no_label))  # updated binary feature names
# Updated float feature names: numeric features that are not binary.
# Filter the ordered list instead of subtracting sets so the column order is the same
# on every run (iteration order of a set of strings changes between kernel sessions).
colnames_float_only = [name for name in colnames_numerics if name not in colnames_binary_only]

## Outliers detection and treatment


In [21]:
# Detection and treatment of outliers on the float features
no_outlier = func.outliers_IQR(dataset.loc[:, colnames_float_only], opts.quantile)
# no_outlier.index holds index *labels*, so rows must be selected with the label-based
# .loc; the positional .iloc only gave the same rows while the index was an untouched
# 0..n-1 range. Assumes outliers_IQR keeps the original index of the rows it retains — TODO confirm.
# .copy() makes final_dataset an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
final_dataset = dataset.loc[no_outlier.index].copy()

                     ******************** There are 108662 outliers in data ********************
max outlier value: 
audio_naive:mfcc2:mean                                   NaN
proc_gyro:magnitude_stats:moment4                  12.660385
proc_gyro:magnitude_spectrum:log_energy_band2            NaN
raw_magnet:3d:mean_y                             3139.164709
proc_gyro:3d:ro_xy                                       NaN
                                                    ...     
proc_gyro:magnitude_spectrum:log_energy_band1            NaN
raw_acc:3d:ro_xz                                         NaN
raw_magnet:3d:std_x                              2184.339347
raw_magnet:magnitude_stats:time_entropy                  NaN
raw_acc:magnitude_stats:percentile75                3.958338
Length: 190, dtype: float64
min outlier value: 
audio_naive:mfcc2:mean                                   NaN
proc_gyro:magnitude_stats:moment4                   5.864626
proc_gyro:magnitude_spectrum:log_energy_ba

## Scaling the Numerical Variables


In [22]:
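# Scaling specific columns
# NOTE: scaling is currently disabled (the call below is commented out), so final_dataset keeps its unscaled float features.
#final_dataset.loc[:,colnames_float_only]=func.scaling_method(final_dataset.loc[:,colnames_float_only],opts)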

## Evaluation

In [23]:
# Run the evaluation helper on the cleaned dataset
func.evaluat(final_dataset)

******************** There are  0  duplicates in cleaned data ********************
******************** There are  0  missing values in cleaned data ********************
******************** There are not any outliers in cleaned data ********************
******************** The numerical features have been scaled ********************


## Save to CSV file

In [None]:
# Write the cleaned dataset to the configured result path, without the index column
final_dataset.to_csv(opts.resultpath, index=False)
# Output file names used so far:
#   'Extrasensory_cleaned.csv'  -> zero imputation
#   'Extrasensory_cleaned2.csv' -> mean imputation