In [1]:
import warnings
# Set the warnings to be ignored
warnings.filterwarnings('ignore')

import os
import sys
import logging
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport
import seaborn as sns
import pickle

# EDA:

In [90]:
df = pd.read_csv("D:\Data-Science-D-drive\Datasets-D-drive\sensor_placement\output_df.csv")

In [91]:
df["Output"].unique()

array(['bending1', 'bending2', 'cycling', 'lying', 'sitting', 'standing',
       'walking'], dtype=object)

In [92]:
df.head()

Unnamed: 0.1,Unnamed: 0,# Columns: time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
0,0,0,39.25,0.43,22.75,0.43,33.75,1.3,bending1
1,1,250,39.25,0.43,23.0,0.0,33.0,0.0,bending1
2,2,500,39.25,0.43,23.25,0.43,33.0,0.0,bending1
3,3,750,39.5,0.5,23.0,0.71,33.0,0.0,bending1
4,4,1000,39.5,0.5,24.0,0.0,33.0,0.0,bending1


In [93]:
df.rename(columns={"# Columns: time":"Time"},inplace=True)

In [94]:
df1 = df.copy()

In [95]:
df1.head()

Unnamed: 0.1,Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
0,0,0,39.25,0.43,22.75,0.43,33.75,1.3,bending1
1,1,250,39.25,0.43,23.0,0.0,33.0,0.0,bending1
2,2,500,39.25,0.43,23.25,0.43,33.0,0.0,bending1
3,3,750,39.5,0.5,23.0,0.71,33.0,0.0,bending1
4,4,1000,39.5,0.5,24.0,0.0,33.0,0.0,bending1


In [96]:
df1.drop(columns="Unnamed: 0",inplace=True)

In [97]:
df1.head()

Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
0,0,39.25,0.43,22.75,0.43,33.75,1.3,bending1
1,250,39.25,0.43,23.0,0.0,33.0,0.0,bending1
2,500,39.25,0.43,23.25,0.43,33.0,0.0,bending1
3,750,39.5,0.5,23.0,0.71,33.0,0.0,bending1
4,1000,39.5,0.5,24.0,0.0,33.0,0.0,bending1


In [98]:
df1.isnull().sum()

Time           0
avg_rss12    480
var_rss12    480
avg_rss13    480
var_rss13    480
avg_rss23    480
var_rss23    480
Output         0
dtype: int64

In [100]:
test = df1[~df1.Time.str.isnumeric()]

In [104]:
test

Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
4800,0 32.50 0.50 0.00 0.00 19.00 1.00,,,,,,,bending2
4801,250 32.50 0.50 0.00 0.00 18.50 0.50,,,,,,,bending2
4802,500 32.75 0.43 1.00 0.00 18.00 0.00,,,,,,,bending2
4803,750 32.50 0.50 0.00 0.00 17.50 0.50,,,,,,,bending2
4804,1000 32.50 0.50 7.50 0.50 17.50 0.87,,,,,,,bending2
...,...,...,...,...,...,...,...,...
5275,118750 28.67 0.47 4.67 1.25 17.33 0.47,,,,,,,bending2
5276,119000 27.50 0.50 5.50 2.50 17.25 1.30,,,,,,,bending2
5277,119250 28.00 0.00 6.67 0.94 17.00 1.00,,,,,,,bending2
5278,119500 28.00 0.00 5.00 0.82 17.00 0.71,,,,,,,bending2


In [111]:
pd.read_csv(r"D:\Data-Science-D-drive\sensor_placement_LogisticRegression\data\sensor_placement\bending2\dataset4.csv",sep="\t")

Unnamed: 0,# Task: bending2
0,# Frequency (Hz): 20
1,# Clock (millisecond): 250
2,# Duration (seconds): 120
3,"# Columns: time,avg_rss12,var_rss12,avg_rss13,..."
4,0 32.50 0.50 0.00 0.00 19.00 1.00
...,...
479,118750 28.67 0.47 4.67 1.25 17.33 0.47
480,119000 27.50 0.50 5.50 2.50 17.25 1.30
481,119250 28.00 0.00 6.67 0.94 17.00 1.00
482,119500 28.00 0.00 5.00 0.82 17.00 0.71


In [40]:
df1.columns

Index(['# Columns: time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13',
       'avg_rss23', 'var_rss23', 'Output'],
      dtype='object')

In [41]:
col_list = list(df1.columns)

for i in col_list:
    if df1[i].isna().sum() > 0:
        df1[i].fillna(df1[i].median(),inplace=True)

In [42]:
df1.isnull().sum()

# Columns: time    0
avg_rss12          0
var_rss12          0
avg_rss13          0
var_rss13          0
avg_rss23          0
var_rss23          0
Output             0
dtype: int64

In [43]:
# Checking for duplicated records
df1.duplicated().sum()

3359

In [44]:
df1.drop_duplicates(inplace=True)

In [45]:
df1.duplicated().sum()

0

In [46]:
df1.shape[0]

37440

In [47]:
df.shape[0]

40799

In [54]:
100-(df1.shape[0]/df.shape[0]*100)

8.233044927571754

We have lost 8.23% of the data after removing duplicates

In [58]:
for i in df1.columns:
    print(i,"---",df1[i].dtype)

# Columns: time --- object
avg_rss12 --- float64
var_rss12 --- float64
avg_rss13 --- float64
var_rss13 --- float64
avg_rss23 --- float64
var_rss23 --- float64
Output --- object


In [60]:
df1[["# Columns: time"]].sample(10)

Unnamed: 0,# Columns: time
12874,98500
39464,26250
37132,43250
11877,89250
15844,1000
24264,66000
24034,8500
37209,62500
20545,96250
38317,99500


Renaming the column as it would cause errors due to space in the column name

In [64]:
df1.rename(columns={"# Columns: time":"Time"},inplace=True)

In [65]:
df1

Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
0,0,39.25,0.43,22.75,0.43,33.75,1.30,bending1
1,250,39.25,0.43,23.00,0.00,33.00,0.00,bending1
2,500,39.25,0.43,23.25,0.43,33.00,0.00,bending1
3,750,39.50,0.50,23.00,0.71,33.00,0.00,bending1
4,1000,39.50,0.50,24.00,0.00,33.00,0.00,bending1
...,...,...,...,...,...,...,...,...
40794,118750,31.50,1.66,12.50,3.20,14.25,4.44,walking
40795,119000,27.33,1.25,11.33,0.94,20.00,4.00,walking
40796,119250,37.80,7.68,14.20,2.48,17.25,0.83,walking
40797,119500,33.75,1.30,15.75,5.21,16.50,2.69,walking


In [85]:
df[~df.Time.str.isnumeric()]

AttributeError: 'DataFrame' object has no attribute 'Time'

In [69]:
df1[~df1.Time.str.isnumeric()]

Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
4800,0 32.50 0.50 0.00 0.00 19.00 1.00,40.0,0.5,15.0,0.83,15.75,1.0,bending2
4801,250 32.50 0.50 0.00 0.00 18.50 0.50,40.0,0.5,15.0,0.83,15.75,1.0,bending2
4802,500 32.75 0.43 1.00 0.00 18.00 0.00,40.0,0.5,15.0,0.83,15.75,1.0,bending2
4803,750 32.50 0.50 0.00 0.00 17.50 0.50,40.0,0.5,15.0,0.83,15.75,1.0,bending2
4804,1000 32.50 0.50 7.50 0.50 17.50 0.87,40.0,0.5,15.0,0.83,15.75,1.0,bending2
...,...,...,...,...,...,...,...,...
5275,118750 28.67 0.47 4.67 1.25 17.33 0.47,40.0,0.5,15.0,0.83,15.75,1.0,bending2
5276,119000 27.50 0.50 5.50 2.50 17.25 1.30,40.0,0.5,15.0,0.83,15.75,1.0,bending2
5277,119250 28.00 0.00 6.67 0.94 17.00 1.00,40.0,0.5,15.0,0.83,15.75,1.0,bending2
5278,119500 28.00 0.00 5.00 0.82 17.00 0.71,40.0,0.5,15.0,0.83,15.75,1.0,bending2


In [81]:
path = r"D:\Data-Science-D-drive\sensor_placement_LogisticRegression\data\sensor_placement\bending2\dataset2.csv"

In [82]:
data = pd.read_csv(path,sep="\t")

In [83]:
data.head(10)

Unnamed: 0,# Task: bending2
0,# Frequency (Hz): 20
1,# Clock (millisecond): 250
2,# Duration (seconds): 120
3,"# Columns: time,avg_rss12,var_rss12,avg_rss13,..."
4,"0,27.00,0.00,24.00,0.00,19.00,1.00"
5,"250,27.00,0.00,24.00,0.00,19.75,0.83"
6,"500,27.00,0.00,23.50,0.50,20.00,1.22"
7,"750,27.00,0.00,24.00,0.00,14.75,1.48"
8,"1000,27.00,0.00,23.75,0.43,14.50,1.50"
9,"1250,27.00,0.00,23.33,0.47,17.33,2.05"


In [12]:
#fig,ax = plt.subplots(figsize = (20,25))
#sns.boxplot(data = df1, ax = ax)