In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

%matplotlib inline
pd.set_option('display.max_columns', 200)

# The Data

In [2]:
data = pd.read_csv('data/csh101/csh101.ann.features.csv')

In [3]:
data.sample(5)

Unnamed: 0,lastSensorEventHours,lastSensorEventSeconds,lastSensorDayOfWeek,windowDuration,timeSinceLastSensorEvent,prevDominantSensor1,prevDominantSensor2,lastSensorID,lastSensorLocation,lastMotionLocation,complexity,activityChange,areaTransitions,numDistinctSensors,sensorCount-Bathroom,sensorCount-Bedroom,sensorCount-Chair,sensorCount-DiningRoom,sensorCount-Hall,sensorCount-Ignore,sensorCount-Kitchen,sensorCount-LivingRoom,sensorCount-Office,sensorCount-OutsideDoor,sensorCount-WorkArea,sensorElTime-Bathroom,sensorElTime-Bedroom,sensorElTime-Chair,sensorElTime-DiningRoom,sensorElTime-Hall,sensorElTime-Ignore,sensorElTime-Kitchen,sensorElTime-LivingRoom,sensorElTime-Office,sensorElTime-OutsideDoor,sensorElTime-WorkArea,activity
174615,5,20901,4,55,1,6,7,6,6,6,0.783777,0.309091,0,0,0.0,0.0,0,0.0,0,8.0,26.35,0.0,0,0.0,0,6155.708008,6021.710417,86400,82.914401,86400,29.290457,0.0,94.098119,86400,6098.440445,86400,Morning_Meds
176882,11,42949,4,104,14,0,1,0,0,0,0.987138,0.625,0,0,14.11,0.0,0,0.0,0,20.24,0.0,0.0,0,0.0,0,0.0,276.582983,86400,908.150037,86400,17.62249,861.922265,849.367096,86400,21936.23671,86400,Bathe
309035,10,36450,6,54,1,6,6,6,6,6,0.468996,0.851852,0,0,0.0,0.0,0,0.0,0,3.46,30.89,0.0,0,0.0,0,683.062219,682.802233,86400,150.085196,86400,1.364625,0.0,155.641257,86400,681.175963,86400,Cook_Breakfast
105878,9,32867,3,509,0,5,7,5,5,7,0.987138,0.548134,0,0,0.0,0.0,0,0.0,0,19.78,0.0,14.57,0,0.0,0,4751.660352,4752.628838,86400,4549.859736,86400,0.0,4550.66005,0.226674,86400,4718.718847,86400,Other_Activity
14814,17,63054,6,37,1,1,0,7,7,7,1.375279,0.648649,2,0,0.0,23.57,0,0.0,0,6.72,0.0,2.01,0,2.05,0,269.419654,2.938068,86400,17980.79576,86400,2.997586,927.213614,0.0,86400,0.897488,86400,Leave_Home


In [17]:
feature_names = data.columns[:-1]

# Data Visualization

In [None]:
# plt.figure(figsize=(16,16))
# sns.countplot(x='prevDominantSensor1', hue='activity', data=data)
# plt.title('Activity Count per Previous Dominant Sensor 1')
# plt.legend(loc='upper right')
# plt.show()

In [None]:
# plt.figure(figsize=(16,16))
# sns.countplot(x='prevDominantSensor2', hue='activity', data=data)
# plt.title('Activity Count per Previous Dominant Sensor 2')
# plt.legend(loc='upper right')
# plt.show()

In [None]:
# plt.figure(figsize=(16,16))
# sns.countplot(x='lastSensorID', hue='activity', data=data)
# plt.title('Activity Count per Last Sensor ID')
# plt.legend(loc='upper right')
# plt.show()

In [None]:
# data.isna().sum()

In [180]:
# len(data)

321428

In [None]:
# data.dtypes

In [6]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lastSensorEventHours,321428.0,12.529724,5.751412,0.0,9.0,12.0,17.0,23.0
lastSensorEventSeconds,321428.0,46924.419995,20770.004798,0.0,33548.0,43553.0,62899.0,86399.0
lastSensorDayOfWeek,321428.0,3.098949,2.02585,0.0,1.0,3.0,5.0,6.0
windowDuration,321428.0,464.204251,1177.60847,8.0,49.0,111.0,442.0,23266.0
timeSinceLastSensorEvent,321428.0,16.007103,111.117938,0.0,1.0,1.0,4.0,12183.0
prevDominantSensor1,321428.0,4.311939,2.789936,0.0,1.0,5.0,7.0,9.0
prevDominantSensor2,321428.0,4.311846,2.790064,0.0,1.0,5.0,7.0,9.0
lastSensorID,321428.0,4.535482,2.57849,0.0,1.0,5.0,7.0,9.0
lastSensorLocation,321428.0,4.535482,2.57849,0.0,1.0,5.0,7.0,9.0
lastMotionLocation,321428.0,4.341016,3.17982,-1.0,1.0,6.0,7.0,9.0


In [5]:
X = data.values[:, :-1]

In [None]:
# X

## Normalizing the Data

In [6]:
# X = preprocessing.RobustScaler().fit_transform(X)
X = preprocessing.normalize(X, norm='l2')

In [7]:
X.shape

(321428, 36)

In [None]:
# X.dtype

In [18]:
X = pd.DataFrame(X, dtype='float', columns=feature_names)

In [8]:
y = data.values[:, -1]

In [9]:
y = y.reshape(len(y), 1)
y.shape

(321428, 1)

In [None]:
# y.dtype

In [12]:
y = pd.DataFrame(y, columns=['y'])

In [13]:
activities = y['y'].unique()
# activities

In [14]:
encoded = y['y'].astype('category').cat.codes
# encoded.unique()
y['y'] = encoded

In [None]:
# y

In [None]:
# X

In [None]:
# y

In [19]:
df = pd.concat((X,y), axis=1)
df

Unnamed: 0,lastSensorEventHours,lastSensorEventSeconds,lastSensorDayOfWeek,windowDuration,timeSinceLastSensorEvent,prevDominantSensor1,prevDominantSensor2,lastSensorID,lastSensorLocation,lastMotionLocation,complexity,activityChange,areaTransitions,numDistinctSensors,sensorCount-Bathroom,sensorCount-Bedroom,sensorCount-Chair,sensorCount-DiningRoom,sensorCount-Hall,sensorCount-Ignore,sensorCount-Kitchen,sensorCount-LivingRoom,sensorCount-Office,sensorCount-OutsideDoor,sensorCount-WorkArea,sensorElTime-Bathroom,sensorElTime-Bedroom,sensorElTime-Chair,sensorElTime-DiningRoom,sensorElTime-Hall,sensorElTime-Ignore,sensorElTime-Kitchen,sensorElTime-LivingRoom,sensorElTime-Office,sensorElTime-OutsideDoor,sensorElTime-WorkArea,y
0,0.000038,0.146788,0.000015,0.000496,0.000000,0.000000,0.000000,0.000019,0.000019,0.000034,3.618089e-06,0.000001,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000083,0.0,0.0,0.0,0.000048,0.0,0.329723,0.329723,0.329723,0.329723,0.329723,0.000000e+00,0.329723,0.329723,0.329723,0.000007,0.329723,26
1,0.000038,0.146791,0.000015,0.000500,0.000004,0.000000,0.000000,0.000034,0.000034,0.000034,3.618087e-06,0.000001,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000084,0.0,0.0,0.0,0.000047,0.0,0.329722,0.329722,0.329722,0.329722,0.329722,2.130144e-07,0.329722,0.329722,0.329722,0.000000,0.329722,26
2,0.000038,0.147213,0.000015,0.000920,0.000431,0.000000,0.000000,0.000019,0.000019,0.000034,3.617855e-06,0.000001,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000083,0.0,0.0,0.0,0.000048,0.0,0.329701,0.329701,0.329701,0.329701,0.329701,0.000000e+00,0.329701,0.329701,0.329701,0.000433,0.329701,26
3,0.000038,0.147228,0.000015,0.000931,0.000015,0.000000,0.000000,0.000019,0.000019,0.000034,3.617848e-06,0.000001,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000083,0.0,0.0,0.0,0.000048,0.0,0.329701,0.329701,0.329701,0.329701,0.329701,0.000000e+00,0.329701,0.329701,0.329701,0.000449,0.329701,26
4,0.000038,0.147228,0.000015,0.000927,0.000000,0.000000,0.000000,0.000034,0.000034,0.000034,3.705128e-06,0.000001,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000079,0.0,0.0,0.0,0.000052,0.0,0.329701,0.329701,0.329701,0.329701,0.329701,2.415363e-07,0.329701,0.329701,0.329701,0.000000,0.329701,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321423,0.000119,0.445144,0.000000,0.000642,0.000005,0.000005,0.000005,0.000005,0.000005,0.000005,2.448987e-07,0.000003,0.0,0.0,0.0,0.000172,0.0,0.0,0.0,0.000006,0.0,0.0,0.0,0.000000,0.0,0.002833,0.000000,0.447537,0.016919,0.447537,3.083684e-04,0.016927,0.007173,0.447537,0.007415,0.447537,19
321424,0.000119,0.445354,0.000000,0.000756,0.000269,0.000005,0.000005,0.000005,0.000005,0.000005,2.448658e-07,0.000001,0.0,0.0,0.0,0.000172,0.0,0.0,0.0,0.000006,0.0,0.0,0.0,0.000000,0.0,0.003100,0.000000,0.447477,0.017184,0.447477,5.755184e-04,0.017191,0.007439,0.447477,0.007681,0.447477,19
321425,0.000119,0.445362,0.000000,0.000761,0.000010,0.000005,0.000005,0.000005,0.000005,0.000005,2.448646e-07,0.000001,0.0,0.0,0.0,0.000172,0.0,0.0,0.0,0.000006,0.0,0.0,0.0,0.000000,0.0,0.003110,0.000000,0.447475,0.017194,0.447475,5.853458e-04,0.017201,0.007449,0.447475,0.007691,0.447475,19
321426,0.000119,0.445370,0.000000,0.000767,0.000010,0.000005,0.000005,0.000005,0.000005,0.000005,2.448633e-07,0.000001,0.0,0.0,0.0,0.000172,0.0,0.0,0.0,0.000006,0.0,0.0,0.0,0.000000,0.0,0.003123,0.000000,0.447473,0.017207,0.447473,5.988222e-04,0.017215,0.007462,0.447473,0.007705,0.447473,19


In [20]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lastSensorEventHours,321428.0,6.8e-05,3e-05,0.0,5.1e-05,6.7e-05,9.2e-05,0.00012
lastSensorEventSeconds,321428.0,0.256215,0.10659,0.0,0.190081,0.244007,0.340982,0.447203
lastSensorDayOfWeek,321428.0,1.7e-05,1.1e-05,0.0,6e-06,1.7e-05,2.7e-05,3.5e-05
windowDuration,321428.0,0.002554,0.006506,4.308151e-05,0.000273,0.000617,0.002439,0.12902
timeSinceLastSensorEvent,321428.0,8.8e-05,0.000619,0.0,5e-06,6e-06,2.2e-05,0.067839
prevDominantSensor1,321428.0,2.4e-05,1.5e-05,0.0,6e-06,2.8e-05,3.7e-05,5.2e-05
prevDominantSensor2,321428.0,2.4e-05,1.5e-05,0.0,6e-06,2.8e-05,3.7e-05,5.2e-05
lastSensorID,321428.0,2.5e-05,1.4e-05,0.0,6e-06,2.8e-05,3.7e-05,5.2e-05
lastSensorLocation,321428.0,2.5e-05,1.4e-05,0.0,6e-06,2.8e-05,3.7e-05,5.2e-05
lastMotionLocation,321428.0,2.4e-05,1.8e-05,-5.715012e-06,5e-06,3.4e-05,3.9e-05,5.2e-05


In [None]:
# df.dtypes

In [None]:
# df.isna().sum()

In [None]:
# type(df.values[:,:-1])

## Splitting the Data

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df.values[:,:-1], df.values[:,-1], test_size=0.20, random_state=11)

In [22]:
print(X_train.shape, y_train.shape)

(257142, 36) (257142,)


In [17]:
# dftrain = pd.DataFrame(X_train, columns=col_names[:-1])
# df_y_train = pd.DataFrame(y_train, columns=['y'])
# dftest = pd.DataFrame(X_test, columns=col_names[:-1])
# df_y_test = pd.DataFrame(y_test, columns=['y'])

In [18]:
# cls = AdaBoostClassifier(n_estimators=1000, learning_rate=2)

In [19]:
# cls.fit(X_train, y_train)

In [20]:
# cls.score(X_test, y_test)

In [21]:
# predictions = cls.predict(X_test)

In [22]:
# predictions

In [23]:
# predictions.shape

In [24]:
# y_test.shape

In [27]:
cls = LogisticRegression(multi_class="multinomial", solver="saga", max_iter=500).fit(X_train, y_train)

In [28]:
cls.score(X_test, y_test)

0.40473820116355036

In [36]:
np.histogram(cls.predict(X_test), bins=np.unique(cls.predict(X_test)))

(array([ 1642,  2886,   199, 42292,    78,    16,  2274,   152, 14747]),
 array([ 0.,  3., 18., 19., 20., 22., 24., 25., 27., 33.]))