SPARK MLLIB

Dataset is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00231/

This dataset, called Physical Activity Monitoring, records people's physical activities together with the corresponding sensor readings: heart rate (bpm) and IMU sensor data from the hand, chest, and ankle. There is 1 label (the activity) and 52 features. Each reading also has a timestamp (shown as column 0 in the dataset below).

There are 9 subjects (participants) and 18 activities (plus the catch-all label 0), which are labelled as follows:
- 1 lying
- 2 sitting
- 3 standing
- 4 walking
- 5 running
- 6 cycling
- 7 Nordic walking
- 9 watching TV
- 10 computer work
- 11 car driving
- 12 ascending stairs
- 13 descending stairs
- 16 vacuum cleaning
- 17 ironing
- 18 folding laundry
- 19 house cleaning
- 20 playing soccer
- 24 rope jumping
- 0 other (transient activities, in between performing different activities, e.g. going from one location to the
  next activity's location, or waiting for the preparation of some equipment.)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the protocol recording of each of the nine subjects.
# The .dat files have no header row and separate their columns with runs of
# whitespace, so every file is parsed with the same options.
read_kwargs = dict(header=None, delimiter=r"\s+")

subject1 = pd.read_csv("protocol/subject101.dat", **read_kwargs)
subject2 = pd.read_csv("protocol/subject102.dat", **read_kwargs)
subject3 = pd.read_csv("protocol/subject103.dat", **read_kwargs)
subject4 = pd.read_csv("protocol/subject104.dat", **read_kwargs)
subject5 = pd.read_csv("protocol/subject105.dat", **read_kwargs)
subject6 = pd.read_csv("protocol/subject106.dat", **read_kwargs)
subject7 = pd.read_csv("protocol/subject107.dat", **read_kwargs)
subject8 = pd.read_csv("protocol/subject108.dat", **read_kwargs)
subject9 = pd.read_csv("protocol/subject109.dat", **read_kwargs)

In [6]:
# Preview the first five rows of subject 5. Columns are unnamed integers
# because the files were read with header=None; column 0 is the timestamp and
# column 1 is the activity label used in the per-subject breakdown further down.
subject5.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,5.7,0,,34.0,2.22755,9.65418,2.38862,2.31968,9.60752,2.58278,...,-0.017907,0.00934,0.050097,-32.7091,31.4772,44.2318,0.255373,0.783075,0.084602,0.560732
1,5.71,0,,34.0,2.3755,9.57647,2.31412,2.45657,9.62177,2.71852,...,-0.070091,0.002312,0.053833,-33.0782,30.9814,44.5148,0.251163,0.785583,0.076319,0.560314
2,5.72,0,,34.0,2.94208,9.53415,2.32275,2.78876,9.64961,2.76342,...,-0.084468,0.034249,0.030462,-32.5619,30.6982,44.5169,0.250643,0.786419,0.072038,0.559941
3,5.73,0,,34.0,3.47541,9.75837,2.40696,3.3308,9.66073,2.68734,...,-0.030789,0.058615,0.055252,-32.8212,30.969,44.6575,0.250917,0.786967,0.071013,0.559179
4,5.74,0,,34.0,3.54617,9.83232,2.25382,3.67642,9.71848,2.50565,...,-0.020063,0.020903,0.059653,-33.1869,30.0856,44.5154,0.249631,0.786602,0.066861,0.560777


In [7]:
# Summary statistics for every column of subject 5.
# This call produced a warning because many columns contain missing values (NaN);
# in the output below the 25%/50%/75% rows of those columns come back as NaN.
# Column 2 is by far the sparsest (34223 non-null values out of 374783 rows) --
# presumably the heart-rate channel; TODO confirm against the dataset readme.
subject5.describe()




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,374783.0,374783.0,34223.0,373242.0,373242.0,373242.0,373242.0,373242.0,373242.0,373242.0,...,372803.0,372803.0,372803.0,372803.0,372803.0,372803.0,372803.0,372803.0,372803.0,372803.0
mean,1879.61,5.874485,109.403354,34.727542,-4.335374,5.379049,3.375515,-4.186691,5.397473,3.621513,...,0.014629,-0.04012,0.024625,-32.127083,2.2804,19.474495,0.309257,0.027852,0.412471,0.005505
std,1081.906773,6.360833,24.924682,0.489405,5.756766,5.896408,3.78662,5.755631,5.758072,3.784624,...,1.096377,0.623888,2.100005,14.785967,21.877278,15.931513,0.186918,0.621047,0.286939,0.480054
min,5.7,0.0,70.0,33.75,-90.7606,-65.5387,-38.9078,-61.2147,-61.805,-36.3214,...,-13.3856,-7.70104,-14.0196,-102.849,-76.0628,-82.8949,7e-06,-0.916602,-0.860089,-0.884023
25%,942.655,0.0,,,,,,,,,...,,,,,,,,,,
50%,1879.61,4.0,,,,,,,,,...,,,,,,,,,,
75%,2816.565,7.0,,,,,,,,,...,,,,,,,,,,
max,3753.52,24.0,182.0,35.5,42.8916,154.576,76.6396,32.0678,62.2598,55.5517,...,11.5109,6.00254,14.4827,70.9417,80.1814,104.832,0.875651,0.903924,0.923082,0.893878


In [7]:
# Number of observations per subject, measured as the count of non-null
# timestamps (column 0). One line is printed per subject, in subject order.
for frame in (subject1, subject2, subject3, subject4, subject5,
              subject6, subject7, subject8, subject9):
    print(frame[0].count())

376417
447000
252833
329576
374783
361817
313599
408031
8477


In [8]:
# Remove every row that has a NaN in any column.
# The frames are modified in place so the names subject1..subject9 keep
# pointing at the cleaned data for the cells that follow.
# NOTE(review): how='any' also discards rows whose only gap is in a sparsely
# populated column (see the per-column counts in the describe() output above).
for frame in (subject1, subject2, subject3, subject4, subject5,
              subject6, subject7, subject8, subject9):
    frame.dropna(axis=0, how='any', inplace=True)

In [9]:
# Observations left per subject once all rows with missing data are gone,
# again counted on the timestamp column (column 0), one line per subject.
for frame in (subject1, subject2, subject3, subject4, subject5,
              subject6, subject7, subject8, subject9):
    print(frame[0].count())

34090
40316
22990
29781
33894
32800
28410
36753
769


In [None]:
#with the exception of the last participant, it looks like we still have a considerable number of observations,
#even after the rows with missing data were removed

In [14]:
# For each subject, show what percentage of its rows belongs to each activity.
# Activity labels live in column 1; IDs 0-24 are scanned so that labels with
# no samples are still listed (as 0.0).
print("Percentage of activities in Subject 1")
row_total = len(subject1)
labels = subject1[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 1
0 33.73423291287768
1 7.289527720739219
2 6.292167791141097
3 5.781754180111469
4 5.7524200645350545
5 5.558814901730713
6 6.29803461425638
7 5.353476092695805
8 0.0
9 0.0
10 0.0
11 0.0
12 4.25638017013787
13 3.995306541507773
14 0.0
15 0.0
16 6.127896743913171
17 6.29803461425638
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 3.2619536520973895

Total samples:  34090


In [17]:
# Percentage of subject 2's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 2")
row_total = len(subject2)
labels = subject2[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 2
0 41.23672983430896
1 5.268379799583292
2 5.060025796209942
3 5.754539140787776
4 7.158448258755828
5 1.9744022224427025
6 5.583391209445381
7 6.456493699771802
8 0.0
9 0.0
10 0.0
11 0.0
12 3.9314416112709596
13 3.442801865264411
14 0.0
15 0.0
16 4.685484671098323
17 6.5284254390316505
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 2.9194364520289713

Total samples:  40316


In [16]:
# Percentage of subject 3's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 3")
row_total = len(subject3)
labels = subject3[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 3
0 31.09612875163114
1 8.764680295780774
2 11.431056981296216
3 8.107872988255764
4 11.248368856024358
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
12 4.11048281861679
13 6.05045672031318
14 0.0
15 0.0
16 8.073075250108742
17 11.117877337973031
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 0.0

Total samples:  22990


In [18]:
# Percentage of subject 4's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 4")
row_total = len(subject4)
labels = subject4[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 4
0 29.94526711661798
1 7.061549309962728
2 7.8170645713710085
3 7.558510459689064
4 9.193781269937208
5 0.0
6 6.950740404956181
7 8.176354051240724
8 0.0
9 0.0
10 0.0
11 0.0
12 5.120714549545012
13 4.388704207380545
14 0.0
15 0.0
16 6.138141768241496
17 7.649172291058057
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 0.0

Total samples:  29781


In [19]:
# Percentage of subject 5's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 5")
row_total = len(subject5)
labels = subject5[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 5
0 27.44438543695049
1 6.384610845577389
2 7.2166165102968085
3 5.950905765032159
4 8.20204165929073
5 6.558682952734997
6 6.573434826222931
7 7.004189532070574
8 0.0
9 0.0
10 0.0
11 0.0
12 3.8443382309553313
13 3.425385023898035
14 0.0
15 0.0
16 6.58228595031569
17 8.806868472295982
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 2.0062547943588838

Total samples:  33894


In [20]:
# Percentage of subject 6's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 6")
row_total = len(subject6)
labels = subject6[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 6
0 30.917682926829272
1 6.5
2 6.347560975609756
3 6.698170731707317
4 7.076219512195121
5 6.2073170731707314
6 5.676829268292683
7 7.384146341463414
8 0.0
9 0.0
10 0.0
11 0.0
12 3.6981707317073167
13 3.1432926829268295
14 0.0
15 0.0
16 5.859756097560975
17 10.417682926829269
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 0.07317073170731707

Total samples:  32800


In [21]:
# Percentage of subject 7's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 7")
row_total = len(subject7)
labels = subject7[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 7
0 25.888771559310104
1 8.243576205561421
2 3.9352340725096795
3 8.25061598028863
4 10.369588173178458
5 1.1193241816261879
6 7.261527631115803
7 9.162266807462162
8 0.0
9 0.0
10 0.0
11 0.0
12 5.670538542766631
13 3.738120380147835
14 0.0
15 0.0
16 6.923618444209785
17 9.436818021823301
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 0.0

Total samples:  28410


In [22]:
# Percentage of subject 8's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 8")
row_total = len(subject8)
labels = subject8[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 8
0 35.72225396566267
1 6.013114575680897
2 5.697494082115746
3 6.236225614235573
4 7.564008380268278
5 3.801050254401001
6 6.290642940712322
7 7.060648110358338
8 0.0
9 0.0
10 0.0
11 0.0
12 2.9004435012107854
13 2.4025249639485216
14 0.0
15 0.0
16 6.026718907300084
17 8.16259897151253
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 2.1222757325932577

Total samples:  36753


In [23]:
# Percentage of subject 9's rows per activity label (column 1), IDs 0-24.
print("Percentage of activities in Subject 9")
row_total = len(subject9)
labels = subject9[1]
for activity_id in range(25):
    n_rows = int((labels == activity_id).sum())
    print(activity_id, (n_rows / row_total) * 100)
print()
print("Total samples: ", row_total)

Percentage of activities in Subject 9
0 24.187256176853055
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
12 0.0
13 0.0
14 0.0
15 0.0
16 0.0
17 0.0
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 75.81274382314695

Total samples:  769


In [24]:
#subject 9 is not usable for modelling: only 769 rows remain, and they cover just labels 0 and 24.

In [None]:
#Looking at Subjects 1 - 8, each of them performed only a subset of the activities.
#For the machine learning model, we shall filter the data, selecting only the activities we deem significant.