In [13]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import matplotlib as mpl
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np 
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib notebook
plt.style.use('ggplot')

In [14]:
X = pd.read_csv("data/dataset-har-PUC-Rio-ugulino.csv", delimiter=";")
X.gender = X.gender.map({"Man": 0, "Woman": 1})

#Decimal points were written as commas. Convert them to periods.
X.how_tall_in_meters = pd.to_numeric(X.how_tall_in_meters.str.replace(",", "."))
X.body_mass_index = pd.to_numeric(X.body_mass_index.str.replace(",", "."))

X.z4 = pd.to_numeric(X.z4, errors="coerce")
X[pd.isnull(X).any(axis=1)]
X.dropna(inplace=True)

y = X["class"]
y = pd.get_dummies(y)
X.drop(["user", "class"], axis=1, inplace=True)

X.describe()


Unnamed: 0,user,gender,age,how_tall_in_meters,weight,body_mass_index,x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4,class
122076,jose_carlos,0,75,1.67,67,24.0,-8,101,-120,-13,91,-101,17,123,-108,-207,-82,,standingup


Unnamed: 0,gender,age,how_tall_in_meters,weight,body_mass_index,x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4
count,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0,165632.0
mean,0.612044,38.264925,1.639712,70.819431,26.188535,-6.649319,88.293591,-93.164449,-87.827956,-52.065911,-175.055647,17.423517,104.517056,-93.881641,-167.641211,-92.625235,-159.650985
std,0.487286,13.183821,0.05282,11.296557,2.995781,11.616273,23.895881,39.409487,169.435606,205.160081,192.817111,52.635546,54.155987,45.38977,38.311336,19.968653,13.22102
min,0.0,28.0,1.58,55.0,22.0,-306.0,-271.0,-603.0,-494.0,-517.0,-617.0,-499.0,-506.0,-613.0,-702.0,-526.0,-537.0
25%,0.0,28.0,1.58,55.0,22.0,-12.0,78.0,-120.0,-35.0,-29.0,-141.0,9.0,95.0,-103.0,-190.0,-103.0,-167.0
50%,1.0,31.0,1.62,75.0,28.4,-6.0,94.0,-98.0,-9.0,27.0,-118.0,22.0,107.0,-90.0,-168.0,-91.0,-160.0
75%,1.0,46.0,1.71,83.0,28.6,0.0,101.0,-64.0,4.0,86.0,-29.0,34.0,120.0,-80.0,-153.0,-80.0,-153.0
max,1.0,75.0,1.71,83.0,28.6,509.0,533.0,411.0,473.0,295.0,122.0,507.0,517.0,410.0,-13.0,86.0,-43.0


In [15]:
#Initialise random forest classifier.
model = RandomForestClassifier(n_estimators=30, max_depth=10, oob_score=True, random_state=0)

#Split data into training and testing partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

#Fit the random forest classifier.
print("Fitting...")
s = time.time()
model.fit(X_train, y_train)
print("Fitting completed in: {}s".format(time.time() - s))

#Calculate out of bag score.
score = model.oob_score_
print("OOB Score: {}".format(round(score*100, 3)))

#Calculate the score using the test data.
print("Scoring...")
s = time.time()
score = model.score(X_test, y_test)
print("Score: {}".format(round(score*100, 3)))
print("Scoring completed in: {}s.".format(time.time() - s))

Fitting...


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

Fitting completed in: 5.049790620803833s
OOB Score: 98.744
Scoring...
Score: 95.687
Scoring completed in: 0.40392041206359863s.
