In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import os

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Reading Data into Dataframes

In [3]:
dirpath = '/Users/elizaackroyd/Downloads/UCIHARDataset/'

In [4]:
X_test_fn = os.path.join(dirpath, 'test', 'X_test.txt')
X_train_fn = os.path.join(dirpath, 'train', 'X_train.txt')
y_test_fn = os.path.join(dirpath, 'test', 'y_test.txt')
y_train_fn = os.path.join(dirpath, 'train', 'y_train.txt')
features_fn = os.path.join(dirpath, 'features.txt')

# Methods to read files irregardless of irregularities in delimiters -- spaces in this case. Second method reads data into dataframe and converts entries from strings to floats.


In [5]:
def file_2_list(filename):
    fd = open(filename, 'r')
    file_list = fd.readlines()
    fd.close()
    return file_list

def file_2_df(filename):
    return pd.DataFrame([[float(x) for x in i.split()] for i in file_2_list(filename)])

In [6]:
X_test_df = file_2_df(X_test_fn)
X_train_df = file_2_df(X_train_fn)
y_test_df = file_2_df(y_test_fn)
y_train_df = file_2_df(y_train_fn)
col_names_list = file_2_list(features_fn)

In [7]:
col_names = [i.split()[1] for i in col_names_list]
X_test_df.columns = col_names
X_train_df.columns = col_names
y_test_df.columns = ['label']
y_train_df.columns = ['labe']

## Modeling using a Random Forest Classifier

In [8]:
X_train_df.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [9]:
y_train_df.head()

Unnamed: 0,labe
0,5.0
1,5.0
2,5.0
3,5.0
4,5.0


In [10]:
X_test_df.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,0.071645,-0.33037,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.401189,-0.121845,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,0.062891,-0.190422,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346
3,0.270298,-0.032614,-0.11752,-0.994743,-0.973268,-0.967091,-0.995274,-0.974471,-0.968897,-0.93861,...,0.116695,-0.344418,-0.736124,-0.017067,0.154438,0.340134,0.296407,-0.698954,0.284114,-0.077108
4,0.274833,-0.027848,-0.129527,-0.993852,-0.967445,-0.978295,-0.994111,-0.965953,-0.977346,-0.93861,...,-0.121711,-0.534685,-0.846595,-0.002223,-0.040046,0.736715,-0.118545,-0.692245,0.290722,-0.073857


In [11]:
y_test_df.head()

Unnamed: 0,label
0,5.0
1,5.0
2,5.0
3,5.0
4,5.0


In [12]:
model = RandomForestClassifier()

In [13]:
model.fit(X_train_df, y_train_df)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
y_pred = model.predict(X_test_df)

In [15]:
confusion_matrix(list(y_test_df['label']), y_pred)

array([[478,  12,   6,   0,   0,   0],
       [ 40, 425,   6,   0,   0,   0],
       [ 16,  48, 356,   0,   0,   0],
       [  0,   0,   0, 435,  56,   0],
       [  0,   0,   0,  40, 492,   0],
       [  0,   0,   0,   0,   0, 537]])

In [16]:
accuracy_score(list(y_test_df['label']), y_pred)

0.9239904988123515