In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

In [2]:
# Activity codes to keep: rows whose 'Activity' label is not in this list are dropped.
activities = [1, 3, 4]
# Subject IDs to keep: rows whose 'Subject' label is not in this list are dropped.
users = [1, 3, 5, 7, 8, 11, 14, 17, 19]
# Number of random forests fitted per experiment; their feature importances are averaged.
n_iters = 20

# Feature Importance - Subject

In [3]:
# Column names: features.txt holds one feature name per line.  It is read with
# plain Python because current pandas versions refuse '\n' as a read_csv
# delimiter (ValueError: the line terminator cannot be used as separator).
with open('../../../data/features.txt', encoding = 'utf-8') as feature_file:
    names = [line.rstrip('\r\n') for line in feature_file if line.strip()] #List of column names
name_dataframe = pd.DataFrame(names) #Same single-column frame the original read_csv call produced

# sep = r'\s+' is the supported spelling of the deprecated delim_whitespace = True.
data = pd.read_csv('../../../data/X_train.txt', sep = r'\s+', header = None) #Read in dataframe
data.columns = names #Setting column names

X_train = data

y_train_activity = pd.read_csv('../../../data/y_train.txt', header = None)
y_train_activity.columns = ['Activity']

y_train_subject = pd.read_csv('../../../data/subject_train.txt', header = None)
y_train_subject.columns = ['Subject']

# Features followed by the two label columns ('Activity', then 'Subject'), so
# the labels are always the last two columns of GAN_data.
GAN_data = pd.concat([X_train, y_train_activity, y_train_subject], axis = 1)
print(GAN_data['Subject'].value_counts())
# Keep only the configured activities and subjects.
GAN_data = GAN_data[GAN_data['Activity'].isin(activities)]
GAN_data = GAN_data[GAN_data['Subject'].isin(users)]

25    409
21    408
26    392
30    383
28    382
27    376
23    372
17    368
16    366
19    360
1     347
29    344
3     341
15    328
6     325
14    323
22    321
11    316
7     308
5     302
8     281
Name: Subject, dtype: int64


In [4]:
# Feature matrix: every column except the trailing 'Activity' and 'Subject' labels.
X_train = GAN_data.iloc[:, :-2].to_numpy()
# Labels: the last column ('Subject').  An explicit copy is taken so that the
# relabelling done afterwards can never write through to GAN_data, and so the
# array stays writable when pandas hands out read-only views (Copy-on-Write).
y_train = GAN_data.iloc[:, -1].to_numpy(copy = True) #Subject Importance

In [5]:
# Re-encode the subject IDs as contiguous class labels 0..len(users)-1, in the
# order the IDs appear in `users` (1 -> 0, 3 -> 1, ..., 19 -> 8).
# The mapping is derived from `users` instead of a hand-written elif ladder, so
# it cannot drift out of sync with the filter applied when loading the data.
subject_to_label = {subject_id: label for label, subject_id in enumerate(users)}

# Build a new array rather than overwriting y_train element by element: nothing
# that shares memory with y_train is modified, and a read-only y_train is fine.
# An ID that is not in `users` raises KeyError instead of being silently folded
# into the last class; this also makes an accidental second run of this cell
# fail loudly rather than scramble the labels.
y_train = np.array([subject_to_label[subject_id] for subject_id in y_train], dtype = np.int64)

In [6]:
from matplotlib import pyplot

# Average the feature importances of `n_iters` separately fitted random forests
# to smooth out the run-to-run variation of a single forest.

# One accumulator slot per feature; sized from X_train rather than a hard-coded
# 561 so the cell keeps working if the feature set changes.
total_sum = np.zeros(X_train.shape[1])

for k in range(n_iters):
    # Seed each forest with its iteration index: the forests still differ from
    # one another, but re-running the cell reproduces the same average.
    model = RandomForestClassifier(random_state = k)
    # fit the model
    model.fit(X_train, y_train)
    importance = model.feature_importances_
    total_sum = total_sum + importance

averaged_importance = total_sum / n_iters

In [7]:
# Turn the per-feature scores into a plain Python list (elements are unchanged).
averaged_importance = [score for score in averaged_importance]

In [8]:
def importance_filter(averaged_importance, threshold=0.01):
    """Tell whether an importance score is high enough to keep the feature.

    Parameters
    ----------
    averaged_importance : float
        A single averaged importance score (one element of the averaged
        importance list, despite the plural-sounding name).
    threshold : float, optional
        Exclusive lower bound a score must exceed to be kept.  Defaults to
        0.01, the cut-off previously hard-coded in this function.

    Returns
    -------
    bool
        True when the score is strictly greater than ``threshold``.
    """
    return averaged_importance > threshold

# Positions of the features whose averaged importance passes the filter.
output = [
    position
    for position in range(len(averaged_importance))
    if importance_filter(averaged_importance[position])
]
print(output)

[41, 42, 50, 51, 53, 54, 55, 57, 58, 474, 558, 559, 560]


In [9]:
# Translate the selected column positions into their column names.
sub_features = [GAN_data.columns[position] for position in output]

In [10]:
# Display the names of the features selected by the subject-importance run.
sub_features

['42 tGravityAcc-mean()-Y',
 '43 tGravityAcc-mean()-Z',
 '51 tGravityAcc-max()-Y',
 '52 tGravityAcc-max()-Z',
 '54 tGravityAcc-min()-Y',
 '55 tGravityAcc-min()-Z',
 '56 tGravityAcc-sma()',
 '58 tGravityAcc-energy()-Y',
 '59 tGravityAcc-energy()-Z',
 '475 fBodyGyro-bandsEnergy()-1,8',
 '559 angle(X,gravityMean)',
 '560 angle(Y,gravityMean)',
 '561 angle(Z,gravityMean)']

# Feature Importance - Activity

In [11]:
# Column names: features.txt holds one feature name per line.  It is read with
# plain Python because current pandas versions refuse '\n' as a read_csv
# delimiter (ValueError: the line terminator cannot be used as separator).
with open('../../../data/features.txt', encoding = 'utf-8') as feature_file:
    names = [line.rstrip('\r\n') for line in feature_file if line.strip()] #List of column names
name_dataframe = pd.DataFrame(names) #Same single-column frame the original read_csv call produced

# sep = r'\s+' is the supported spelling of the deprecated delim_whitespace = True.
data = pd.read_csv('../../../data/X_train.txt', sep = r'\s+', header = None) #Read in dataframe
data.columns = names #Setting column names

# Alternative kept for reference: restrict the features to the acceleration columns only.
#X_train = data.loc[:,'1 tBodyAcc-mean()-X':'40 tBodyAcc-correlation()-Y,Z'] #Selecting only acceleration columns

X_train = data

y_train_activity = pd.read_csv('../../../data/y_train.txt', header = None)
y_train_activity.columns = ['Activity']

y_train_subject = pd.read_csv('../../../data/subject_train.txt', header = None)
y_train_subject.columns = ['Subject']

# Features followed by the two label columns ('Activity', then 'Subject'), so
# the labels are always the last two columns of GAN_data.
GAN_data = pd.concat([X_train, y_train_activity, y_train_subject], axis = 1)
# Keep only the configured activities and subjects.
GAN_data = GAN_data[GAN_data['Activity'].isin(activities)]
GAN_data = GAN_data[GAN_data['Subject'].isin(users)]

In [12]:
# Feature matrix: every column except the trailing 'Activity' and 'Subject' labels.
X_train = GAN_data.iloc[:, :-2].to_numpy()
# Labels: the second-to-last column ('Activity').  An explicit copy is taken so
# that the relabelling done afterwards can never write through to GAN_data, and
# so the array stays writable when pandas hands out read-only views (Copy-on-Write).
y_train = GAN_data.iloc[:, -2].to_numpy(copy = True) #Activity Importance

In [13]:
# Re-encode the activity codes as contiguous class labels 0..len(activities)-1,
# in the order the codes appear in `activities` (1 -> 0, 3 -> 1, 4 -> 2).
# The mapping is derived from `activities` instead of a hand-written elif
# ladder, so it cannot drift out of sync with the filter applied when loading.
activity_to_label = {activity_code: label for label, activity_code in enumerate(activities)}

# Build a new array rather than overwriting y_train element by element: nothing
# that shares memory with y_train is modified, and a read-only y_train is fine.
# A code that is not in `activities` raises KeyError instead of being silently
# folded into the last class; this also makes an accidental second run of this
# cell fail loudly rather than scramble the labels.
y_train = np.array([activity_to_label[activity_code] for activity_code in y_train], dtype = np.int64)

In [14]:
from matplotlib import pyplot

# Average the feature importances of `n_iters` separately fitted random forests
# to smooth out the run-to-run variation of a single forest.

# One accumulator slot per feature; sized from X_train rather than a hard-coded
# 561 so the cell keeps working if the feature set changes.
total_sum = np.zeros(X_train.shape[1])

for k in range(n_iters):
    # Seed each forest with its iteration index: the forests still differ from
    # one another, but re-running the cell reproduces the same average.
    model = RandomForestClassifier(random_state = k)
    # fit the model
    model.fit(X_train, y_train)
    importance = model.feature_importances_
    total_sum = total_sum + importance

# Optional diagnostic (disabled): print and plot the importances of the last fitted forest.
# summarize feature importance
# for i,v in enumerate(importance):
#     print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

averaged_importance = total_sum / n_iters

In [15]:
# Turn the per-feature scores into a plain Python list (elements are unchanged).
averaged_importance = [score for score in averaged_importance]

In [16]:
# Positions of the features whose averaged importance passes the filter.
output_act = [
    position
    for position in range(len(averaged_importance))
    if importance_filter(averaged_importance[position])
]
print(output_act)

[3, 9, 16, 201, 202, 214, 215, 265, 268, 271, 281, 310, 314, 381, 503, 504, 508]


In [17]:
# Translate the selected column positions into their column names.
act_features = [GAN_data.columns[position] for position in output_act]

In [18]:
# Display the names of the features selected by the activity-importance run.
act_features

['4 tBodyAcc-std()-X',
 '10 tBodyAcc-max()-X',
 '17 tBodyAcc-energy()-X',
 '202 tBodyAccMag-std()',
 '203 tBodyAccMag-mad()',
 '215 tGravityAccMag-std()',
 '216 tGravityAccMag-mad()',
 '266 fBodyAcc-mean()-X',
 '269 fBodyAcc-std()-X',
 '272 fBodyAcc-mad()-X',
 '282 fBodyAcc-energy()-X',
 '311 fBodyAcc-bandsEnergy()-1,16',
 '315 fBodyAcc-bandsEnergy()-1,24',
 '382 fBodyAccJerk-bandsEnergy()-1,8',
 '504 fBodyAccMag-std()',
 '505 fBodyAccMag-mad()',
 '509 fBodyAccMag-energy()']

In [19]:
def common_data(list1, list2):
    """Report whether the two sequences share at least one equal element.

    Every element of ``list1`` is compared with every element of ``list2``
    using ``==``; the search stops at the first match.

    Returns True if a matching pair exists, otherwise False (including when
    either sequence is empty).
    """
    return any(first == second for first in list1 for second in list2)

In [20]:
# Check whether any feature index was selected for both activity and subject.
result = common_data(output_act, output)

print(
    "At least one index is shared between the two lists."
    if result
    else "No common indices for activity/subject classification."
)

No common indices for activity/subject classification.
