In [442]:
import os
import sys
import numpy as np
import pandas as pd
from matplotlib import pylab as plt
import scipy.integrate
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [6]:
import logging
# Write log records of level DEBUG and above to the file movement_classifier.log,
# each line carrying timestamp, level, logger name and message.
logging.basicConfig(filename='movement_classifier.log', level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)s %(name)s %(message)s')
# Module-level logger used by the loading and feature-extraction helpers.
logger = logging.getLogger(__name__)

In [244]:
def read_csv(file, delimeter):
    """Parse ``file`` with the given delimiter, accepting only five-column tables.

    Returns the parsed DataFrame on success. When parsing raises, or when the
    table does not have exactly five columns, the problem is logged as an
    error and ``None`` is returned instead.
    """
    NUM_COLUMNS = 5

    try:
        frame = pd.read_csv(file, delimiter=delimeter)
    except Exception as exc:
        logger.error('Parsing CSV error with delimeter="%s": %s\n%s\n', delimeter, exc, file)
        return None

    if frame.shape[1] != NUM_COLUMNS:
        logger.error('%d columns are required: %s\n', NUM_COLUMNS, file)
        return None

    return frame

def read_data(directory):
    """Load every parseable CSV file found directly inside ``directory``.

    Each regular file is parsed with ';' as the delimiter first and, only if
    that attempt fails, with ','. Files that cannot be parsed either way are
    logged and skipped. Entries that are not regular files (for example
    sub-directories) are ignored and do not count towards the success ratio.

    Parameters
    ----------
    directory : str
        Directory to scan; resolved relative to the current working directory.

    Returns
    -------
    list of (pandas.DataFrame, str)
        One ``(table, file_name)`` pair per parsed file, ordered by file name.
        ``file_name`` is the bare entry name without any directory part.

    Side effects
    ------------
    Prints the percentage of files that were parsed successfully.
    """
    cwd = os.getcwd()
    data = []
    file_count = 0

    for path in sorted(os.listdir(directory)):
        full_path = os.path.join(cwd, directory, path)
        if not os.path.isfile(full_path):
            continue
        file_count += 1

        # Try the second delimiter only when the first one failed, so each file
        # is parsed (and a failure logged) no more often than necessary.
        df = read_csv(full_path, ';')
        if df is None:
            df = read_csv(full_path, ',')

        if df is None:
            logger.error('Skipping corrupted dataset: %s\n', full_path)
            continue

        # `path` is already the bare file name. Splitting the full path on '\\'
        # only produced the file name on Windows; elsewhere it returned the
        # whole path and broke the name-based helpers downstream.
        data.append((df, path))

    # Guard against a directory without files instead of dividing by zero.
    success_rate = len(data) / file_count * 100 if file_count else 0.0
    print('{:.2f}% succesfully parsed'.format(success_rate))
    return data
    
# Load the training tracks; read_data prints the share of files that parsed.
data = read_data('data/Public-data')

99.74% succesfully parsed


In [373]:
# Load the tracks that predictions are generated for later, using the same loader.
testing_data = read_data('data/kaggle_data_open')

100.00% succesfully parsed


In [374]:
# Number of successfully parsed tracks in the training and testing sets.
len(data), len(testing_data)

(760, 183)

In [121]:
def rotation_matrix_from_vectors(vec1, vec2):
    """Return the 3x3 rotation matrix that turns the direction of ``vec1`` into that of ``vec2``.

    Both inputs are normalised first, so only their directions matter.

    Parameters
    ----------
    vec1, vec2 : array-like with 3 elements
        Source and destination vectors (any shape that reshapes to ``(3,)``).

    Returns
    -------
    numpy.ndarray, shape (3, 3)
        Matrix ``R`` such that ``R @ (vec1 / |vec1|)`` equals ``vec2 / |vec2|``.
        Parallel inputs give the identity. Anti-parallel inputs give a
        half-turn about an axis perpendicular to ``vec1``.

    Notes
    -----
    A zero-length input is not rejected: the normalisation divides by zero
    and the returned matrix then contains NaN.
    """
    a = np.asarray(vec1, dtype=float).reshape(3)
    b = np.asarray(vec2, dtype=float).reshape(3)
    a = a / np.linalg.norm(a)
    b = b / np.linalg.norm(b)

    v = np.cross(a, b)  # rotation axis scaled by sin(angle)
    c = np.dot(a, b)    # cos(angle)

    if np.any(v):
        s = np.linalg.norm(v)
        # Skew-symmetric cross-product matrix of v.
        mat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
        return np.eye(3) + mat + mat.dot(mat) * ((1 - c) / (s ** 2))

    if c < 0:
        # Anti-parallel: the cross product vanishes although a 180 degree turn is
        # required. Rotate by pi about any axis perpendicular to `a`; the axis
        # is built from the coordinate direction least aligned with `a`.
        helper = np.zeros(3)
        helper[np.argmin(np.abs(a))] = 1.0
        axis = np.cross(a, helper)
        axis /= np.linalg.norm(axis)
        return 2.0 * np.outer(axis, axis) - np.eye(3)

    # Parallel vectors: nothing to rotate.
    return np.eye(3)

In [479]:
def integrate(lst, dt):
    """Return the running sum of ``elem * dt`` over ``lst`` as a list.

    Element ``i`` of the result is the sum of the first ``i + 1`` samples,
    each multiplied by the step ``dt``. An empty input gives an empty list.
    """
    def _running_totals():
        total = 0
        for sample in lst:
            total += sample * dt
            yield total

    return list(_running_totals())

In [462]:
def fourier_transform(y, fs):
    """Return the one-sided amplitude spectrum of ``y`` sampled at ``fs``.

    Returns ``(frq, yabs)``: the first ``len(y) // 2`` frequency bins and the
    magnitudes of the matching DFT coefficients, normalised by ``len(y)``.
    """
    sample_count = len(y)
    half = sample_count // 2
    duration = sample_count / fs

    # Keep only the first half of the two-sided frequency axis.
    frq = (np.arange(sample_count) / duration)[:half]

    # DFT normalised by the number of samples, truncated to the same bins.
    spectrum = (np.fft.fft(y) / sample_count)[:half]

    return frq, np.abs(spectrum)

In [262]:
# Activity label -> lowercase substrings that identify it in a file name
# (matched by get_action against the lowercased name; first match wins).
CLASSES = {'стояние': ['тояни', 'месте', 'stay', 'око', 'tand'], 'ходьба': ['одьб', 'аг', 'отьб'], 'бег': ['ег', 'running'], 'велосипед' : ['елоси'], 'лестница' : ['естн', 'одъ', 'tair'], 
           'автомобиль' : ['втомо', 'ашин'], 'метро' : ['метро'], 'автобус' : ['автоб'], 'самокат' : ['амок']}

# Activity label -> integer class code stored as the training target `true_ans`.
ANSWERS = {'стояние' : 0, 'ходьба' : 1, 'бег' : 2, 'велосипед' : 3, 'лестница' : 4, 
           'автомобиль' : 5, 'метро' : 6, 'автобус' : 7, 'самокат' : 8}

In [None]:
def get_type(name):
    """Return the second-to-last underscore-separated part of ``name``."""
    # Only the last two separators matter, so split from the right.
    pieces = name.rsplit("_", 2)
    return pieces[-2]

In [265]:
def get_action(name):
    """Return the activity label whose pattern occurs in ``name``, or ``None``.

    The name is lowercased and searched for every substring listed in
    ``CLASSES``; labels are tried in dictionary order and the first label
    with a matching substring is returned.
    """
    lowered = name.lower()
    matching_labels = (
        act
        for act, patterns in CLASSES.items()
        for pattern in patterns
        if pattern in lowered
    )
    return next(matching_labels, None)

In [266]:
# Bucket the training tracks by activity label; tracks whose file name matches
# no known activity are left out.
grouped_tracks = dict((ans, []) for ans in ANSWERS)

for df, target in data:
    action = get_action(target)
    if not action:
        continue
    grouped_tracks[action].append(df)

In [579]:
def _plot_track_diagnostics(acc_df, acc, dt, fs):
    """Draw the 3x3 diagnostic figure for one cropped track.

    Row 1: the three raw axes, the acceleration magnitude, the magnitude spectrum.
    Row 2: spectra of gFx, gFy and gFz.
    Row 3: twice-integrated acceleration per axis, computed after rotating the
    mean acceleration onto +Z and subtracting 1 from the Z component.
    """
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(
        nrows=3, ncols=3, figsize=(15, 10))

    ax1.plot(acc_df['gFx'], c='g')
    ax1.plot(acc_df['gFy'], c='r')
    ax1.plot(acc_df['gFz'], c='b')
    ax2.plot(acc)

    spectra = ((acc, ax3), (acc_df['gFx'], ax4), (acc_df['gFy'], ax5), (acc_df['gFz'], ax6))
    for signal, ax in spectra:
        frq, yabs = fourier_transform(signal.values, fs)
        ax.plot(frq, yabs)
        ax.set_xlim([0, 30])
        ax.set_ylim([0, 0.1])

    # Rotate every sample at once (rows @ R.T equals R @ row for each row).
    rotation_mat = rotation_matrix_from_vectors(np.array(acc_df.mean()), np.array([0, 0, 1]))
    rotated = acc_df.values @ rotation_mat.T
    rotated[:, 2] -= 1.0
    for component, ax in zip(rotated.T, (ax7, ax8, ax9)):
        ax.plot(integrate(integrate(component, dt), dt))

    plt.show()


def process_track(df, *, track, start=-1, end=-1, cut_ratio=0.2, plotting=True, training=True):
    """Extract the feature row for one accelerometer track.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw track with columns 'time', 'gFx', 'gFy' and 'gFz'. Values may be
        numbers or strings that use ',' as the decimal separator. The frame
        passed in is not modified.
    track : str
        Track (file) name; printed, logged on failure and, when ``training``
        is true, mapped to a class code via ``get_action`` / ``ANSWERS``.
    start, end : int
        Positional crop bounds. The default -1 means "cut ``cut_ratio`` of the
        rows from that end of the track".
    cut_ratio : float
        Fraction of rows trimmed from each end when ``start`` / ``end`` are -1.
    plotting : bool
        Whether to draw the diagnostic figure for the track.
    training : bool
        Whether to add the 'true_ans' class code derived from ``track``.

    Returns
    -------
    pandas.Series or None
        Series with 'amplitude', 'freq_1', 'freq_2', 'freq_1_amp',
        'freq_2_amp', 'freq' (plus 'true_ans' when training). Entries that
        could not be computed stay ``None``: all of them when the columns
        cannot be converted to float or no usable sampling rate can be
        derived, and the four 'freq_*' peak entries when the spectrum has
        fewer than three bins. ``None`` is returned when ``training`` is true
        and the track name matches no known activity.
    """
    result_row = pd.Series({'amplitude': None, 'freq_1': None, 'freq_2': None,
                            'freq_1_amp': None, 'freq_2_amp': None, 'freq': None})

    n_rows = len(df)
    trim = int(n_rows * cut_ratio)
    start = trim if start == -1 else start
    # A positive end bound keeps short tracks intact: `-int(...)` collapses to 0
    # when fewer than one row is trimmed, which used to select nothing.
    end = n_rows - trim if end == -1 else end

    def _as_float(column):
        # Numeric columns need no text clean-up; others may use ',' as decimal mark.
        if pd.api.types.is_numeric_dtype(column):
            return column.astype(float)
        return column.astype(str).str.replace(',', '.', regex=False).astype(float)

    try:
        parsed = pd.DataFrame({name: _as_float(df[name])
                               for name in ('time', 'gFx', 'gFy', 'gFz')})
    except Exception as exc:
        logger.error('Cannot process track: %s\n%s\n', exc, track)
        return result_row

    print(track)
    cropped_df = parsed.iloc[start: end]
    acc_df = cropped_df[['gFx', 'gFy', 'gFz']]

    # Mean sampling step; NaN when fewer than two rows survive the crop.
    dt = cropped_df['time'].diff().mean()
    if not np.isfinite(dt) or dt <= 0:
        return result_row

    fs = int(1.0 / dt)
    if fs == 0:
        return result_row

    print("Frequency:", fs)

    acc = np.sqrt(acc_df.gFx**2 + acc_df.gFy**2 + acc_df.gFz**2)
    # Series.max/min skip NaN samples, so the result does not depend on where
    # a missing value happens to sit in the track.
    result_row['amplitude'] = acc.max() - acc.min()

    if training:
        action = get_action(track)
        if action not in ANSWERS:
            return None
        result_row['true_ans'] = ANSWERS[action]

    frq, acc_freqs = fourier_transform(acc.values, fs)
    ranked = sorted(zip(frq, acc_freqs), key=lambda pair: pair[1], reverse=True)

    result_row['freq'] = fs
    # The strongest bin (rank 0) is skipped; ranks 1 and 2 are the features.
    # Very short tracks have fewer than three bins and keep the defaults.
    if len(ranked) >= 3:
        result_row['freq_1'] = ranked[1][0]
        result_row['freq_2'] = ranked[2][0]
        result_row['freq_1_amp'] = ranked[1][1]
        result_row['freq_2_amp'] = ranked[2][1]

    # Rotation, double integration and per-axis spectra only feed the figure,
    # so they are computed only when a figure is requested.
    if plotting:
        _plot_track_diagnostics(acc_df, acc, dt, fs)

    return result_row

In [41]:
def get_track_num(target):
    """Return the integer between the first '_' and the following '.' in ``target``.

    For example ``'track_12.csv'`` gives ``12``.
    """
    after_prefix = target.split('_')[1]
    digits, _, _ = after_prefix.partition('.')
    return int(digits)

In [484]:
# Order the test tracks by their numeric id; the loader sorts file names as text.
testing_data = sorted(testing_data, key=lambda x: get_track_num(x[1]))

In [482]:
def df_for_training(raw_data):
    """Build the training feature table from ``(track_df, file_name)`` pairs.

    Every track is passed through ``process_track`` with ``training=True``.
    Tracks for which it returns ``None`` (file name matches no known activity)
    are skipped. Rows that could not be fully processed are kept and carry
    NaN in the missing entries.

    Returns
    -------
    pandas.DataFrame
        One float row per kept track, columns in alphabetical order. An empty
        DataFrame when there are no rows.
    """
    rows = []
    for track_df, target in raw_data:
        track_row = process_track(track_df, track=target, plotting=False, training=True)
        if track_row is not None:
            rows.append(track_row)

    if not rows:
        return pd.DataFrame()

    # Build the frame once: appending row by row copies the whole frame on every
    # step and relies on DataFrame.append, which pandas 2.0 removed.
    # astype(float) turns the None placeholders into NaN.
    return (pd.DataFrame(rows)
            .reset_index(drop=True)
            .astype(float)
            .sort_index(axis=1))

In [394]:
def df_for_testing(raw_data):
    """Build the test feature table from ``(track_df, file_name)`` pairs.

    Every track is passed through ``process_track`` with ``training=False``
    and tagged with the track number parsed from its file name. Rows that
    could not be fully processed are kept and carry NaN in the missing entries.

    Returns
    -------
    pandas.DataFrame
        One float row per track, in input order, columns in alphabetical order
        (features plus 'track_num'). An empty DataFrame when there are no rows.
    """
    rows = []
    for track_df, target in raw_data:
        track_row = process_track(track_df, track=target, plotting=False, training=False)
        track_row['track_num'] = get_track_num(target)
        rows.append(track_row)

    if not rows:
        return pd.DataFrame()

    # Build the frame once: appending row by row copies the whole frame on every
    # step and relies on DataFrame.append, which pandas 2.0 removed.
    # astype(float) turns the None placeholders into NaN.
    return (pd.DataFrame(rows)
            .reset_index(drop=True)
            .astype(float)
            .sort_index(axis=1))

In [582]:
# Extract one feature row per training track (process_track prints each track
# name and its estimated sampling rate).
X_df = df_for_training(data)

Aleksandr Pogodaev - Погодаев_бег_1
Frequency: 499
Aleksandr Pogodaev - Погодаев_бег_2
Frequency: 499
Aleksandr Pogodaev - Погодаев_бег_3
Frequency: 499
Aleksandr Pogodaev - Погодаев_бег_4
Frequency: 499
Aleksandr Pogodaev - Погодаев_езда_на_велосипеде_1
Frequency: 499
Aleksandr Pogodaev - Погодаев_езда_на_велосипеде_2
Frequency: 499
Aleksandr Pogodaev - Погодаев_езда_на_велосипеде_3
Frequency: 499
Aleksandr Pogodaev - Погодаев_езда_на_велосипеде_4
Frequency: 499
Aleksandr Pogodaev - Погодаев_подъем_по_лестнице_1
Frequency: 499
Aleksandr Pogodaev - Погодаев_подъем_по_лестнице_2
Frequency: 499
Aleksandr Pogodaev - Погодаев_подъем_по_лестнице_3
Frequency: 499
Aleksandr Pogodaev - Погодаев_подъем_по_лестнице_4
Frequency: 499
Aleksandr Pogodaev - Погодаев_стояние_на_месте_1
Frequency: 499
Aleksandr Pogodaev - Погодаев_стояние_на_месте_2
Frequency: 499
Aleksandr Pogodaev - Погодаев_стояние_на_месте_3
Frequency: 499
Aleksandr Pogodaev - Погодаев_стояние_на_месте_4
Frequency: 499
Aleksandr Po

Frequency: 100
Polina Roshchina - Рощина_автомобиль_4.csv
Frequency: 100
Polina Roshchina - Рощина_бег_1.csv
Frequency: 7
Polina Roshchina - Рощина_бег_2.csv
Frequency: 821
Polina Roshchina - Рощина_бег_3.csv
Frequency: 1829
Polina Roshchina - Рощина_бег_4.csv
Frequency: 16
Polina Roshchina - Рощина_велосипед_1.csv
Frequency: 453
Polina Roshchina - Рощина_велосипед_2.csv
Frequency: 7
Polina Roshchina - Рощина_велосипед_3.csv
Frequency: 1558
Polina Roshchina - Рощина_лестница_1.csv
Frequency: 7
Polina Roshchina - Рощина_лестница_2.csv
Frequency: 619
Polina Roshchina - Рощина_лестница_3.csv
Frequency: 100
Polina Roshchina - Рощина_лестница_4.csv
Frequency: 100
Polina Roshchina - Рощина_стояние_1.csv
Frequency: 13
Polina Roshchina - Рощина_стояние_2.csv
Frequency: 100
Polina Roshchina - Рощина_стояние_3.csv
Frequency: 100
Polina Roshchina - Рощина_стояние_4.csv
Frequency: 76
Polina Roshchina - Рощина_ходьба_1.csv
Frequency: 66
Polina Roshchina - Рощина_ходьба_2.csv
Frequency: 100
Polina R

Frequency: 100
Данила Котельников - Котельников_автомобиль_3.csv
Frequency: 100
Данила Котельников - Котельников_автомобиль_4.csv
Frequency: 100
Данила Котельников - Котельников_бег_1.csv
Frequency: 100
Данила Котельников - Котельников_бег_2.csv
Frequency: 100
Данила Котельников - Котельников_бег_3.csv
Frequency: 100
Данила Котельников - Котельников_бег_4.csv
Frequency: 100
Данила Котельников - Котельников_велосипед_1.csv
Frequency: 23
Данила Котельников - Котельников_велосипед_2.csv
Frequency: 100
Данила Котельников - Котельников_велосипед_3.csv
Frequency: 100
Данила Котельников - Котельников_велосипед_4.csv
Frequency: 100
Данила Котельников - Котельников_стояние_1.csv
Frequency: 100
Данила Котельников - Котельников_стояние_2.csv
Frequency: 100
Данила Котельников - Котельников_стояние_3.csv
Frequency: 100
Данила Котельников - Котельников_стояние_4.csv
Frequency: 100
Данила Котельников - Котельников_ходьба_1.csv
Frequency: 100
Данила Котельников - Котельников_ходьба_2.csv
Frequency: 10

Frequency: 21
Ольга Сучкова - МитинаОВ_ходьба_2.csv
Frequency: 78
Ольга Сучкова - МитинаОВ_ходьба_3.csv
Frequency: 549
Ольга Сучкова - МитинаОВ_ходьба_4.csv
Frequency: 527
Павел Зеленский - zelenskii_метро_1
Frequency: 251
Павел Зеленский - zelenskii_метро_2
Frequency: 251
Павел Зеленский - zelenskii_метро_3
Frequency: 251
Павел Зеленский - zelenskii_метро_4
Frequency: 251
Павел Зеленский - zelenskii_стояние_1
Frequency: 251
Павел Зеленский - zelenskii_стояние_2
Frequency: 251
Павел Зеленский - zelenskii_стояние_3
Frequency: 251
Павел Зеленский - zelenskii_стояние_4
Frequency: 251
Пыркова - Пыркова_лестница_1.csv
Frequency: 100
Пыркова - Пыркова_стояние_1.csv
Frequency: 100
Пыркова - Пыркова_ходьба_1.csv
Frequency: 100
Родион - Имаев_стояние_1.csv
Frequency: 100
Родион - Имаев_ходьба_1.csv
Frequency: 100
Родион - Имаев_ходьба_2.csv
Frequency: 100
Родион - Имаев_ходьба_3.csv
Frequency: 100
Рома - романоид_бег_1.csv
Frequency: 99
Рома - романоид_бег_2.csv
Frequency: 99
Рома - романоид_бе

In [601]:
# Extract one feature row per test track, tagged with its track number.
X_test_df = df_for_testing(testing_data)

track_0.csv
Frequency: 207
track_1.csv
Frequency: 395
track_2.csv
Frequency: 250
track_3.csv
Frequency: 499
track_4.csv
Frequency: 395
track_5.csv
Frequency: 100
track_6.csv
Frequency: 99
track_7.csv
Frequency: 251
track_8.csv
Frequency: 100
track_9.csv
Frequency: 198
track_10.csv
Frequency: 834
track_12.csv
Frequency: 100
track_13.csv
Frequency: 98
track_14.csv
Frequency: 100
track_15.csv
Frequency: 247
track_16.csv
Frequency: 99
track_17.csv
Frequency: 394
track_18.csv
Frequency: 408
track_19.csv
Frequency: 99
track_20.csv
Frequency: 184
track_21.csv
Frequency: 398
track_22.csv
Frequency: 500
track_23.csv
Frequency: 390
track_24.csv
Frequency: 233
track_25.csv
Frequency: 98
track_26.csv
Frequency: 499
track_27.csv
Frequency: 248
track_28.csv
Frequency: 100
track_29.csv
Frequency: 99
track_30.csv
Frequency: 406
track_31.csv
Frequency: 500
track_32.csv
Frequency: 99
track_34.csv
Frequency: 247
track_35.csv
Frequency: 407
track_36.csv
Frequency: 507
track_37.csv
Frequency: 242
track_38.

In [586]:
# Rows/columns of the training features before incomplete rows are dropped.
X_df.shape

(751, 7)

In [587]:
# Drop every training row that has a missing feature or label.
# NOTE(review): this mutates X_df in place, so the cell above shows the
# filtered shape if it is re-run afterwards.
X_df.dropna(inplace=True)

In [588]:
# Rows/columns of the training features after incomplete rows were dropped.
X_df.shape

(645, 7)

In [589]:
# Store the class codes as integers.
X_df['true_ans'] = X_df['true_ans'].astype(int)

In [618]:
# Preview the first rows of the training features.
X_df.head()

Unnamed: 0,index,amplitude,freq,freq_1,freq_1_amp,freq_2,freq_2_amp,true_ans
0,0,8.700749,499.0,3.952816,0.404179,2.620889,0.286208,2
1,1,8.515114,499.0,3.952863,0.409035,2.622196,0.290147,2
2,2,8.636395,499.0,3.939474,0.443656,2.626316,0.306911,2
3,3,7.978906,499.0,3.94519,0.485288,2.61535,0.280812,2
4,4,1.935292,499.0,0.955434,0.102151,0.98838,0.093211,3


In [602]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV

In [619]:
# Number of training rows per class code (ANSWERS maps activity -> code).
X_df['true_ans'].value_counts()

0    168
1    166
2    110
4    107
3     44
5     22
7     13
6     12
8      3
Name: true_ans, dtype: int64

In [592]:
# Renumber the rows 0..n-1 after the dropna; the previous index labels are kept
# as a regular column.
# NOTE(review): in-place; running this cell a second time inserts yet another
# index column — confirm it is executed exactly once.
X_df.reset_index(inplace=True)

In [620]:
import random

# Down-sample the training rows per class so that the large classes do not
# dominate. Quotas are keyed by class code: classes 0-4 are capped at 44 rows,
# classes 5-7 at 12 rows and class 8 at 3 rows.
SAMPLES_PER_CLASS = {0: 44, 1: 44, 2: 44, 3: 44, 4: 44, 5: 12, 6: 12, 7: 12, 8: 3}

# A dedicated, seeded generator makes the sample identical on every run.
BALANCE_SEED = 0
balance_rng = random.Random(BALANCE_SEED)

ind_balanced = []
for class_code, quota in SAMPLES_PER_CLASS.items():
    candidates = X_df.index[X_df.true_ans == class_code].tolist()
    # Never request more rows than the class has; random.sample would raise.
    ind_balanced += balance_rng.sample(candidates, min(quota, len(candidates)))

len(ind_balanced), len(X_df)

(259, 645)

In [621]:
# ind_balanced holds index *labels* (taken from X_df.index), so select by label.
# .iloc only gave the same rows while the index happened to equal 0..n-1.
X_df_balanced = X_df.loc[ind_balanced]

Training

In [652]:
# Features and target are taken from the class-balanced sample only.
X = X_df_balanced[['amplitude', 'freq_1', 'freq_2', 'freq_1_amp', 'freq_2_amp']]
y = X_df_balanced['true_ans']
# Hold out a third of the balanced rows; the fixed random_state makes the split repeatable.
# NOTE(review): no live code in this cell scores the model on X_test / y_test, and
# X_test is rebound to the competition features further down — confirm whether an
# evaluation step is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# The string literal below is a disabled experiment (grid search over max_depth).
# As a bare string expression it has no effect when the cell runs.
'''
tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
params = {'max_depth': range(2, 20)}
clf = GridSearchCV(tree, params, cv=5)
best_model = clf.fit(X_train, y_train)

print(best_model.best_estimator_.get_params('max_depth'))
y_pred = best_model.predict(X_test)
accuracy_score(y_test, y_pred)
'''

# Final model: entropy-criterion decision tree with a fixed maximum depth of 8.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=8)
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=0)

Testing

In [624]:
# Preview the first rows of the test features.
X_test_df.head()

Unnamed: 0,amplitude,freq,freq_1,freq_1_amp,freq_2,freq_2_amp,track_num
0,0.22579,207.0,2.821377,0.003561,2.771879,0.002932,0
1,1.03732,395.0,6.462029,0.009785,6.836146,0.00833,1
2,1.022796,250.0,1.666102,0.074564,1.637863,0.072924,2
3,0.22279,499.0,2.645287,0.000882,2.616218,0.000868,3
4,0.340241,395.0,6.077493,0.001908,6.48513,0.001796,4


In [606]:
# Store the track numbers as integers.
X_test_df['track_num'] = X_test_df['track_num'].astype(int)

In [607]:
# Rows/columns of the test features.
X_test_df.shape

(183, 7)

In [633]:
# Number of missing values in each of the five model features of the test set.
X_test_df.amplitude.isnull().sum(), X_test_df.freq_1.isnull().sum(), X_test_df.freq_2.isnull().sum(), X_test_df.freq_1_amp.isnull().sum(), X_test_df.freq_2_amp.isnull().sum()

(13, 13, 13, 14, 14)

In [634]:
# Keep an unmodified copy of the test features before missing values are filled in.
X_test_df_copy = X_test_df.copy()

In [635]:
# Fill missing amplitudes with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection only changes a temporary object
# under pandas Copy-on-Write and would leave X_test_df untouched.
X_test_df['amplitude'] = X_test_df['amplitude'].fillna(X_test_df['amplitude'].median())

In [636]:
# Fill missing freq_1 values with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection only changes a temporary object
# under pandas Copy-on-Write and would leave X_test_df untouched.
X_test_df['freq_1'] = X_test_df['freq_1'].fillna(X_test_df['freq_1'].median())

In [637]:
# Fill missing freq_2 values with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection only changes a temporary object
# under pandas Copy-on-Write and would leave X_test_df untouched.
X_test_df['freq_2'] = X_test_df['freq_2'].fillna(X_test_df['freq_2'].median())

In [638]:
# Fill missing freq_1_amp values with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection only changes a temporary object
# under pandas Copy-on-Write and would leave X_test_df untouched.
X_test_df['freq_1_amp'] = X_test_df['freq_1_amp'].fillna(X_test_df['freq_1_amp'].median())

In [639]:
# Fill missing freq_2_amp values with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection only changes a temporary object
# under pandas Copy-on-Write and would leave X_test_df untouched.
X_test_df['freq_2_amp'] = X_test_df['freq_2_amp'].fillna(X_test_df['freq_2_amp'].median())

In [640]:
# Re-count missing values after the fill; only three of the five model
# features are checked here (freq_1_amp and freq_2_amp are not).
X_test_df.amplitude.isnull().sum(), X_test_df.freq_1.isnull().sum(), X_test_df.freq_2.isnull().sum()

(0, 0, 0)

In [653]:
# Select the same five feature columns the classifier was fitted on.
# NOTE(review): this rebinds X_test, which earlier held the hold-out part of
# the train/test split.
X_test = X_test_df[['amplitude', 'freq_1', 'freq_2', 'freq_1_amp', 'freq_2_amp']]
# Predicted class code for every test track.
y_pred = clf.predict(X_test)

In [654]:
# Shape of the prediction array.
y_pred.shape

(183,)

In [655]:
# Pair every track number with its predicted class code (codes as defined in ANSWERS).
res = pd.DataFrame({'track_num' : X_test_df['track_num'], 'action' : y_pred})
# Preview the first ten predictions.
res.head(10)

Unnamed: 0,track_num,action
0,0,7
1,1,5
2,2,4
3,3,0
4,4,0
5,5,7
6,6,8
7,7,6
8,8,0
9,9,7


In [656]:
# Write the predictions without the index column.
# NOTE(review): assumes the tests/ directory already exists — confirm.
res.to_csv('tests/submission_5.csv', index=False)