In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


# Importing datasets

In [None]:
df_trts = pd.read_csv('train_time_series.csv')

In [None]:
df_trl = pd.read_csv('train_labels.csv')

# Merging Datasets based on timestamp

In [None]:
df = pd.merge(df_trl,df_trts, on=['timestamp'])

In [None]:
df.head()

# Keeping only useful columns

In [None]:
df = df[['timestamp','label','x','y','z']]

In [None]:
df.shape

In [None]:
df.isnull().sum()

# Check x, y, z for skew

In [None]:
df[['x','y','z']].skew()

# Transforming variables to avoid skew

In [None]:
df['z_norm'] = np.log10(df['z']+np.absolute(df['z'].min()) + 1)

In [None]:
df.skew()

# Splitting subdataset for training and testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
input_variables = ['x','y','z_norm']
X = df[input_variables]
# scikit-learn estimators expect a 1-D target; df[['label']] would be an (n, 1) frame
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=1)

## uso x, y , z_norm

# Fitting logistic regression

In [None]:
clf = LogisticRegression()

In [None]:
clf = LogisticRegression()

In [None]:
clf.fit(X_train,y_train)

In [None]:
print('logistic regression score=',clf.score(X_test,y_test))

In [None]:
def accuracy(estimator, X, y):
    """Fit ``estimator`` on ``(X, y)`` and return its accuracy on that same data.

    Because the estimator is scored on the data it was fitted on, the result
    is a training accuracy.

    Parameters
    ----------
    estimator : object exposing ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``.
    X : feature data accepted by ``estimator``.
    y : true labels for ``X``.

    Returns
    -------
    The value of ``accuracy_score(y, predictions)``.

    Side effect: ``estimator`` is left fitted on ``(X, y)``; later cells call
    ``estimator.score`` right after this function and rely on that.
    """
    # Fit once and predict from that same fit (the estimator used to be fitted twice).
    predicted = estimator.fit(X, y).predict(X)
    return accuracy_score(y, predicted)

In [None]:
print('logistic regression accuracy=',accuracy(clf,X_train,y_train))

# Fitting a random forest classifier

In [None]:
random_forest = RandomForestClassifier(max_depth=4,random_state=0)

In [None]:
print('random_forest_accuracy=',accuracy(random_forest,X_train,y_train))

In [None]:
print('Random Forest Score',random_forest.score(X_test,y_test))

# uso x , y, z

In [None]:
input_variables = ['x','y','z']
X = df[input_variables]
# scikit-learn estimators expect a 1-D target; df[['label']] would be an (n, 1) frame
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=1)

In [None]:
# Refit the logistic regression on the current split, then report
# held-out score followed by training accuracy.
clf.fit(X_train, y_train)
logreg_test_score = clf.score(X_test, y_test)
print('logistic regression score=', logreg_test_score)
logreg_train_accuracy = accuracy(clf, X_train, y_train)
print('logistic regression accuracy=', logreg_train_accuracy)

In [None]:
# Refit the random forest on the current split, then report
# training accuracy followed by held-out score.
random_forest.fit(X_train, y_train)
forest_train_accuracy = accuracy(random_forest, X_train, y_train)
print('random_forest_accuracy=', forest_train_accuracy)
forest_test_score = random_forest.score(X_test, y_test)
print('Random Forest Score', forest_test_score)

# uso x,y

In [None]:
def testing(df, input_variables):
    """Evaluate the notebook's two classifiers on a 50/50 split of ``df``.

    Uses the module-level ``clf`` and ``random_forest`` objects as they are
    bound at call time, so rebinding either name in a later cell changes
    which model is evaluated.

    Parameters
    ----------
    df : DataFrame containing the columns in ``input_variables`` and a
        ``'label'`` column.
    input_variables : list of column names used as features.

    Prints, in this order: logistic regression held-out score, logistic
    regression training accuracy, random forest training accuracy, random
    forest held-out score. Returns None.
    """
    X = df[input_variables]
    # 1-D target: a one-column DataFrame makes scikit-learn emit a DataConversionWarning.
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=1)

    # accuracy() fits the estimator on the training half itself, so a separate
    # fit() beforehand is redundant; the fitted model is then scored on the test half.
    logreg_train_accuracy = accuracy(clf, X_train, y_train)
    print('logistic regression score=', clf.score(X_test, y_test))
    print('logistic regression accuracy=', logreg_train_accuracy)

    forest_train_accuracy = accuracy(random_forest, X_train, y_train)
    print('random_forest_accuracy=', forest_train_accuracy)
    print('Random Forest Score', random_forest.score(X_test, y_test))

In [None]:
input_variables = ['x', 'y']
testing(df,input_variables)

# uso x,z

In [None]:
testing(df,['x','z'])

# uso y , z

In [None]:
testing(df,['y','z'])

# cambio depth de random forest a 3

In [None]:
random_forest = RandomForestClassifier(max_depth=3,random_state=0)

In [None]:
#testing x, y ,z
testing(df,['x','y','z'])

In [None]:
#testing x,y
testing(df,['x','y'])

In [None]:

plotting_variables = ['x', 'y', 'z']

axes = pd.plotting.scatter_matrix(df[plotting_variables], alpha=0.15, \
       color=(0,0,0), hist_kwds={"color":(0,0,0)}, facecolor=(1,0,0))
# show the plot.
plt.show()

In [None]:
np.corrcoef(df['x'],df['z'])

In [None]:
df.corr()

In [None]:
df.head(50)

In [None]:
plt.scatter(df['timestamp'],df['label'])

In [None]:
df.timestamp.describe()

In [None]:
df_trts['UTC time'].describe()

## Voy a agarrar la base con los datos de acelerometro y llenar los gaps de label entre mediciones cuando el label sea el mismo 

In [None]:
df2 = pd.merge(df_trts,df_trl, on=['timestamp'], how = 'left')

In [None]:
df2.head(100)

In [None]:
# Work on an independent copy: plain assignment would only alias df2, so the
# label edits made to df3 below would also change df2.
df3 = df2.copy()

In [None]:
# Fill an unlabeled sample only when the nearest label before it and the nearest
# label after it agree. Averaging the two would turn a gap between labels 1 and 3
# into a spurious 2 (and 2/4 into 3). Gaps whose neighbours disagree, and gaps at
# either end of the recording, are left as NaN.
label_before = df3['label'].ffill()
label_after = df3['label'].bfill()
df3['label'] = label_before.where(label_before.eq(label_after))

In [None]:
df3 = df3[['timestamp','x','y','z','label']]

In [None]:
df3.label = df3['label'].replace([1.5,2.5,3.5],np.nan)

In [None]:
df3.head()

In [None]:
df3 = df3.dropna()

In [None]:
df3.head(100)

In [None]:
df3 = df3.reset_index()

In [None]:
df3 = df3.drop('index',axis = 1)

In [None]:
df3

In [None]:
df3['acceleration'] = (df3.x**2 + df3.y**2 + df3.z**2)**0.5

In [None]:
df3.head()

In [None]:
# One bar per label, as tall as that label's mean acceleration (passing the
# overall mean as height drew every bar at the same level).
mean_acceleration = df3.groupby('label')['acceleration'].mean()
plt.bar(x=mean_acceleration.index, height=mean_acceleration.values)

In [None]:
pd.pivot_table(df3, values = ['acceleration','x','y','z'],index = 'label',aggfunc=['mean','std'])

In [None]:
df3.tail()

In [None]:
testing(df3,['x','y','z','acceleration'])

In [None]:
df3['acceleration_xz'] = (df3.x**2 + df3.z**2)**0.5

In [None]:
pd.pivot_table(df3, values = ['acceleration','acceleration_xz','x','y','z'],index = 'label',aggfunc=['mean','std'])

In [None]:
testing(df3,['acceleration_xz','x','z'])

In [None]:
testing(df3,['acceleration_xz','y'])

In [None]:
df3.skew()

In [None]:
# Shift z so that its minimum maps to 1 before taking the log, the same transform
# used for z_norm earlier in the notebook. log10(1 + z) on its own is undefined
# (NaN / -inf) for readings at or below -1.
df3['z_log'] = np.log10(df3['z'] + np.absolute(df3['z'].min()) + 1)

In [None]:
df3['acceleration_xz_log'] = np.log10(1+df3['acceleration_xz'])

In [None]:
df3.head()

In [None]:
df3.skew()

In [None]:
df3.z.describe()

# df4 elimine el cuartil inferior y el cuartil superior de z porque tenian muchos outliers

In [None]:

# Index labels of the rows whose z reading is above the upper cut-off (0.2)
indexNames = df3.index[df3['z'] > 0.2]

# df4 is df3 without those rows; df3 itself is not modified
df4 = df3.drop(index=indexNames)

In [None]:
# Index labels of the rows whose z reading is below the lower cut-off (-0.12)
indexNames = df4[df4['z'] < -0.12].index

# Rebind df4 to the filtered frame instead of mutating it in place
df4 = df4.drop(indexNames)

In [None]:
df4.z.skew()

In [None]:
df4.head()

In [None]:
df3.shape

In [None]:
df4.shape

In [None]:
df4 = df4.dropna()

In [None]:
df4.shape

In [None]:
df4.skew()

In [None]:
testing(df4,['y','acceleration_xz'])

In [None]:
plt.plot(df3.z)

In [None]:
plt.plot(df3.y)

In [None]:
plt.plot(df3.x)

In [None]:
plt.scatter(df.x,df.y,c = df.label, cmap = 'winter')


In [None]:
plt.plot(df4.timestamp,df4.acceleration_xz,'bo')

# Empiezo a trabajar con movelets

In [None]:
df3.to_csv(r'df3.csv')

In [None]:
df_label1_1 = df3.loc[120:220]
df_label1_1.head()

In [None]:
plt.figure(figsize=(12,8))
# Each line carries a label so plt.legend() has entries to show
for signal in ['x', 'y', 'z', 'acceleration']:
    plt.plot(df_label1_1.timestamp, df_label1_1[signal], label=signal)
plt.xlabel('timestamp')
plt.legend()
plt.show()

In [None]:
df3.head()

In [None]:
df_label2_1 = df3.loc[83:183]
df_label2_1.tail()

In [None]:
plt.figure(figsize=(12,8))
# Each line carries a label so plt.legend() has entries to show
for signal in ['x', 'y', 'z', 'acceleration']:
    plt.plot(df_label2_1.timestamp, df_label2_1[signal], label=signal)
plt.xlabel('timestamp')
plt.legend()
plt.show()

In [None]:
df_label3_1 = df3.loc[1582:1682]
df_label3_1.tail(10)

In [None]:
plt.figure(figsize=(12,8))
# Each line carries a label so plt.legend() has entries to show
for signal in ['x', 'y', 'z', 'acceleration']:
    plt.plot(df_label3_1.timestamp, df_label3_1[signal], label=signal)
plt.xlabel('timestamp')
plt.legend()
plt.show()

In [None]:
df_label4_1 = df3.loc[961:1061]
df_label4_1.tail(20)

In [None]:
plt.figure(figsize=(12,8))
# Each line carries a label so plt.legend() has entries to show
for signal in ['x', 'y', 'z', 'acceleration']:
    plt.plot(df_label4_1.timestamp, df_label4_1[signal], label=signal)
plt.xlabel('timestamp')
plt.legend()
plt.show()

In [None]:
# Thirty overlapping windows of 10 consecutive x samples, each starting one
# sample later than the previous one (no hand-maintained i/j counters).
movelet_1_x = [df_label1_1.x[start:start + 10] for start in range(30)]


In [None]:
# Thirty overlapping windows of 10 consecutive y samples, each starting one
# sample later than the previous one (no hand-maintained i/j counters).
movelet_1_y = [df_label1_1.y[start:start + 10] for start in range(30)]

In [None]:
# Thirty overlapping windows of 10 consecutive z samples, each starting one
# sample later than the previous one (no hand-maintained i/j counters).
movelet_1_z = [df_label1_1.z[start:start + 10] for start in range(30)]

In [None]:
def movelet_generator(df, window=10, n_movelets=30):
    """Build a movelet dictionary from a segment of accelerometer readings.

    A movelet is a run of ``window`` consecutive samples. Row ``k`` of the
    result holds, for each signal, the samples at positions ``k`` to
    ``k + window - 1`` of ``df``, so consecutive rows overlap and are shifted
    by one sample.

    Parameters
    ----------
    df : DataFrame with columns ``'x'``, ``'y'``, ``'z'`` and ``'acceleration'``.
        Its index is ignored; rows are taken in their current order.
    window : int, default 10
        Number of samples per movelet.
    n_movelets : int, default 30
        Maximum number of movelets to extract.

    Returns
    -------
    DataFrame with columns ``['x', 'y', 'z', 'acceleration']`` and one row per
    movelet. Every cell is a Series of length ``window`` indexed 0..window-1.
    Fewer than ``n_movelets`` rows are returned when ``df`` is too short to
    hold that many complete windows.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    axis = ['x', 'y', 'z', 'acceleration']
    df = df.reset_index(drop=True)

    # A window starting at position s needs rows s .. s + window - 1, so only the
    # first len(df) - window + 1 start positions give a complete window.
    n_rows = max(0, min(n_movelets, len(df) - window + 1))

    columns = {}
    for a in axis:
        # Fill an object array element by element so each cell keeps the whole
        # window as a single Series instead of being expanded into columns.
        cells = np.empty(n_rows, dtype=object)
        for start in range(n_rows):
            cells[start] = df[a].iloc[start:start + window].reset_index(drop=True)
        columns[a] = cells
    return pd.DataFrame(columns, columns=axis)

    

In [None]:
movelet_dict_1 = movelet_generator(df_label1_1)

In [None]:
# Inspect the 'x' movelets rather than overwriting the column: assigning a fixed
# 9-element placeholder list discards the windows and raises on a non-empty
# frame with a different number of rows.
movelet_dict_1['x'].head()

In [None]:
# NOTE(review): scratch experiment. This is a chained assignment (column lookup,
# then .iloc[0] = ...) that tries to store the first 10 x samples of df_label1_1
# in the first 'x' cell. Whether the write reaches movelet_dict_1 depends on
# pandas' copy semantics — TODO confirm it is still needed, or remove the cell.
movelet_dict_1['x'].iloc[0] = df_label1_1['x'][0:10]

In [None]:
movelet_dict_1['x']

In [None]:
m = pd.DataFrame(['x','y','z'])

In [None]:
df_label1_1 = df_label1_1.reset_index()
m['x'] = df_label1_1['x'][0:10]

In [None]:
m['x'] = df_label1_1['x'][0:10]

In [None]:
m['x']

In [None]:
toto = df_label1_1['x'][0:10]

In [None]:
toto

In [None]:
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent
m['x'] = pd.concat([m['x'], toto], ignore_index=True)

In [None]:
m.x

In [None]:
movelet_dict_1['label']  =1

In [None]:
movelet_dict_2 = movelet_generator(df_label2_1)

In [None]:
movelet_dict_2['label'] = 2

In [None]:
movelet_dict_3 = movelet_generator(df_label3_1)

In [None]:
movelet_dict_3['label']  = 3

In [None]:
movelet_dict_4 = movelet_generator(df_label4_1)

In [None]:
movelet_dict_4['label']  = 4

In [None]:
movelet_dict_4.acceleration.loc[0]

In [None]:
# Stack the four per-label dictionaries row-wise into one frame with a fresh
# 0..n-1 index (all other concat options are left at their defaults).
per_label_movelets = [movelet_dict_1, movelet_dict_2, movelet_dict_3, movelet_dict_4]
movelet_dict = pd.concat(per_label_movelets, ignore_index=True)

In [None]:
movelet_dict_1


In [None]:
movelet_dict_4['z'][29]

In [None]:
# Ten consecutive rows (positions 657-666) of df3 used as the unknown sample
probe_columns = ['x', 'y', 'z', 'acceleration', 'label']
X_test = df3[probe_columns].iloc[657:667]

In [None]:
X_test.head()

In [None]:
#ahora tengo que sacar la diferencia entre el movelet incongnita y el diccionario de movelet

In [None]:
mdictx =movelet_dict['x'][0:10][0].reset_index(drop = True)

In [None]:
mdictx

In [None]:
X_test_x = X_test['x'].reset_index(drop = True)

In [None]:
diff = mdictx - X_test_x

In [None]:
diff

In [None]:
np.sum(diff**2)

In [None]:
def label_prediction(X_test, movelet_dict):
    """Predict a label for an unknown movelet, separately for each signal.

    For each of ``'x'``, ``'y'``, ``'z'`` and ``'acceleration'`` the unknown
    samples are compared with every movelet in the dictionary using the sum of
    squared differences, and the label of the closest movelet is taken.

    Parameters
    ----------
    X_test : DataFrame with columns ``'x'``, ``'y'``, ``'z'``, ``'acceleration'``
        holding the samples of the unknown movelet. Its index is ignored.
    movelet_dict : DataFrame with the same four columns, each cell holding the
        samples of one movelet (same length as ``X_test``), plus a ``'label'``
        column.

    Returns
    -------
    list of four labels, in the order x, y, z, acceleration.

    Raises
    ------
    ValueError
        If ``movelet_dict`` has no rows, or a movelet's length differs from
        the number of rows in ``X_test``.
    """
    if movelet_dict.shape[0] == 0:
        raise ValueError("movelet_dict is empty")

    axis = ['x', 'y', 'z', 'acceleration']
    labels = movelet_dict['label'].reset_index(drop=True)
    min_label_predict = []
    for j in axis:
        # Plain arrays compare samples by position, whatever index the inputs carry.
        probe = X_test[j].to_numpy(dtype=float)
        # One distance per dictionary row, collected across the whole dictionary
        # before picking the minimum.
        differences = []
        for movelet in movelet_dict[j]:
            candidate = np.asarray(movelet, dtype=float)
            if candidate.shape != probe.shape:
                raise ValueError("movelet and test sample lengths differ")
            # Square the difference, not just the test sample.
            differences.append(float(np.sum((candidate - probe) ** 2)))
        min_loc = int(np.argmin(differences))
        min_label_predict.append(labels.iloc[min_loc])
    return min_label_predict
    


In [None]:
labels = label_prediction(X_test,movelet_dict )

In [None]:
# For each signal, find the dictionary movelet closest to the unknown sample
# (sum of squared differences) and record its label.
min_label_predict = []
axis = ['x','y','z','acceleration']
for j in axis:
    probe = X_test[j].reset_index(drop = True)
    differences= []
    for i in range(movelet_dict.shape[0]):
        mdict = movelet_dict[j][i].reset_index(drop = True)
        # Square the difference itself; the exponent used to apply to the test sample only.
        diff = np.sum((mdict - probe)**2)
        differences.append(diff)
    min_loc = differences.index(min(differences))
    min_label = movelet_dict['label'][min_loc]
    min_label_predict.append(min_label)
print(min_label_predict)


In [None]:
min_loc

In [None]:
# Squared distance between the unknown x samples and every x movelet
x_testx = X_test['x'].reset_index(drop = True)
differences = []
for i in range(movelet_dict.shape[0]):
    mdict_x = movelet_dict['x'][i].reset_index(drop = True)
    diff = np.sum((mdict_x - x_testx)**2)
    differences.append(diff)
# Pick the closest movelet once, after all distances are known, instead of
# rescanning the list on every iteration.
min_loc = differences.index(min(differences))
min_label = movelet_dict['label'][min_loc]

In [None]:
movelet_dict

In [None]:
min_loc

In [None]:
movelet_dict.label.loc[110]

In [None]:
#HAsta aca obtuve el label para el eje x, tengo que hacer lo mismo para 'y' y 'z' y definir de acuerdo al que sea mayoritario.
#si tengo los 3 distintos tendria que ver cual es para cada uno el segundo mejor ajuste...

In [None]:
# Squared distance between the unknown y samples and every y movelet
x_testy = X_test['y'].reset_index(drop = True)
differences = []
for i in range(movelet_dict.shape[0]):
    mdict_y = movelet_dict['y'][i].reset_index(drop = True)
    diff = np.sum((mdict_y - x_testy)**2)
    differences.append(diff)
# Pick the closest movelet once, after all distances are known, instead of
# rescanning the list on every iteration.
min_loc = differences.index(min(differences))
min_label = movelet_dict['label'][min_loc]

In [None]:
movelet_dict.label.loc[min_loc]

In [None]:
differences[min_loc]

In [None]:
movelet_dict['index_m'] = range(0,movelet_dict.shape[0])

In [None]:
plt.figure(figsize=(12,8))
samples = range(10)
# Dictionary movelet at row 9
plt.plot(samples, movelet_dict.x[9], color = 'grey', label = 'movelet 9: x')
plt.plot(samples, movelet_dict.y[9], color = 'yellow', label = 'movelet 9: y')
plt.plot(samples, movelet_dict.z[9], color = 'green', label = 'movelet 9: z')
# Unknown sample being classified
plt.plot(samples, x_testy, color = 'orange', label = 'test: y')
plt.plot(samples, x_testx, color = 'black', label = 'test: x')
plt.plot(samples, X_test['z'].reset_index(drop = True), color = 'blue', label = 'test: z')
plt.plot(samples, X_test['acceleration'].reset_index(drop = True), color = 'red', label = 'test: acceleration')
# Every line is labelled, so the legend is no longer empty
plt.legend()
plt.show()

In [None]:
movelet_dict.z[9]

In [None]:
movelet_dict.tail()