In [1]:
'''
Load the training dataframe
Sample 1000 rows
Convert the rows to the appropriate format for fast DTW
Pairwise DTW on all samples - 10^6 computations, ~10^-3 s each
'''

'\nLoad the training dataframe\nSample 1000 rows\nConvert the rows to the appropriate format for fast DTW\nPairwise DTW on all samples - 10^6 computations, ~10^-3 s each\n'

In [2]:
import sklearn.metrics.pairwise as skm
import sklearn.preprocessing as skp
import pandas as pd
import numpy as np
import fastdtw
from timeit import default_timer as timer
import sklearn.cluster as skc
from sklearn.model_selection import train_test_split

In [3]:
def dtw(arr1, arr2):
    """Return the approximate dynamic-time-warping distance between two sequences.

    Thin wrapper around ``fastdtw.fastdtw`` that discards the warping path and
    keeps only the accumulated distance, so it can be used as a pairwise metric
    callable.

    Parameters
    ----------
    arr1, arr2 : array-like
        The two sequences to compare.

    Returns
    -------
    The first element of the ``fastdtw`` result (the distance).
    """
    # radius=1 is the search radius passed to fastdtw; dist=2 selects the
    # 2-norm as the point-wise distance.
    result = fastdtw.fastdtw(arr1, arr2, radius=1, dist=2)
    distance = result[0]
    return distance

## Read-in and Normalize data

In [4]:
# Load the raw training table. The shape is printed before the 'Page' column
# is dropped, so the column count still includes it.
rawData_df = pd.read_csv('input/train_1.csv')
print(rawData_df.shape)

# Fill all NaN with zeros and drop the 'Page' column so that only the
# remaining columns reach the scaler. The name is re-bound rather than
# mutated with inplace=True.
rawData_df = rawData_df.fillna(value=0.0).drop('Page', axis=1)

# Shuffle the dataframe. The fixed seed makes the row order (and therefore
# every positional slice taken from the scaled frame later) reproducible
# across kernel restarts.
rawData_df = rawData_df.sample(frac=1, random_state=0)

# Min-max scale to [0, 1]. Could also scale [0, 1/N] or z-normalize.
# NOTE(review): MinMaxScaler scales each column independently (across rows).
# If every row is meant to be one time series, per-row scaling may be what
# is actually wanted -- confirm.
scaled_data = skp.MinMaxScaler(feature_range=(0, 1), copy=True).fit_transform(rawData_df)

# Re-wrap the scaled array so it keeps the shuffled row labels and the
# original column labels.
scaledData_df = pd.DataFrame(data=scaled_data,            # scaled values
                index=rawData_df.index.values,            # shuffled row labels
                columns=rawData_df.columns)               # original column labels


(145063, 551)


## Split the dataframe

In [5]:
# The first 1000 rows of the scaled frame form the "data rows" sample.
dataRows_df = scaledData_df.head(1000)

# Dump the sample for offline processing: no header, no index, values
# separated by a single space. Because line_terminator is also a space,
# consecutive rows are separated by a space rather than a newline.
dataRows_df.to_csv(
    path_or_buf='Processing/Data_Rows.txt',
    sep=' ',
    na_rep=0.0,
    header=False,
    index=False,
    index_label=False,
    line_terminator=' ',
)



In [None]:
# Write the scaled rows from position 10000 onwards to disk in chunks of
# 10000 rows, one file per chunk (Query_Rows_0.txt, Query_Rows_1.txt, ...).
# NOTE(review): the chunks start at row 10000 while the sample written to
# Data_Rows.txt only covers the first 1000 rows, so rows 1000-9999 end up in
# neither output -- confirm this gap is intended.
splitPoints = list(range(10000, len(scaledData_df), 10000))
for i, val in enumerate(splitPoints):
    # iloc clips at the end of the frame, so the last chunk may be shorter.
    tmpQueryRows = scaledData_df.iloc[val:val+10000]
    # The chunk index is an int and must be converted before concatenation.
    filePath = 'Processing/Query_Rows_' + str(i) + '.txt'
    # Same on-disk layout as the sample file: space-separated, no header or index.
    tmpQueryRows.to_csv(path_or_buf=filePath, header=False, sep=' ',index=False, index_label=False, line_terminator=' ', na_rep=0.0)
    

## Write our files for offline processing

In [7]:
## Pairwise DTW of all "data-rows" (1000 rows)

In [8]:
# Compute the pairwise matrix over the sampled rows with `dtw` as the metric
# callable, and time the whole computation.
# n_jobs=1 runs serially; set n_jobs=-1 to turn on parallel execution.
start = timer()
K = skm.pairwise_kernels(
    dataRows_df,
    metric=dtw,
    n_jobs=1,
)
end = timer()
elapsed = end - start
print(elapsed)

# Persist the matrix as comma-separated text at the .gz path.
np.savetxt(fname="Processing/DTW_Matrix.out.gz", X=K, delimiter=',')

# Timing notes:
# - fastdtw costs on the order of 4e-4 per pair.
# - At 10^3 samples there are 10^6 matrix elements, so roughly 10^2 seconds.
# - Tolerance for production is up to 10^4 elements, i.e. several hours of runtime.

383.9774313321104


AttributeError: module 'numpy' has no attribute 'savetext'

In [10]:
# Write the pairwise matrix K to disk as comma-separated text.
np.savetxt(fname="Processing/DTW_Matrix.out.gz", X=K, delimiter=',')


In [None]:
## Run Hierarchical clustering on the matrix
# K holds the pairwise DTW values between samples, so it is handed to the
# estimator as a precomputed distance matrix instead of being treated as a
# feature matrix or a connectivity graph.
# Ward linkage only accepts Euclidean distances on raw features, so average
# linkage is used with the precomputed matrix.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# switch the keyword if the environment is upgraded.
cluster = skc.AgglomerativeClustering(n_clusters=10, affinity='precomputed', compute_full_tree=True, linkage='average')
# One integer cluster label per row of K.
labels = cluster.fit_predict(K)