In [4]:
# Install the Python packages and system tools this notebook depends on.
# Each line runs in a shell via IPython's `!` escape.
!python3 -m pip install PyMySQL
!python3 -m pip install SQLAlchemy
!python3 -m pip install google-cloud-storage
!python3 -m pip install --upgrade --quiet scikit-sound
!python3 -m pip install --upgrade --quiet pygame
!sudo apt-get -y install ffmpeg
!sudo apt-get -y install python3-pymysql
# The requirement is quoted so the shell cannot treat "[gcp]" as a glob
# pattern, and pinned to the version recorded in this cell's saved output
# so that a re-run reproduces the same environment.
!python3 -m pip install 'apache-beam[gcp]==2.16.0'

Successfully installed apache-beam-2.16.0 avro-python3-1.9.1 crcmod-1.7 dill-0.3.0 docopt-0.6.2 fastavro-0.21.24 fasteners-0.15 google-apitools-0.5.28 google-cloud-bigquery-1.17.1 google-cloud-bigtable-1.0.0 google-cloud-datastore-1.7.4 google-cloud-pubsub-1.0.2 hdfs-2.5.8 httplib2-0.12.0 mock-2.0.0 monotonic-1.5 oauth2client-3.0.0 pbr-5.4.3 pymongo-3.9.0 pyyaml-3.13


In [None]:
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions, StandardOptions
import pickle
import numpy as np

# Pipeline configuration: the runner, the Google Cloud settings and the
# output location used by the pipeline built further down in this cell.
options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DataflowRunner'

# Google Cloud view of the same options object: Cloud Storage locations
# first, then the project and the job name.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.staging_location = 'gs://wdp-data/binaries'
google_cloud_options.temp_location = 'gs://wdp-data/temp'
google_cloud_options.project = 'wdp-ds'
google_cloud_options.job_name = 'heavy-dtw'

# Path prefix handed to WriteToText for the formatted distance lines.
outputs_prefix = 'gs://wdp-data/distances.txt'
with beam.Pipeline(options=options) as p:
    
    # Load the sequences that dtw_jobs() and dtw() index into. The file is
    # opened in a context manager so the handle is closed as soon as the
    # pickle has been read.
    # NOTE(review): pickle.load can execute arbitrary code while
    # deserialising; only use a "data.p" that comes from a trusted source.
    with open("data.p", "rb") as latent_file:
        latent = pickle.load(latent_file)
    def dtw_jobs():
        """Yield the ``(i, j)`` index pairs of ``latent`` that should be compared.

        Only pairs with ``i < j`` are produced, and a pair is produced only
        when element 0 of ``latent[i]`` and element 0 of ``latent[j]`` have
        different lengths.
        """
        # NOTE(review): pairs whose lengths are equal are skipped. This looks
        # like a missing threshold comparison on the length difference rather
        # than a deliberate filter -- confirm the intended condition.
        total = len(latent)
        for first in range(total):
            for second in range(first + 1, total):
                if len(latent[first][0]) != len(latent[second][0]):
                    yield (first, second)

    def dtw(job):
        """Banded dynamic-time-warping distance between two entries of ``latent``.

        Args:
            job: indexable whose items 0 and 1 are indices into ``latent``.
                Element 0 of each selected entry is used as the sequence; it
                is indexed as a 2-D array with one row per step.

        Returns:
            ``(index_a, index_b, distance)`` where ``distance`` is the
            accumulated alignment cost divided by the sum of both sequence
            lengths.
        """
        # Imported inside the function, as in the original -- presumably so
        # the name resolves when the function runs on a remote worker
        # (TODO confirm).
        import numpy as np

        gap_cost = 10.0
        index_a, index_b = job[0], job[1]
        seq_a = latent[index_a][0]
        seq_b = latent[index_b][0]
        len_a = seq_a.shape[0]
        len_b = seq_b.shape[0]
        # Half-width of the band of cells that is filled in around the diagonal.
        band = max(round((len_a + len_b) / 10), abs(len_a - len_b) + 2)

        # Every cell starts as "unreachable"; only the origin costs nothing.
        cost = np.full((len_a + 1, len_b + 1), float('inf'))
        cost[0, 0] = 0.0
        for row in range(1, len_a + 1):
            first_col = max(1, row - band)
            stop_col = min(len_b + 1, row + band)
            for col in range(first_col, stop_col):
                # Euclidean distance between the two rows being aligned.
                delta = seq_a[row - 1, :] - seq_b[col - 1, :]
                step_cost = np.sqrt(np.sum(np.square(delta)))
                # A diagonal move is free; advancing only one sequence pays
                # the gap cost.
                cheapest_prev = min(
                    cost[row - 1, col - 1],
                    gap_cost + cost[row - 1, col],
                    gap_cost + cost[row, col - 1],
                )
                cost[row, col] = step_cost + cheapest_prev

        final_cost = cost[len_a, len_b]
        if np.isinf(final_cost):
            print('\t\tERROR: inf in warping')
        return (index_a, index_b, final_cost / (len_a + len_b))

    # Build the pipeline: one element per index pair, one DTW distance per
    # pair, then one "i, j, distance" text line per result written through
    # WriteToText with a single shard.
    jobs = p | 'Create Jobs' >> beam.Create(dtw_jobs())
    distances = jobs | 'PairwiseDistance' >> beam.Map(dtw)
    (
        distances
        | 'Format results' >> beam.Map(lambda row: '{}, {}, {}'.format(row[0], row[1], row[2]))
        | 'Write results' >> beam.io.WriteToText(outputs_prefix, num_shards=1)
    )
    # There is deliberately no explicit p.run() / wait_until_finish() here.
    # Leaving the enclosing `with beam.Pipeline(...) as p:` block already
    # runs the pipeline and waits for it to finish, so an explicit run
    # inside the block would execute the whole job a second time.

