## Teste das bibliotecas de Clustering de Time-Series

In [42]:
# importar bibliotecas
from dtaidistance import dtw
from dtaidistance import dtw_visualisation as dtwvis
from dtaidistance import clustering

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import col
import numpy as np
import pandas as pd

In [43]:
# criar contexto e configuração para o Spark
conf = SparkConf().setAppName("Covid-19 EDA")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Covid-19 EDA, master=local[*]) created by __init__ at <ipython-input-3-37d8f282b498>:3 

In [84]:
# ler arquivo de casos mundiais confirmados para dataframe Spark
confirmed_world_cases = sqlContext.read.csv('../datasets/time_series_covid19_confirmed_global.csv', sep=',', header=True)

In [85]:
# converte colunas de datas em linhas com melt
confirmed_world_cases_pd = confirmed_world_cases.toPandas().melt(id_vars=["Province/State", "Country/Region", "Lat", "Long"], var_name="date", value_name="Value")

In [86]:
# converte tipos das colunas para os formatos corretos

confirmed_world_cases_pd.Lat = confirmed_world_cases_pd.Lat.astype(float)
confirmed_world_cases_pd.Long = confirmed_world_cases_pd.Long.astype(float)
confirmed_world_cases_pd.date = pd.to_datetime(confirmed_world_cases_pd.date)
confirmed_world_cases_pd.Value = confirmed_world_cases_pd.Value.astype(int)

In [87]:
# Recria Spark dataframes, com estrtutura nova
schema_jhu = StructType([StructField('province_state', StringType(), True),
                         StructField('country_region', StringType(), True),
                         StructField('lat', FloatType(), True),
                         StructField('long', FloatType(), True),
                         StructField('date', TimestampType(), True),
                         StructField('value', IntegerType(), True)])

confirmed_world_cases = sqlContext.createDataFrame(confirmed_world_cases_pd, schema=schema_jhu)

In [88]:
# remover linhas onde value seja igual a zero
confirmed_world_cases = confirmed_world_cases.filter("value != 0")

In [89]:
# Criar coluna com qtde_dias desde o ínicio, pra cada país
column_list = ['province_state', 'country_region']

confirmed_world_cases = confirmed_world_cases.select("province_state","country_region", "lat", "long", "date", \
                                             "value", F.row_number()\
                                             .over(Window.partitionBy([col(x) for x in column_list]) \
                                             .orderBy(confirmed_world_cases['date'])).alias("num_days"))

In [90]:
# Criar Janela de Dados por País, Província e Numero de Dias (poderia ser a data também)
my_window = Window.partitionBy("country_region", "province_state").orderBy("num_days")

In [97]:
# Criar coluna para casos confirmados no dia
confirmed_world_cases = confirmed_world_cases.withColumn("prev", F.lag(confirmed_world_cases.value).over(my_window))
confirmed_world_cases = confirmed_world_cases.withColumn("on_day", F.when(F.isnull(confirmed_world_cases.value - confirmed_world_cases.prev), confirmed_world_cases.value)
                              .otherwise(confirmed_world_cases.value - confirmed_world_cases.prev))

# Remover as colunas de lag
confirmed_world_cases = confirmed_world_cases.drop(*['prev'])

In [98]:
# Preparar dataframe para clustering

# remover colunas de province_state, lat, long, date
columns_to_drop = ['province_state', 'lat', 'long', 'date']
clustering_df = confirmed_world_cases.drop(*columns_to_drop)

In [99]:
clustering_df.summary().show()

+-------+--------------+-----------------+-----------------+------------------+
|summary|country_region|            value|         num_days|            on_day|
+-------+--------------+-----------------+-----------------+------------------+
|  count|         18889|            18889|            18889|             18889|
|   mean|          null|5867.937900365292|39.74816030493938| 217.1474932500397|
| stddev|          null|43852.27773708838|26.28890594010864|1546.5062526308423|
|    min|   Afghanistan|                1|                1|            -10034|
|    25%|          null|               16|               18|                 0|
|    50%|          null|              141|               36|                 2|
|    75%|          null|              833|               56|                28|
|    max|      Zimbabwe|          1329260|              110|             36188|
+-------+--------------+-----------------+-----------------+------------------+



In [101]:
# criar um dataframe, convertendo num_days em colunas (pivot)
clustering_pivot_df = clustering_df.groupby("country_region").pivot("num_days").sum("on_day")

In [102]:
# criar numpy array, com values
clustering_array_np = np.array(clustering_pivot_df.select(clustering_pivot_df.columns[10:20]).limit(20).collect(), dtype=np.double)

In [103]:
clustering_array_np.shape

(20, 10)

In [104]:
# imprimir array sem notação científica
np.set_printoptions(suppress=True)
clustering_array_np

array([[   0.,    0.,    2.,    2.,    0.,    1.,    0.,    1.,    0.,
           0.],
       [   1.,    2.,    0.,    2.,    5.,    4.,    0.,    5.,   10.,
           4.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.],
       [   0.,    0.,    6.,    0.,   14.,    0.,    2.,    5.,    0.,
           7.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.],
       [   1.,    0.,    0.,    0.,    0.,    0.,    1.,    0.,    0.,
           0.],
       [   0.,   12.,    1.,  -15.,    0.,    0.,    0.,    3.,    0.,
           0.],
       [   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.],
       [   0.,    3.,    0.,    7.,    0.,    7.,    0.,    2.,    0.,
           2.],
       [   0.,    1.,    0.,    0.,    1.,    4.,   11.,    3.,   21.,
           1.],
       [   1.,    2.,    4.,    0.,   12., 

In [107]:
# Clustering
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
cluster_idx = model1.fit(clustering_array_np)
# Augment Hierarchical object to keep track of the full tree
model2 = clustering.HierarchicalTree(model1)
cluster_idx = model2.fit(clustering_array_np)


 95%|█████████▌| 19/20 [00:00<00:00, 14032.71it/s]

 95%|█████████▌| 19/20 [00:00<00:00, 12977.00it/s]


In [109]:
# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(clustering_array_np)

In [110]:
cluster_idx

In [111]:
model3.plot("plot.png")

(None, None)