In [1]:
import pandas as pd
import itertools
from scipy.signal import savgol_filter
from tqdm.notebook import tqdm
import json
from collections import defaultdict
import pickle

In [2]:
# Load the prepared case data; the preview further down shows the columns
# Country_Region and Confirmed, indexed by Last_Update.
df = pd.read_parquet("tmp/data.parquet")

In [3]:
# Distinct country names, in order of first appearance in the frame.
countries = df["Country_Region"].unique().tolist()

In [4]:
# Show the first five country names as a quick sanity check.
countries[:5]

['China', 'Romania', 'Kuwait', 'Bahrain', 'Russia']

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,Country_Region,Confirmed
Last_Update,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-23,China,703.0
2020-03-08,China,20632.0
2020-03-12,China,279.0
2020-03-13,China,266.0
2020-03-14,China,12471.0


In [17]:
from consts import predict_days
# country name -> full smoothed new-case series (stored before the history/holdout split)
cache = {}


def get_new_cases(d, c, holdout_days=None):
    """Return smoothed daily new cases for country ``c`` as ``(history, holdout)``.

    The ``Confirmed`` column of the rows whose ``Country_Region`` equals ``c``
    is differenced row-to-row to obtain new cases, then smoothed with a
    Savitzky-Golay filter (window 51, polynomial order 2).

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with ``Country_Region`` and ``Confirmed`` columns.
    c : str
        Country name to select.
    holdout_days : int, optional
        Number of trailing points returned separately as the holdout part.
        Defaults to ``predict_days`` from ``consts``. Values ``<= 0`` put the
        whole series into ``history`` and return an empty holdout.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(history, holdout)``. Both are empty when the country has fewer
        points than the filter window.

    Notes
    -----
    Results are memoised in the module-level ``cache`` keyed by country name
    only, so the cache assumes every call passes the same frame.
    """
    if holdout_days is None:
        # Resolved lazily so the default follows the project-level constant.
        holdout_days = predict_days

    smoothed = cache.get(c)
    if smoothed is None:
        confirmed = d.loc[d["Country_Region"] == c, "Confirmed"]
        daily = confirmed.diff().dropna()
        try:
            filtered = savgol_filter(daily.to_numpy(), 51, 2)
        except ValueError:
            # savgol_filter rejects input shorter than the window length.
            return pd.Series(dtype=float), pd.Series(dtype=float)
        # Keep each smoothed value on the row it was computed for: the diff
        # drops the first row, so the matching index is daily.index.
        smoothed = pd.Series(filtered, index=daily.index)
        cache[c] = smoothed

    # Cache hits and misses both fall through here, so the return shape is
    # always a 2-tuple.
    if holdout_days > 0:
        return smoothed.iloc[:-holdout_days], smoothed.iloc[-holdout_days:]
    return smoothed, smoothed.iloc[:0]

In [18]:
# Map every country to the pair returned by get_new_cases for it.
new_cases = dict((name, get_new_cases(df, name)) for name in countries)

In [8]:
# Every unordered pair of distinct countries, in itertools.combinations order.
pairs = [*itertools.combinations(countries, 2)]

In [9]:
# For every country pair, find the shift (in rows) of the first series that
# maximises its correlation with the second, and record well-correlated pairs.
result = defaultdict(dict)
threshold = 0.4  # minimum correlation for a pair to be recorded
max_lag = 30     # shifts searched: -max_lag .. +max_lag, both inclusive
for c_one, c_two in tqdm(pairs):
    c1, _ = new_cases[c_one]
    c2, _ = new_cases[c_two]
    if c1.empty or c2.empty:
        # Countries whose smoothing failed have no series to correlate.
        continue
    values = []
    for i in range(-max_lag, max_lag + 1):
        corr = c1.shift(i).corr(c2)
        # Drop undefined correlations: max() keeps a leading NaN because
        # every comparison against NaN is False.
        if pd.notna(corr):
            values.append((corr, i))
    if not values:
        continue
    similarity, lag = max(values, key=lambda x: x[0])
    if lag < 0:
        # A negative best shift swaps which country is used as the outer key.
        # NOTE(review): the stored lag keeps its sign after the swap -- confirm
        # that consumers of tmp/lags.json expect that.
        c_two, c_one = c_one, c_two
    # A best shift on the search boundary or at zero is not recorded.
    if similarity > threshold and lag not in (-max_lag, max_lag, 0):
        result[c_two][c_one] = {'similarity': similarity, 'lag': lag}

  0%|          | 0/7021 [00:00<?, ?it/s]

In [13]:
# Persist the lag table as pretty-printed JSON with sorted keys.
with open('tmp/lags.json', 'w') as lags_file:
    json.dump(result, lags_file, sort_keys=True, indent='  ')

In [14]:
# Persist the per-country series; note the file holds a pickle despite its
# .json extension.
with open('tmp/cases.json', 'wb') as cases_file:
    pickle.dump(new_cases, cases_file)

In [12]:
#with open('tmp/cases.json', 'rb') as f:
#    w = pickle.load(f)