In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from statsmodels.tsa.vector_ar.var_model import VARProcess

random_generator = np.random.default_rng(0)


In [3]:
def five_ancestor_structure(lags, nvar, n_ancestors=5):
    """Build a random, stable VAR process with a fixed number of lagged parents per variable.

    For every target variable, ``n_ancestors`` distinct *other* variables are
    drawn without replacement and each one is linked to the target at a single,
    randomly chosen lag. In addition, every variable depends on its own past
    at every lag. Each non-zero coefficient has a magnitude in [0.5, 0.9) and a
    random sign. The whole coefficient tensor is then shrunk by a factor of
    0.95 (at least once) until ``VARProcess.is_stable()`` reports stability.

    All random draws come from the module-level ``random_generator``.

    Parameters
    ----------
    lags : int
        Number of lags of the VAR process.
    nvar : int
        Number of variables of the VAR process.
    n_ancestors : int, default 5
        Number of distinct parent variables (other than the target itself)
        assigned to each target.

    Returns
    -------
    VARProcess
        Process whose coefficient array has shape ``(lags, nvar, nvar)`` and is
        indexed ``[lag, target, source]``, built with no exogenous coefficients
        and an identity ``sigma_u``.

    Raises
    ------
    ValueError
        If there are fewer than ``n_ancestors`` candidate parents per target,
        i.e. ``nvar <= n_ancestors``.
    """
    # Without this guard the candidate pool runs empty mid-loop and numpy
    # raises a much less helpful ValueError from `choice`.
    if 0 < nvar <= n_ancestors:
        raise ValueError(
            f"cannot pick {n_ancestors} distinct ancestors per target "
            f"among {nvar} variables (need nvar > {n_ancestors})"
        )

    matrix = np.zeros((lags, nvar, nvar))
    for target in range(nvar):
        # Every variable except the target itself is a potential parent.
        candidate = list(range(nvar))
        candidate.remove(target)
        for _ in range(n_ancestors):
            source = random_generator.choice(candidate)
            candidate.remove(source)  # sample parents without replacement
            lag = random_generator.choice(range(lags))
            # Magnitude is drawn before the sign; keep this order so the
            # random stream (and therefore the generated data) is unchanged.
            matrix[lag, target, source] = (0.5 + random_generator.random() * 0.4) * random_generator.choice([-1, 1])
        # Auto-regressive terms: the target depends on itself at every lag.
        for lag in range(lags):
            matrix[lag, target, target] = (0.5 + random_generator.random() * 0.4) * random_generator.choice([-1, 1])

    # Shrink all coefficients together until the process is stable. The
    # shrink is applied before the first stability check, so the returned
    # coefficients are always scaled by at least 0.95.
    while True:
        matrix = matrix * 0.95
        process = VARProcess(matrix, None, np.identity(nvar))
        if process.is_stable():
            return process


def build_ground_truth(var_names, process, add_index=0):
    """List the lagged dependencies encoded in a process's non-zero coefficients.

    Parameters
    ----------
    var_names : iterable of str
        Target variables to inspect; each name is converted with ``int`` and
        used as the target index into ``process.coefs``.
    process : object with a ``coefs`` attribute
        ``coefs`` is indexed as ``[lag, target, source]``.
    add_index : int, default 0
        Offset added to both source and target indices before they are
        turned into strings.

    Returns
    -------
    list of tuple
        ``(source, effect, lag)`` triples, where ``source`` and ``effect`` are
        strings and ``lag`` is 1-based (coefficient slice 0 is lag 1).
    """
    edges = []
    for name in var_names:
        target = int(name)
        # Boolean (lag, source) mask of the coefficients feeding this target.
        active = process.coefs[:, target, :] != 0.
        lag_positions, source_positions = active.nonzero()
        effect = str(target + add_index)
        edges.extend(
            (str(src + add_index), effect, lag_pos + 1)  # lag direction from source code
            for lag_pos, src in zip(lag_positions, source_positions)
        )
    return edges
    
# Settings shared by every simulated system.
nbvars = 100
lags = 5
var_names = [str(x) for x in range(nbvars)]

datas = []
ground_truth = []
for i in range(100):
    print(i)
    # Draw a fresh random stable process, then its starting values; the
    # order of these two steps fixes the order of the random draws.
    process = five_ancestor_structure(lags, nbvars)
    initial_values = random_generator.random(size=(lags, nbvars))
    simulated = process.simulate_var(steps=4000, seed=i, initial_values=initial_values)
    # Keep only the last 3500 entries of the 4000 simulated steps.
    data = simulated[-3500:]
    # Shift the variable labels by i*nbvars so they stay distinct across systems.
    relations = build_ground_truth(var_names, process, add_index=i * nbvars)
    datas.append(data)
    ground_truth.extend(relations)


# Place all systems side by side and label the columns "0", "1", ...
data = np.concatenate(datas, axis=1)
df = pd.DataFrame(data, columns=[str(col) for col in range(data.shape[1])])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [4]:
# Write the simulated series to disk, without the row index.
df.to_csv(path_or_buf="./returns/data_0.csv", index=False)

In [5]:
# Turn the (source, effect, lag) triples into a three-column frame and show it.
ground_truth = pd.DataFrame(data=ground_truth)
ground_truth

Unnamed: 0,0,1,2
0,0,0,1
1,66,0,1
2,0,0,2
3,0,0,3
4,28,0,3
...,...,...,...
99995,9950,9999,4
99996,9996,9999,4
99997,9999,9999,4
99998,9926,9999,5


In [6]:
# Write the ground-truth relations as bare rows: no index column, no header line.
ground_truth.to_csv(path_or_buf="./ground_truths/data_0.csv", index=False, header=False)