## Libraries, configuration

In [None]:
!pip install signed_backbones

In [None]:
import pandas as pd
import numpy as np
import random
from scipy.linalg import orthogonal_procrustes
import signed_backbones as sb


## Read data, extract backbones

In [None]:
# Load the raw table. The file has no header row, so columns are addressed by
# integer position (column 0 is compared against the year and columns 1-3 are
# passed to the backbone extraction further down).
# Missing cells are forward-filled from the row above; `.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and produces the same result.
data = pd.read_csv('input/migration.csv', header=None).ffill()
# Drop rows whose column 3 is exactly zero and renumber the remaining rows.
data = data.loc[data[3] != 0, :].reset_index(drop=True)


In [None]:
# Extract one backbone per year (2008-2020 inclusive) and store each result as
# output/backbones/<year>.csv.
for year in range(2008, 2021):
  # Rows of this year only; columns 1, 2 and 3 form the edge list handed to
  # the extractor (presumably source, target and weight -- confirm against
  # the signed_backbones documentation).
  yearly_rows = data[data[0] == year]
  edge_list = yearly_rows[[1, 2, 3]]
  # Both thresholds are 0, and weights and significance are requested in the
  # returned table.
  backbone = sb.extract(
    edge_list,
    directed=True,
    significance_threshold=0,
    vigor_threshold=0,
    return_weights=True,
    return_significance=True,
  )
  pd.DataFrame(backbone).to_csv(f'output/backbones/{year}.csv', index=False)

## Learn and align embeddings

In [None]:
# Training hyper-parameters shared by every (vector_size, year) run.
n_epochs = 100
learning_rate = 0.01

# One [year, vector_size, last-epoch total cost, last-epoch mean cost] row per run.
cost_by_vector_year = []

for vector_size in range(2,13):
  for year in range(2008, 2021):
    # Only the first three backbone columns are used: two node labels and the
    # value the cosine similarity of their vectors is fitted to.
    bb = pd.read_csv('output/backbones/{}.csv'.format(year)).iloc[:, [0,1,2]]

    # Plain (node, node, score) tuples. itertuples avoids integer indexing into
    # a string-labelled row Series, which relies on deprecated positional
    # fallback in pandas.
    coccur = list(bb.itertuples(index=False, name=None))

    # Every node that appears in either of the two label columns gets a vector.
    ctrs = list(set(bb['0']) | set(bb['1']))
    n_nodes = len(ctrs)
    # Small random initial vectors, uniform in [-0.5, 0.5) scaled by 1 / (vector_size + 1).
    W = (np.random.rand(n_nodes, vector_size) - 0.5) / float(vector_size + 1)
    w = dict(zip(ctrs, W))

    # Total absolute error of each epoch, in order.
    cost_info = []
    for i in range(n_epochs):
      global_cost = 0
      random.shuffle(coccur)
      for (v_main, v_context, score) in coccur:
        norm_main, norm_context  = np.linalg.norm(w[v_main]), np.linalg.norm(w[v_context])
        cosine = np.dot(w[v_main], w[v_context]) / (norm_main * norm_context)
        cost = np.abs(cosine - score)
        global_cost += cost

        # Sign of the residual. np.sign yields 0 when the cosine already equals
        # the target, whereas dividing the residual by its absolute value
        # would give 0/0 = NaN and corrupt both vectors.
        direction = np.sign(score - cosine)
        # Gradient of |cosine - score| with respect to each vector.
        grad_main    = direction * (cosine * (w[v_main]    / (norm_main**2))    - (w[v_context] / (norm_context * norm_main)))
        grad_context = direction * (cosine * (w[v_context] / (norm_context**2)) - (w[v_main] /    (norm_main * norm_context)))

        w[v_main]    -= (learning_rate * grad_main )
        w[v_context] -= (learning_rate * grad_context )

      # Project every vector back onto the unit sphere after each epoch.
      for ctr in w:
        w[ctr] = w[ctr] / np.linalg.norm(w[ctr])
      cost_info.append(global_cost)

    # Summary statistics refer to the final epoch only.
    avg_cost = global_cost / len(coccur)
    print(year, vector_size, global_cost, avg_cost)
    cost_by_vector_year.append([year, vector_size, global_cost, avg_cost])
    # One row per node (label as index) and one column per embedding dimension.
    pd.DataFrame(w).transpose().to_csv('output/emb/{}_{}.csv'.format(year, vector_size), index = True)
    pd.DataFrame(cost_info).to_csv('output/emb/{}_{}_cost.csv'.format(year, vector_size), index = True)


In [None]:
# For every embedding size, rotate each year's embedding onto the (already
# aligned) embedding of the previous year with an orthogonal Procrustes fit,
# then write the aligned tables to output/emb/<year>_<size>_aligned.csv.
for vector_size in range(2,13):
  # Node label is the index; one column per embedding dimension.
  embs = {
    year: pd.read_csv('output/emb/{}_{}.csv'.format(year, vector_size), index_col=0)
    for year in range(2008, 2021)
  }

  for year in range(2008, 2020):
    # Nodes present in both years, in ONE common order. Filtering each frame
    # separately with `isin` keeps each file's own row order, so row i of A
    # and row i of B could belong to different nodes and the fitted rotation
    # would be meaningless. Indexing both frames with the same label list
    # guarantees the rows are paired node by node.
    shared = embs[year].index.intersection(embs[year+1].index)
    A = embs[year].loc[shared, :]
    B = embs[year+1].loc[shared, :]

    # R is the orthogonal matrix minimising ||B @ R - A|| (Frobenius norm).
    R, _ = orthogonal_procrustes(B, A)

    # Apply the rotation to ALL nodes of the later year, not only the shared ones.
    embs[year+1] = embs[year+1] @ R

  for year in range(2008, 2021):
    embs[year].to_csv('output/emb/{}_{}_aligned.csv'.format(year, vector_size))
