In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-1.4.0-py3-none-any.whl (913 kB)
[K     |████████████████████████████████| 913 kB 4.9 MB/s 
Collecting torchmetrics>=0.4.0
  Downloading torchmetrics-0.4.1-py3-none-any.whl (234 kB)
[K     |████████████████████████████████| 234 kB 68.3 MB/s 
Collecting tensorboard!=2.5.0,>=2.2.0
  Downloading tensorboard-2.4.1-py3-none-any.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 58.6 MB/s 
[?25hCollecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting PyYAML>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 54.6 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 62.3 MB/s 
[?25hCollecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 5

# Preparation

## Import Packages

In [None]:
import torch 
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch import bernoulli, tensor, randint, ones, cat, cuda
import pandas as pd
import numpy as np
import csv, json, os, random
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

## Initialize File Readers


In [None]:
def read_csv(file_dir):

  lst = []
  with open(file_dir, 'r') as f:
    reader = csv.reader(f)
    i=0
    for row in reader:
      fact = []
      for i in range(len(row)):
        if i == 3:
          fact.append(float(row[i]))
        else:
          fact.append(int(row[i]))
      lst.append(fact)
  return lst

def read_dict(file_dir):
  d = {}
  with open(file_dir, 'r') as f:
    reader = csv.reader(f)
    for k, v in reader:
      d[k] = v
  return d

# Modules

## TransD Model 


In [None]:

class TransD(pl.LightningModule):
  """
  Paramters: 
    self.dim_e: dimension of entity embedding 
    self.dim_r: dimension of relation embedding
    self.ent_total: number of entities in knowledge graph
    self.rel_total: number of relations in knowledge graph 
    max_norm: max of L2 norm for embedding
    gamma: margin hyperparameter for loss function 
    beta: hyperparameter for focusE scoring function, beta=0 indicates the use of 
      number edge weightings, beta=1 indicates conventional KGE architecture 
  
  Attributes: 
    ent_embeddings: vector embeddings for entities 
    rel_embeddings: vector embeddings for relations 
    ent_proj_embeddings: vector embeddings for entity projections 
    rel_proj_embeddings: vector embeddings for relation projections 


  """
  def __init__(self, dim_r, dim_e, ent_total, rel_total, max_norm, gamma, beta, 
               sampler, num_classes, lr, train_dl, val_dl):
    super(TransD, self).__init__()

    # Initialize model parameters
    self.dim_e = dim_e
    self.dim_r = dim_r
    self.ent_total = ent_total 
    self.rel_total = rel_total 
    self.max_norm = max_norm
    self.gamma = gamma
    self.beta = beta
    self.sampler = sampler
    self.num_classes = num_classes
    self.lr = lr
    self.train_dl = train_dl
    self.val_dl = val_dl


    # Create embeddings
    self.ent_embeddings = nn.Embedding(self.ent_total+1, self.dim_e, 
                                       max_norm=self.max_norm)
    self.rel_embeddings = nn.Embedding(self.rel_total+1, self.dim_r,  
                                       max_norm=self.max_norm)
    self.ent_proj_embeddings = nn.Embedding(self.ent_total +1, self.dim_e, 
                                       max_norm=self.max_norm)
    self.rel_proj_embeddings = nn.Embedding(self.rel_total+1, self.dim_r,  
                                       max_norm=self.max_norm)
    
    
  def forward(self, h, t, r, w): 
    """
    Parameters:
      h: torch.Tensor, shape: (N x 1)
        Head entities in current batch
      t: torch.Tensor, shape: (N x 1)
        Tail entities in current batch 
      r: torch.Tensor, shape: (N x 1)
        Relations in current batch 
      w: torch.Tensor, shape: (N x 1)
        Relation weights in current batch 
    Returns: 
      projected_h: torch.Tensor, shape: (N x dim_r x 1)
        Head projection vector
      projected_t: torch.Tensor, shape: (N x dim_r x 1)
        Tail projection vector
      r_e: torch.Tensor, shape: (N x dim_r)
        Relation vector embedding 
      w: torch.Tensor, shape: (N x 1)
        Relation weights in current batch
    """
   

    h_e = self.ent_embeddings(h) # N x dim_e
    t_e = self.ent_embeddings(t) # N x dim_e
    r_e = self.rel_embeddings(r) # N x dim_e
    h_proj_e = self.ent_proj_embeddings(h) # N x dim_e
    t_proj_e = self.ent_proj_embeddings(t) # N x dim_e
    r_proj_e = self.rel_proj_embeddings(r) # N x dim_r
    
    

    #print(r_proj_e.shape)
    M_rh = self.projection(r_proj_e, h_proj_e) # N x dim_r x dim_e
    M_rt = self.projection(r_proj_e, t_proj_e) # N x dim_r x dim_e

    # proj_h =
    projected_h = torch.matmul(M_rh, h_e.view(-1, self.dim_e, 1)) # N x dim_r x 1
    projected_t = torch.matmul(M_rt, t_e.view(-1, self.dim_e, 1)) # N x dim_r x 1

    return projected_h.view(-1, self.dim_r), projected_t.view(-1,self.dim_r), r_e, w
  
  def score(self, projected_h, projected_t, r_e, weight, pos):
    """
    Parameters: 
      projected_h: torch.Tensor, shape: (N x dim_r x 1)
        Head projection vector
      projected_t: torch.Tensor, shape: (N x dim_r x 1)
        Tail projection vector
      r_e: torch.Tensor, shape: (N x dim_r)
        Relation vector embedding 
      w: torch.Tensor, shape: (N x 1)
        Relation weights in current batch
    Returns: 
      score: float, shape: (N) 
        Scoring function 
    """
    
    transD_score = -1*(projected_h + r_e - projected_t).square().sum(dim=1)
    
    if pos:
      alpha = self.beta + (1-weight)*(1-self.beta) 
    else:
      alpha = self.beta + weight*(1-self.beta)

    m = nn.Softplus()
    g = m(transD_score)

    score = alpha*g
    
    return score
    

  def projection(self,r_proj_e, e_proj):
    """
    Parameters:
    r_proj_e: torch.Tensor, shape: (N x dim_r)
      Relation projection vector embedding 
    e_proj: torch.Tensor, shape (N x dim_e) 
      Entity projection vector embedding (N x dim_e)
    Returns:
      matrix_map: torch.Tensor, shape(N x dim_r x dim_e) 
        Mapping matrix that projects entity to relation space 
    """
    #print(r_proj_e.shape)
    N, dim_r = r_proj_e.shape
    dim_e = e_proj.shape[1] 
    
    matrix_map = r_proj_e.view(N,dim_r,1).matmul(e_proj.view(N,1,dim_e)) + torch.eye(dim_r, dim_e, device=e_proj.device)
    
    return matrix_map

  
  def loss(self, pos, neg):
    """
    Parameters: 
      neg: torch.Tensor, shape: (N)
        Score of negative triplets 
      pos: torch.Tensor, shape (N)
        Score of golden triplets
      gamma: float [0,1], dtype: float 
        Margin separating golden and negative triplets 
    Returns:
      loss: dtype: float
        Margin-based ranking loss 
    """
    loss = (pos.view(-1,1) + neg.view(1,-1)+ self.gamma).relu().sum()

    return loss/(pos.shape[0]*neg.shape[0])
  
  def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=(self.lr or self.lr)) 
    return optimizer

  def training_step(self, train_batch, batch_idx):
    h, t, r, w = train_batch[0], train_batch[1], train_batch[2], train_batch[3]
    

    neg_h, neg_t = self.sampler.corrupt_batch(h,t,r)
    
    # Push tensors to GPU
   
    # p_proj_h, p_proj_t, p_r_e, w = self.forward(h.cuda(),t.cuda(),r.cuda(), w.cuda())
    p_proj_h, p_proj_t, p_r_e, w = self.forward(h,t,r, w)
    n_proj_h, n_proj_t, n_r_e, w = self.forward(neg_h, neg_t, r, w)

    pos_score = self.score(p_proj_h, p_proj_t, p_r_e, w, True)
    neg_score = self.score(n_proj_h, n_proj_t, n_r_e, w, False)
    loss = self.loss(pos_score, neg_score)
    self.log('train_loss', loss)

    return loss

  def validation_step(self, val_batch, batch_idx):

    h, t, r, w = val_batch[0], val_batch[1], val_batch[2], val_batch[3]

    
    neg_h, neg_t = self.sampler.corrupt_batch(h,t,r)
    
    p_proj_h, p_proj_t, p_r_e, w = self.forward(h,t,r, w)
    n_proj_h, n_proj_t, n_r_e, w = self.forward(neg_h, neg_t, r, w)
    # p_proj_h, p_proj_t, p_r_e, w = self.forward(h.cuda(),t.cuda(),r.cuda(), w.cuda())
    # n_proj_h, n_proj_t, n_r_e, w = self.forward(neg_h.cuda(), neg_t.cuda(), r.cuda(), w.cuda())

    pos_score = self.score(p_proj_h, p_proj_t, p_r_e, w, True)
    neg_score = self.score(n_proj_h, n_proj_t, n_r_e, w, False)
    loss = self.loss(pos_score, neg_score)

    self.log('val_loss', loss)
  def train_dataloader(self):

    return self.train_dl

  def val_dataloader(self):

    return self.val_dl



## Bernoulli Sampler


In [None]:
class BernoulliNegativeSampler():
  def __init__(self, kg, n_neg, batch_size, rel_total, ent_total):
    super(BernoulliNegativeSampler, self).__init__() 

    self.kg = kg
    self.n_neg = n_neg 
    self.batch_size = batch_size
    self.rel_total = rel_total
    self.ent_total = ent_total
    #self.bern_probs = self.get_bern_probs(kg)

  def corrupt_batch(self, heads, tails, relations, n_neg=1):
    """
    Parameters:  
      heads: tensor containing the integer key of heads of the realtions in current
        batch (N)
      tails: tensor containing the integer key of tails of the relations in current
        batch (N)
      relations: tensor containing the integer key of relations in the current batch. (N)
      n_neg - number of negative samples to create from each fact 
    Returns: 
      neg_heads: tensor containing the integer key of negatively sampled heads of 
        the realtions in the current batch. (N)
      neg_tails: tensor containing the integer key of negatively sampled tails of 
        the realations in the current batch. (N) 
    """
    
    device = heads.device

    neg_heads = heads.repeat(n_neg) # N x n_neg
    neg_tails = tails.repeat(n_neg) # N x n_neg
    facts = torch.cat((heads.view(-1,1), tails.view(-1,1), relations.view(-1, 1)), dim=1)
    #print(facts.shape)
    bern_probs = self.evaluate_probabilites(facts)
    #print(bern_probs.keys())

    mask = bernoulli(bern_probs[relations].repeat(n_neg)).double()
    n_h_cor = int(mask.sum().item())
    # if mask == 1, corupt heads
    neg_heads[mask==1] = randint(1, self.ent_total, (n_h_cor,), device=device)
    # if mask == 0, corupt tails
    neg_tails[mask==0] = randint(1, self.ent_total, (heads.shape[0] * n_neg - n_h_cor,), device=device)

    return neg_heads.long(), neg_tails.long() 


  def evaluate_probabilites(self, facts):
    """
    Parameters:
      kg: torch.Tensor, shape (N x 3)
        All knowledge graph triples 
    Returns: 
      tensor with relation probabilities 
    """
    bern_probs = self.get_bern_probs(facts)

    temp = [] 
    for i in range(self.rel_total):
      if i in bern_probs.keys():
        temp.append(bern_probs[i])
      else:
        temp.append(0.5)

    return tensor(temp).float()

  def get_hpt(self, t):
    """
    Parameters: 
      t: torch.Tensor, shape (N x3) 
        Tensor of encoded triples in current batch (N x 3), first column contains 
        head indicies, second tails and third relations
    Returns: 
      d: dictionary of relation indices to average number of heads per tail
    """
    t = t.cpu()
    df_hpt = pd.DataFrame(t.numpy(), columns=['from', 'to', 'rel'])
    df_hpt = df_hpt.groupby(['rel', 'to']).count().groupby('rel').mean()
    df_hpt.reset_index(inplace=True)
    
    return {df_hpt.loc[i].values[0]: df_hpt.loc[i].values[1] for i in df_hpt.index}
  
  def get_tph(self, t):
    """
    Parameters: 
      t: torch.Tensor, shape (N x 3) 
        Tensor of encoded triples in current batch (N x 3), first 
        column contains head indicies, second tails and third relations
    Returns: 
      d: dictionary of relation indices to average number of tails per head
    """
    t = t.cpu()
    df_tph = pd.DataFrame(t.numpy(), columns=['from', 'to', 'rel'])
    df_tph = df_tph.groupby(['from', 'rel']).count().groupby('rel').mean()
    df_tph.reset_index(inplace=True)

    return {df_tph.loc[i].values[0]: df_tph.loc[i].values[1] for i in df_tph.index}
   
  def get_bern_probs(self, t):
    """
    Parameters: 
      t: torch.Tensor, shape: (Nx3)
        Tensor of encoded triples in current batch (Nx3), first 
        column contains head indicies, second tails and third relations
    Returns: 
      tph: dictionary that maps relations to sampling probabilities 
    """

    hpt = self.get_hpt(t)
    tph = self.get_tph(t) 

    assert (tph.keys() == hpt.keys())

    
    for k in tph.keys():
      tph[k] = tph[k] / (tph[k] + hpt[k])

    return tph



## Knowledge Graph Class

In [None]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
	longTensor = torch.cuda.LongTensor
	floatTensor = torch.cuda.FloatTensor


else:
	longTensor = torch.LongTensor
	floatTensor = torch.FloatTensor

class KnowledgeGraph(Dataset):
  """
  Parameters: 
    facts: 2D array, dtype: int, shape: (n_facts x 3)
      All fact triplets in knowledge graph, where facts[0] is the head, facts[1] 
      is the tail and facts[2] is the relation
    ent2id: dict, dtype: {str: int}, shape: (ent_total)
      Dictionary mapping entity label to integer index 
    rel2id: dict, dtype {str:int}, shape: (rel_total)
      Dictionary mapping relation label to integer index 
    id2ent: dict, dtype: {int: str}, shape: (ent_total)
      Dictionary integer index to entity label
    id2rel: dict, dtype: {int: str}, shape: (ent_total)
      Dictionary integer index to relation label 
    Attributes: 
      n_facts: int 
        Number of facts in data set 
      ent_total: int 
        Number of unique entities in data set 
      rel_total: int 
        Number of unique relations in data set 
      head_index: torch.Tensor, dtype: torch.long, shape (n_facts)
        List of int index of heads for facts, where the ith element represents the
        head of ith triplet
      tail_index: torch.Tensor, dtype: torch.long, shape (n_facts)
        List of int index of tail for facts, where the ith element represents the
        tail of ith triplet
      tail_index: torch.Tensor, dtype: torch.long, shape (n_facts)
        List of int index of relations for facts, where the ith element represents the
        tail of ith triplet 
      weight_index: torch.Tensor, dtype: torch.float, shape (n_facts)
        List of weights for facts, where the ith element respresents the numeric
        edge weight of the ith triplet
  """
  def __init__(self, facts, id2ent, ent2id, id2rel, rel2id):
    super(KnowledgeGraph, self).__init__()
    # Initialize
    self.kg = facts
    self.ent2id = ent2id
    self.id2ent = id2ent
    self.rel2id = rel2id
    self.id2rel = id2rel 

    # Set Attributes
    self.n_facts = len(facts)
    self.ent_total = len(id2ent.keys())
    self.rel_total = len(id2rel.keys())
    
    head_index, tail_index, rel_index, weight_index = [], [], [], []
    for fact in facts[1:]:
      if fact[0] != 's':
        head_index.append(int(fact[0]))
        tail_index.append(int(fact[1]))
        rel_index.append(int(fact[2]))
        weight_index.append(float(fact[3]))
    
    self.head_index = tensor(head_index).long()#.cuda()
    self.tail_index = tensor(tail_index).long()#.cuda()
    self.rel_index = tensor(rel_index).long()#.cuda()
    self.weight_index = tensor(weight_index).float()#.cuda()



  def __len__(self):
    return self.n_facts -1

  def __getitem__(self, item):
    """
    Parameters: 
      item: 
    """
    return (self.head_index[item].item(), 
            self.tail_index[item].item(), 
            self.rel_index[item].item(), 
            self.weight_index[item].item())
    
  def get_ent_id(self, ent):
    """
    Parameters: 
      ent: torch.Tensor, dtype: str, shape: (Nx1)
        Enity string in knowledge graph
    Returns: 
      id: torch.Tensor, dtype: int, shape: (Nx1)
        Integer id of entity, returns None if entity does not exist
    """
    if ent in self.ent2id:
      id = self.ent2id[ent]
    else:
      print("KeyError: Entity does not exist")
      id = None
    return id
    
  def get_rel_id(self, rel):
    """
    Parameters: 
      rel: torch.Tensor, dtype: str, shape: (Nx1)
        Relation string in knowledge graph
    Returns: 
      id: torch.Tensor, dtype: int, shape: (Nx1)
        Integer id of relation, returns None if relation does not exist
    """
    if rel in self.rel2id:
      id = self.rel2id[ent]
    else:
      print("KeyError: Relation does not exist")
      id = None
    return id

  def get_id_rel(self, id):
    """
    Parameters: 
      id: torch.Tensor, dtype: str, shape: (Nx1)
        Integer id of relation
    Returns: 
      rel: torch.Tensor, dtype: int, shape: (Nx1)
        Relation string in knowledge graph, returns None if id does not exist
    """

    if id in self.id2rel:
      rel = self.id2rel[id]
    else:
      print("KeyError: Relation ID does not exist.")
    return rel

  def get_id_ent(self, id):
    """
    Parameters: 
      id: torch.Tensor, dtype: str, shape: (Nx1)
        Integer id of relation
    Returns: 
      ent: torch.Tensor, dtype: int, shape: (Nx1)
        Entity string in knowledge graph, returns None if id does not exist
    """
    if id in self.id2ent:
      ent = self.id2ent[id]
    else:
      print("KeyError: Entity ID does not exist.")
      ent = None
    return ent
    
  def heads(self):
    """
    Returns: 
      heads: torch.Tensor, dtype: torch.int, shape (N x 1)
        Integer id's of heads in current batch 
    """
    kg = self.kg 
    heads = []
    for trip in kg:
      heads.append(trip[0])
    return tensor(heads).long()

  def tails(self):
    """
    Returns: 
      tails: torch.Tensor, dtype: torch.int, shape (N x 1)
        Integer id's of tails in current batch 
    """
    kg = self.kg 
    tails = []
    for trip in kg:
      tails.append(trip[3])
    return tensor(tails).long()



## Data Module

In [None]:
class TransDDataModule(pl.LightningDataModule):
  def __init__(self, batch_size, kg_dir, id2ent_dir, id2rel_dir, ent2id_dir, rel2id_dir):
    super().__init__()
    
  #def setup(self, stage=None):
    
    self.batch_size = batch_size
    self.kg_dir = kg_dir
    self.id2rel_dir = id2rel_dir
    self.ent2id_dir = ent2id_dir
    self.id2ent_dir = id2ent_dir
    self.rel2id_dir = rel2id_dir

    triplets = read_csv(self.kg_dir)[1:]
    id2ent = read_dict(self.id2ent_dir)
    ent2id = read_dict(self.ent2id_dir)
    id2rel = read_dict(self.id2rel_dir)
    rel2id = read_dict(self.rel2id_dir) 

    self.facts = triplets
    self.kg = KnowledgeGraph(self.facts, id2ent, ent2id, id2rel, rel2id)
    
    train_size = round(len(self.facts)*0.9) - round(len(self.facts)*0.9)%1024
    val_size = len(self.facts) - train_size -1

    self.kg_train, self.kg_val = random_split(self.kg, [train_size, val_size]) 
  #def setup(self, stage=None):

  def train_dataloader(self):
    return DataLoader(self.kg_train, batch_size=self.batch_size, shuffle=True, num_workers=4)

  def val_dataloader(self):
    return DataLoader(self.kg_val, batch_size=self.batch_size, num_workers=4)



# Learn

In [None]:
batch_size = 1024
kg_dir = '/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/all_relations-peaks.csv'
id2ent_dir = '/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/id2entity-peaks.csv'
ent2id_dir = '/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/entity2id-peaks.csv'
id2rel_dir = '/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/id2relation-peaks.csv'
rel2id_dir = '/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/relation2id-peaks.csv'

triplets = read_csv(kg_dir)[1:]
id2ent = read_dict(id2ent_dir)
ent2id = read_dict(ent2id_dir)
id2rel = read_dict(id2rel_dir)
rel2id = read_dict(rel2id_dir)


kg = KnowledgeGraph(triplets, id2ent, ent2id, id2rel, rel2id)

train_size = round(len(triplets)*0.9) - round(len(triplets)*0.9)%1024
val_size = len(triplets) - train_size -1

kg_train, kg_val = random_split(kg, [train_size, val_size]) 

In [None]:

train_batch = DataLoader(kg_train, batch_size=batch_size, shuffle=True, num_workers=4)
val_batch = DataLoader(kg_val, batch_size=batch_size, num_workers=4)

sampler = BernoulliNegativeSampler(kg=kg, 
                                   n_neg=1, 
                                   batch_size=batch_size, 
                                   rel_total=kg.rel_total, 
                                   ent_total=kg.ent_total)

model = TransD(dim_r=100, 
               dim_e=100, 
               ent_total=kg.ent_total, 
               rel_total=kg.rel_total, 
               max_norm=1, 
               gamma=0.5, 
               beta=1.0, 
               sampler=sampler,
               num_classes=2,
               lr=5e-4, 
               train_dl = train_batch,
               val_dl = val_batch)
# checkpoint_callback = pl.callbacks.ModelCheckpoint(dirpath='/content/drive/MyDrive/UCLA REU 2021 KG /KG Embedding')

# trainer = pl.Trainer(gpus=1,
#                      default_root_dir='/content/drive/MyDrive/UCLA REU 2021 KG /KG Embedding',
#                      max_epochs=35, 
#                      #auto_lr_find=True,
#                      callbacks = [checkpoint_callback])
# trainer.fit(model, train_batch, val_batch)
# resume_from_checkpoint = ‘’

# Link Prediction

## Load Trained Model

In [None]:
checkpoint_path = '/content/drive/MyDrive/UCLA REU 2021 KG /KG Embedding/epoch=9-step=88789-v1.ckpt'
hparams_path = '/content/drive/MyDrive/UCLA REU 2021 KG /KG Embedding/lightning_logs/version_80/hparams.yaml'
trained_model = model.load_from_checkpoint(checkpoint_path, 
                          dim_r=100, 
                          dim_e=100, 
                          ent_total=kg.ent_total, 
                          rel_total=kg.rel_total, 
                          max_norm=1, 
                          gamma=0.5, 
                          beta=1.0, 
                          sampler=sampler,
                          num_classes=2,
                          lr=1e-4, 
                          train_dl = train_batch,
                          val_dl = val_batch)


In [None]:
with open('/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/dates.csv', 'r') as f:
  dates=[]
  reader = csv.reader(f)
  for row in reader:
    dates.append(''.join(row))
print(len(dates))
print(dates[0])

465
13-Mar-20


In [None]:
with open('/content/drive/MyDrive/UCLA REU 2021 KG /results/2020Mar11-2021June16/tripples/keywords.csv', 'r') as f:
  keywords=[]
  reader = csv.reader(f)
  for row in reader:
    keywords.append(''.join(row))
print(len(keywords))
print(keywords[0])

916
world


## Predict Links

In [None]:
## Sentiment Changepoint Links 
import numpy as np
heads = []
tails = []
for t in range(100):
    for date in dates:
      heads.append(int(ent2id[str(t)])) 
      tails.append(int(ent2id[str(date)]))

print(heads[0:5])
print(tails[0:5])
print(len(heads))
print(len(tails))
relation = 'has_changepoint'
h = tensor(heads).long()
t = tensor(tails).long()
r = tensor([int(rel2id[relation])]*len(heads)).long()
w = tensor([1.0]*len(heads)).float()

projected_h, projected_t, r_e, w = trained_model.forward(h,t,r,w)
score = trained_model.score(projected_h, projected_t, r_e, w, True)
scores_sent = score.tolist()

# max_score = max(score_list)
# min_score = min(score_list)
# scores_sent = [(x-min_score)/(max_score-min_score) for x in score_list]
print('Max score: ' + str(max(scores_sent)))
print('Min score: ' + str(min(scores_sent)))

[501246, 501246, 501246, 501246, 501246]
[401584, 298915, 708373, 378052, 23014]
46500
46500
Max score: 0.007487951312214136
Min score: 2.652928969837376e-07


In [None]:
## Peak Changepoint links 

heads = []
tails = []
for t in range(100):
    for date in dates:
      heads.append(int(ent2id[str(t)])) 
      tails.append(int(ent2id[str(date)]))

print(heads[0:5])
print(tails[0:5])
print(len(heads))
print(len(tails))
relation = 'has_peak'
h = tensor(heads).long()
t = tensor(tails).long()
r = tensor([int(rel2id[relation])]*len(heads)).long()
w = tensor([1.0]*len(heads)).float()

projected_h, projected_t, r_e, w = trained_model.forward(h,t,r,w)
score = trained_model.score(projected_h, projected_t, r_e, w, True)
scores_vol = score.tolist()

# max_score = max(score_list)
# min_score = min(score_list)
# scores_vol = [(x-min_score)/(max_score-min_score) for x in score_list]
print('Max score: ' + str(max(scores_vol)))
print('Min score: ' + str(min(scores_vol)))

[501246, 501246, 501246, 501246, 501246]
[401584, 298915, 708373, 378052, 23014]
46500
46500
Max score: 0.007537588477134705
Min score: 5.008547532270313e-07


In [None]:
#Keyword to Topic Links
heads = []
tails = []
for key in keywords:
  for t in range(100):
    tails.append(int(ent2id[str(t)])) 
    heads.append(int(ent2id[str(key)]))
relation = 'associated_with'
h = tensor(heads).long()
t = tensor(tails).long()
r = tensor([int(rel2id[relation])]*len(heads)).long()
w = tensor([1.0]*len(heads)).float()
print(heads[0:5])
print(tails[0:5])
print(len(heads))
print(len(tails))

projected_h, projected_t, r_e, w = trained_model.forward(h,t,r,w)
score = trained_model.score(projected_h, projected_t, r_e, w, True)
scores = score.tolist()

# heads = []
# tails = []
# for key in keywords:
#   for t in range(50,100):
#     tails.append(int(ent2id[str(t)])) 
#     heads.append(int(ent2id[str(key)]))
# relation = 'associated_with'
# h = tensor(heads).long()
# t = tensor(tails).long()
# r = tensor([int(rel2id[relation])]*len(heads)).long()
# w = tensor([1.0]*len(heads)).float()
# print(heads[0:5])
# print(tails[0:5])
# print(len(heads))
# print(len(tails))

# projected_h, projected_t, r_e, w = trained_model.forward(h,t,r,w)
# score = trained_model.score(projected_h, projected_t, r_e, w, True)
# scores_vol = scores_vol + score.tolist()

# max_score = max(score_list)
# min_score = min(score_list)
# scores_vol = [(x-min_score)/(max_score-min_score) for x in score_list]
# print('Max score: ' + str(max(scores_vol)))
# print('Min score: ' + str(min(scores_vol)))

[186341, 186341, 186341, 186341, 186341]
[553141, 660226, 354376, 519398, 198030]
91600
91600


## Create Dataframe

In [None]:
score_dict = {}
#score_dict_vol = {}
i = 0
for t in range(100):
    for date in keywords:
      tup = (t, str(date))
      score_dict[tup] = scores[i]
      #score_dict_vol[tup] = scores_vol[i]
      i+= 1

In [None]:
print(score_dict_vol[(0,'13-Mar-20')])

2.0470386061788304e-06


In [None]:
int_topics = [1, 2, 6, 8, 9, 10, 12, 14, 17, 20, 34, 39, 43, 44, 49, 59, 60, 62,
              67, 75, 76, 80, 86, 88, 91]

all_sent= []
all_vol = []
sel_sent = []
sel_vol = []
selected_topics = []
col_labels_all = ['Topic ' + str(x) for x in range(100)]
#col_labels_sel = ['Topic ' + str(x) for x in int_topics]
col_labels_all.insert(0,'Keyword')
#col_labels_sel.insert(0,'Date')
row_labels = [k for k in keywords]
# all_sent.append(col_labels_all)
# all_vol.append(col_labels_all)
# sel_vol.append(col_labels_sel)
# sel_sent.append(col_labels_sel)


for i in range(len((keywords))):
  date = keywords[i]
  date_allS = []
  date_selS = [] 
  #date_allV = []
  
  #date_selV = []
  # date_allS.append(date)
  # date_selS.append(date)
  # date_allV.append(date)
  # date_selV.append(date)

  for j in range(100):
    if j in int_topics:
      date_selS.append(score_dict[(j, date)])
      #date_selS.append(score_dict_sent[(j,date)])
    #date_allS.append(score_dict_sent[(j, date)])
    date_allS.append(score_dict[(j, date)])

  sel_vol.append(date_selS)
  #sel_sent.append(date_selS)
  all_sent.append(date_allS)
  #all_vol.append(date_allV)


In [None]:
def normalize(x, axis, softmax):
  if softmax:
    norm = np.exp(x) / np.sum(np.exp(x), axis=axis) 
  else:
    norm = (x - np.amin(x, axis=axis))/(np.amax(x,axis=axis) - np.amin(x,axis=axis))

  return norm

In [None]:

np_sel_vol = np.array(sel_vol)
np_sel_vol = normalize(sel_vol, 0, softmax=False)

# np_sel_sent = np.array(sel_sent)
# np_sel_sent = normalize(sel_sent, None, softmax=False)

np_all_sent = np.array(all_sent)
np_all_sent = normalize(all_sent, 0, softmax=False)

# np_all_vol = np.array(all_vol)
# np_all_vol = normalize(all_vol, None, softmax=False)



## Export

In [None]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-1.4.5-py2.py3-none-any.whl (149 kB)
[?25l[K     |██▏                             | 10 kB 40.6 MB/s eta 0:00:01[K     |████▍                           | 20 kB 47.8 MB/s eta 0:00:01[K     |██████▋                         | 30 kB 50.4 MB/s eta 0:00:01[K     |████████▊                       | 40 kB 26.9 MB/s eta 0:00:01[K     |███████████                     | 51 kB 17.0 MB/s eta 0:00:01[K     |█████████████▏                  | 61 kB 14.5 MB/s eta 0:00:01[K     |███████████████▎                | 71 kB 13.7 MB/s eta 0:00:01[K     |█████████████████▌              | 81 kB 15.1 MB/s eta 0:00:01[K     |███████████████████▊            | 92 kB 14.0 MB/s eta 0:00:01[K     |█████████████████████▉          | 102 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████        | 112 kB 13.0 MB/s eta 0:00:01[K     |██████████████████████████▎     | 122 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████████▌   | 133 kB 13.0 

In [None]:
import pandas as pd 
import xlsxwriter
path = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-keywords-linear-a0.xlsx'


col_labels_all = ['Topic ' + str(x) for x in range(100)]
col_labels_sel = ['Topic ' + str(x) for x in int_topics]
#col_labels_all.insert(0,'Date')
#col_labels_sel.insert(0,'Date')
row_labels = [k for k in keywords]

df_sel_vol = pd.DataFrame(np_sel_vol)
df_sel_vol = df_sel_vol.rename(index=(lambda x : row_labels[x]))
#df_sel_vol.index = pd.to_datetime(df_sel_vol.index, format = '%d-%b-%y')
#df_sel_vol = df_sel_vol.reset_index()
#df_sel_vol = df_sel_vol.dropna().set_index('index')
#df_sel_vol = df_sel_vol.sort_index(axis=0)


# df_all_vol = pd.DataFrame(np_all_vol)
# df_all_vol = df_all_vol.rename(index=(lambda x : row_labels[x]))
# df_all_vol.index = pd.to_datetime(df_all_vol.index, format = '%d-%b-%y')
# df_all_vol = df_all_vol.reset_index()
# df_all_vol = df_all_vol.dropna().set_index('index')
# df_all_vol = df_all_vol.sort_index(axis=0)

# df_sel_sent = pd.DataFrame(np_sel_sent)
# df_sel_sent = df_sel_sent.rename(index=(lambda x : row_labels[x]))
# df_sel_sent.index = pd.to_datetime(df_sel_sent.index, format = '%d-%b-%y')
# df_sel_sent= df_sel_sent.reset_index()
# df_sel_sent = df_sel_sent.dropna().set_index('index')
# df_sel_sent = df_sel_sent.sort_index(axis=0)

df_all_sent = pd.DataFrame(np_all_sent) 
df_all_sent = df_all_sent.rename(index=(lambda x : row_labels[x]))
# df_all_sent.index = pd.to_datetime(df_all_sent.index, format = '%d-%b-%y')
# df_all_sent = df_all_sent.reset_index()
# df_all_sent = df_all_sent.dropna().set_index('index')
# df_all_sent = df_all_sent.sort_index(axis=0)

writer =pd.ExcelWriter(path, engine='xlsxwriter')
df_all_sent.to_excel(writer, sheet_name='All Topics', header=col_labels_all)
#df_all_vol.to_excel(writer, sheet_name='All Topics (Volume)', header=col_labels_all)
#df_sel_sent.to_excel(writer, sheet_name='Selected Topics (Sentiment)', header=col_labels_sel)
df_sel_vol.to_excel(writer, sheet_name='Selected Topics', header=col_labels_sel)
writer.save()


In [None]:
with open('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction_allS-v5.csv', 'w') as f:
  writer=csv.writer(f)
  for row in all_sent:
    writer.writerow(row)
with open('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction_selS-v5.csv', 'w') as f:
  writer=csv.writer(f)
  for row in sel_sent:
    writer.writerow(row)
with open('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction_allV-v5.csv', 'w') as f:
  writer=csv.writer(f)
  for row in all_vol:
    writer.writerow(row)
with open('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction_selV-v5.csv', 'w') as f:
  writer=csv.writer(f)
  for row in sel_vol:
    writer.writerow(row)


In [None]:
print(score_dict[(0, '22-Aug-20')])
print(score_dict[(0, '01-Jun-20')])

In [None]:
import numpy as np
print(np.percentile(list(score_dict.values()),10))
perc = np.percentile(list(score_dict.values()),10) 

In [None]:
d = {k:v for k,v in score_dict.items() if v< perc}
print(len(d))
print(d.


In [None]:
from google.colab import drive 
drive.mount('/content/drive')
path = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-linear-aNone.xlsx'

import pandas as pd
scores_linear = pd.read_excel(path).set_index('index')

In [None]:

four_normalization = dict()

In [None]:
#softmax None
path2 = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-softmax-aNone.xlsx'

scores_softmax = pd.read_excel(path2).set_index('index')
#print(scores_softmax[:10])
four_normalization['softmax_None'] = dict()
dates_list = list(scores_softmax.index)
for col in scores_softmax:
  four_normalization['softmax_None'][col] = dict()
  print(col)
  scores_dict = dict(enumerate(scores_softmax[col]))
  ordered_scores = sorted(scores_dict, key=lambda x:scores_dict[x])
  ten_min_scores_softmax_index = ordered_scores[:20]
  ten_min_scores_softmax_dates = [str(dates_list[x])[:10] for x in ordered_scores[:20]]
  four_normalization['softmax_None'][col]['date'] = ten_min_scores_softmax_dates
  four_normalization['softmax_None'][col]['index'] = ten_min_scores_softmax_index

  print(ten_min_scores_softmax_dates)
#2021-02-06, 2020-08-05
#topic 17 Apr 24 2021 vs apr 23 2021

In [None]:
four_normalization['softmax_None']['Topic 0']

In [None]:
#softmax by topics
path2 = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-softmax-a0.xlsx'
four_normalization['softmax_0'] = dict()

scores_softmax = pd.read_excel(path2).set_index('index')
dates_list = list(scores_softmax.index)
for col in scores_softmax:
  four_normalization['softmax_0'][col] = dict()
  print(col)
  scores_dict = dict(enumerate(scores_softmax[col]))
  ordered_scores = sorted(scores_dict, key=lambda x:scores_dict[x])
  ten_min_scores_softmax_index0 = ordered_scores[:20]
  ten_min_scores_softmax_dates0 = [str(dates_list[x])[:10] for x in ordered_scores[:20]]
  four_normalization['softmax_0'][col]['date'] = ten_min_scores_softmax_dates0
  four_normalization['softmax_0'][col]['index'] = ten_min_scores_softmax_index0
  print(ten_min_scores_softmax_dates0)
  
#2021-02-06, 2020-08-05

In [None]:
#linear
path = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-linear-aNone.xlsx'
four_normalization['linear_None'] = dict()

scores_linear = pd.read_excel(path).set_index('index')
dates_list = list(scores_linear.index)
for col in scores_linear:
  print(col)
  four_normalization['linear_None'][col] =dict()
  scores_dict = dict(enumerate(scores_linear[col]))
  ordered_scores = sorted(scores_dict, key=lambda x:scores_dict[x])
  ten_min_scores_linear_index = ordered_scores[:20]
  ten_min_scores_linear_dates = [str(dates_list[x])[:10] for x in ordered_scores[:20]]
  four_normalization['linear_None'][col]['date'] = ten_min_scores_linear_dates
  four_normalization['linear_None'][col]['index'] = ten_min_scores_linear_index

  print(ten_min_scores_linear_dates)

In [None]:
#linear by topics
path = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-linear-a0.xlsx'
four_normalization['linear_0'] = dict()
scores_linear = pd.read_excel(path).set_index('index')
dates_list = list(scores_linear.index)
for col in scores_linear:
  print(col)
  four_normalization['linear_0'][col] = dict()
  scores_dict = dict(enumerate(scores_linear[col]))
  ordered_scores = sorted(scores_dict, key=lambda x:scores_dict[x])
  ten_min_scores_linear_index = ordered_scores[:20]
  ten_min_scores_linear_dates = [str(dates_list[x])[:10] for x in ordered_scores[:20]]
  four_normalization['linear_0'][col]['date'] = ten_min_scores_linear_dates
  four_normalization['linear_0'][col]['index'] = ten_min_scores_linear_index
  print(ten_min_scores_linear_dates)

In [None]:
four_methods = ['softmax_None','softmax_0', 'linear_None', 'linear_0']
for i in range(100):
  topic = 'Topic '+str(i)
  print(topic)
  for method in four_methods:
    print(method)
    print(four_normalization[method][topic]['index'])

In [None]:
#softmax None
def min_whole_dataset(path: str):

  scores_softmax = pd.read_excel(path).set_index('index')

  dates_list = list(scores_softmax.index)
  score_dict = dict()
  for col in scores_softmax:
    for row in range(len(scores_softmax[col])):
      score_dict[(col, row)] = scores_softmax[col][row]
  sorted_score = sorted(score_dict, key=lambda x: score_dict[x])[:20]
  print(sorted_score)
#print('softmax_None')
min_whole_dataset('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-softmax-aNone.xlsx')

#min_whole_dataset('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-softmax-a0.xlsx')
min_whole_dataset('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-linear-aNone.xlsx')
#min_whole_dataset('/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-linear-a0.xlsx')

for index in sorted_score:
  print(dates_list[index[1]], index[0])
#2020-12-19 00:00:00 Topic 6 vs 2020-12-20
#2020-08-05 00:00:00 Topic 69 vs 08-12

#peaks analysis

In [None]:
#softmax None
four_normalization_peak = dict()
path2 = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction-softmax-aNone.xlsx'

scores_softmax = pd.read_excel(path2, sheet_name='All Topics (Volume)').set_index('index')
#print(scores_softmax[:10])
four_normalization_peak['softmax_None'] = dict()
dates_list = list(scores_softmax.index)
for col in scores_softmax:
  four_normalization_peak['softmax_None'][col] = dict()
  print(col)
  scores_dict = dict(enumerate(scores_softmax[col]))
  ordered_scores = sorted(scores_dict, key=lambda x:scores_dict[x])
  ten_min_scores_softmax_index = ordered_scores[:20]
  ten_min_scores_softmax_dates = [str(dates_list[x])[:10] for x in ordered_scores[:20]]
  four_normalization_peak['softmax_None'][col]['date'] = ten_min_scores_softmax_dates
  four_normalization_peak['softmax_None'][col]['index'] = ten_min_scores_softmax_index
  print(ten_min_scores_softmax_dates)
  print([scores_dict[x] for x in ten_min_scores_softmax_index])
  print(min(list(scores_dict.values())))
#topic 2 2020-09-06

Topic 0
['2021-06-11', '2020-09-06', '2021-02-06', '2021-06-06', '2020-08-05', '2021-05-01', '2021-03-24', '2021-04-28', '2021-05-20', '2021-06-05', '2021-04-15', '2021-03-17', '2020-09-05', '2020-05-17', '2021-01-25', '2020-04-11', '2020-10-21', '2021-03-21', '2020-03-28', '2020-12-25']
[2.147475351159724e-05, 2.147475633306967e-05, 2.147475637365424e-05, 2.147475661925269e-05, 2.147475693958231e-05, 2.1474758241787e-05, 2.147475841788622e-05, 2.147475872626766e-05, 2.147475929461522e-05, 2.147475952780287e-05, 2.147475964097629e-05, 2.147475999741056e-05, 2.14747600899615e-05, 2.147476013539591e-05, 2.147476032056371e-05, 2.147476035065149e-05, 2.147476039469186e-05, 2.147476055154678e-05, 2.147476065732765e-05, 2.147476086502956e-05]
2.147475351159724e-05
Topic 1
['2021-06-11', '2020-08-05', '2021-02-06', '2021-06-06', '2020-09-06', '2020-04-11', '2020-10-21', '2021-01-25', '2021-03-24', '2021-04-28', '2020-07-04', '2021-05-01', '2020-05-17', '2021-03-05', '2020-04-12', '2020-06-17'

In [None]:
#softmax None
four_normalization_peak = dict()
path2 = '/content/drive/MyDrive/UCLA REU 2021 KG /results/link_prediction/link_prediction_keywords-softmax-aNone.xlsx'

scores_softmax = pd.read_excel(path2).set_index('index')
#print(scores_softmax[:10])
four_normalization_peak['softmax_None'] = dict()
dates_list = list(scores_softmax.index)
for col in scores_softmax:
  four_normalization_peak['softmax_None'][col] = dict()
  print(col)
  scores_dict = dict(enumerate(scores_softmax[col]))
  ordered_scores = sorted(scores_dict, key=lambda x:scores_dict[x])
  ten_min_scores_softmax_index = ordered_scores[:20]
  ten_min_scores_softmax_dates = [str(dates_list[x])[:10] for x in ordered_scores[:20]]
  four_normalization_peak['softmax_None'][col]['date'] = ten_min_scores_softmax_dates
  four_normalization_peak['softmax_None'][col]['index'] = ten_min_scores_softmax_index
  print(ten_min_scores_softmax_dates)
  print([scores_dict[x] for x in ten_min_scores_softmax_index])
  print(min(list(scores_dict.values())))
#topic 2 2020-09-06

NameError: ignored