In [9]:
!pip install -q transformers
!pip install -q networkx matplotlib
!pip install -q pyvis
!pip install -q neo4j

## Downloading the COYO-reduced Kaggle dataset

In [12]:
!pip install -q kaggle

In [13]:
!kaggle datasets download -d anantjain1223/coyo-1k-reduced

Dataset URL: https://www.kaggle.com/datasets/anantjain1223/coyo-1k-reduced
License(s): MIT
coyo-1k-reduced.zip: Skipping, found more recently modified local copy (use --force to force download)


In [14]:
!unzip /content/coyo-1k-reduced.zip

Archive:  /content/coyo-1k-reduced.zip
replace coyo-1k.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: coyo-1k.csv             


In [15]:
import pandas as pd

df=pd.read_csv("/content/coyo-1k.csv")

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,url,key,image_phash,original_caption,tag,attribute,short_caption,detailed_caption,llm_caption
0,0,https://www.equipmentsalesandsurplus.com/v/vsp...,9113,e5989a6799687992,"Powermatic 60HH, 8"" Jointer with Helical Cutte...",['Jointer' 'Wood shaper' 'Thickness planer'],['repair which is a workbench or other surface...,['a table sawing machine on a white background'],['a planer that is sitting on a table'\r\n 'a ...,['A table sawing machine on an all-white backg...
1,1,https://texasfurniturehut.com/images/thumbs/00...,7371,be179168cce0cc97,Picture of ELIJAH LEATHER POWER RECLINER,['recliner' 'Power seat' 'Club chair'],['recliner which has upholstered in fabric or ...,['a brown leather recliner chair on a white ba...,['a brown recliner sitting in front of a white...,['A Brown Leather Recliner Chair On A White Ba...
2,2,https://cdn.shopify.com/s/files/1/1909/0703/pr...,1662,c04b03b43f1f3a67,"8 x 10 Reprinted Old Photo of U.S. Mail, Tanan...",['Dog sled' 'Sled dog' 'Sled dog racing'],['dog sled which has snow or ice'\r\n 'dog sle...,['a black and white photo of a dog team pullin...,['a man riding a dog sled past a post office'\...,['A black and white photo of a dog team pullin...
3,3,https://kelliesdesigns.net/wp-content/uploads/...,3390,f9e3c64e0cd8b036,Police Officer In His Hands Snap Tab,['Police officer' 'Police commissioner' 'Handc...,['policeman which has badge' 'cop which has ba...,['a pair of key chains with a police officer o...,"[""two key keychains with police officer's unif...","[""A pair of handmade key chains with a police ..."
4,4,https://cdn.houseplansservices.com/product/9ea...,3666,84cfbfb0c8cab20e,House Design - Colonial Exterior - Front Eleva...,['farmhouse' 'manufactured home'\r\n 'North am...,"['house which has exterior walls with siding, ...",['this is a computer rendering of a house'],['a two story brick house with a garage and po...,['This is a computer rendered image of a house...


In [17]:
df=df[['url','short_caption']]
df.head()

Unnamed: 0,url,short_caption
0,https://www.equipmentsalesandsurplus.com/v/vsp...,['a table sawing machine on a white background']
1,https://texasfurniturehut.com/images/thumbs/00...,['a brown leather recliner chair on a white ba...
2,https://cdn.shopify.com/s/files/1/1909/0703/pr...,['a black and white photo of a dog team pullin...
3,https://kelliesdesigns.net/wp-content/uploads/...,['a pair of key chains with a police officer o...
4,https://cdn.houseplansservices.com/product/9ea...,['this is a computer rendering of a house']


## Now that data loaded, Knowledge Graphs

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
from tqdm import tqdm
import torch
from pyvis.network import Network

### Loading REBEL model for relational extraction

In [19]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 3,
    "num_return_sequences": 3,
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

## Relation extraction Triplets using REBEL-Large

In [20]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

## Extraction of Triplets

In [21]:
# Text to extract triplets from
triplet_list=[]
url_list=[]

for i in tqdm(range(len(df))):
  text = df['short_caption'][i]
  url = df['url'][i]

  # Tokenizer text
  model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

  # Generate
  generated_tokens = model.generate(
      model_inputs["input_ids"].to(model.device),
      attention_mask=model_inputs["attention_mask"].to(model.device),
      **gen_kwargs,
  )

  # Extract text
  decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

  # Extract triplets
  for _, sentence in enumerate(decoded_preds):
      triplet_list.append(extract_triplets(sentence))
      url_list.append(url)


100%|██████████| 1000/1000 [36:02<00:00,  2.16s/it]


In [23]:
triplet_list

[[{'head': 'table sawing', 'type': 'subclass of', 'tail': 'machine'}],
 [{'head': 'table sawing machine',
   'type': 'subclass of',
   'tail': 'white background'}],
 [{'head': 'table sawing machine',
   'type': 'has part',
   'tail': 'white background'}],
 [{'head': 'recliner', 'type': 'subclass of', 'tail': 'chair'}],
 [{'head': 'leather', 'type': 'subclass of', 'tail': 'chair'}],
 [{'head': 'chair', 'type': 'different from', 'tail': 'recliner'}],
 [{'head': 'sled', 'type': 'subclass of', 'tail': 'dog team'}],
 [{'head': 'sled', 'type': 'subclass of', 'tail': 'dog'}],
 [{'head': 'sled', 'type': 'used by', 'tail': 'dog team'}],
 [{'head': 'police officer', 'type': 'employer', 'tail': 'police'}],
 [{'head': "'a pair of key chains with a police officer on them'",
   'type': 'instance of',
   'tail': 'key chain'}],
 [{'head': 'police officer',
   'type': 'field of this occupation',
   'tail': 'police'}],
 [{'head': 'computer rendering', 'type': 'part of', 'tail': 'computer'}],
 [{'head': 

In [24]:
df_rebel=pd.DataFrame({'url':url_list,'triplet':triplet_list})
df_rebel.to_csv('rebel_babelscape_rebel-large.csv',index=False)
df_rebel.head()

Unnamed: 0,url,triplet
0,https://www.equipmentsalesandsurplus.com/v/vsp...,"[{'head': 'table sawing', 'type': 'subclass of..."
1,https://www.equipmentsalesandsurplus.com/v/vsp...,"[{'head': 'table sawing machine', 'type': 'sub..."
2,https://www.equipmentsalesandsurplus.com/v/vsp...,"[{'head': 'table sawing machine', 'type': 'has..."
3,https://texasfurniturehut.com/images/thumbs/00...,"[{'head': 'recliner', 'type': 'subclass of', '..."
4,https://texasfurniturehut.com/images/thumbs/00...,"[{'head': 'leather', 'type': 'subclass of', 't..."
