# DistilBERT
- Overview: DistilBERT is a smaller, faster, and lighter version of BERT, trained using knowledge distillation to retain much of the performance of the full BERT model.
- Size: About 66 million parameters, making it around 40% smaller and 60% faster than BERT.
- Usage: It can perform named entity recognition (NER) to identify objects and relevant attributes (like colors and positions) in sentences. You can fine-tune DistilBERT on a small dataset of descriptions to extract objects and attributes with higher precision.
- Implementation: Using Hugging Face’s Transformers library, DistilBERT can be fine-tuned for entity recognition on your specific task.

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
from transformers import pipeline

# Check if GPU is available and set device accordingly
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'cuda' if device == 0 else 'cpu'}")

# Load DistilBERT for fill-mask (masked language modeling)
fill_mask = pipeline("fill-mask", model="distilbert-base-uncased", device=device)

# Sample description text
description = "In this image, I can see a dog which is black, brown, and white in color laying on the ground. " \
              "I can also see two balls which are green and purple in color on the grass. " \
              "In the background, there are a few plants which are purple and blue, the ground, a few trees, and the sky."

# Define a list to store extracted objects and their attributes
objects = []

# Define function to identify objects and attributes
def extract_objects(description):
    # Define prompts to extract objects and their colors
    sentences = [
        f"In the image, there is a [MASK].",
        f"The [MASK] is black, brown, and white.",
        f"I see [MASK] balls which are green and purple.",
        f"There are [MASK] plants which are purple and blue.",
    ]
    
    for sentence in sentences:
        results = fill_mask(sentence)
        # Filter top prediction
        best_match = results[0]['token_str']
        objects.append(best_match)
    
    return objects

# Run the function
extracted_objects = extract_objects(description)

# Print extracted objects and their descriptions
print("Extracted Objects and Attributes:")
for obj in extracted_objects:
    print(f"Object: {obj}")


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load MiniLM for question answering
model_name = "microsoft/MiniLM-L12-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Check if GPU is available and set device accordingly
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'cuda' if device == 0 else 'cpu'}")

# Initialize the question-answering pipeline
qa_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

# Define a list to store extracted objects and their attributes
objects = []




# Distilbert and Spacy

In [11]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/home/g22/GitHub/notebooks/env/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/home/g22/GitHub/notebooks/env/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py", line 57, in main
    ser

In [50]:
import spacy
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
from transformers import pipeline

# Check if GPU is available and set device accordingly
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'cuda' if device == 0 else 'cpu'}")

# Load spaCy and DistilBERT models
nlp_spacy = spacy.load("en_core_web_sm")  # Lightweight spaCy model for parsing
tokenizer = DistilBertTokenizer.from_pretrained("elastic/distilbert-base-uncased-finetuned-conll03-english")
distilbert_model = DistilBertForTokenClassification.from_pretrained("elastic/distilbert-base-uncased-finetuned-conll03-english")

# Use Hugging Face pipeline for NER with DistilBERT
ner_pipeline = pipeline("ner", model=distilbert_model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# Sample input text
text = """In this image I can see a dog which is black, brown and white in color laying on the ground.
I can also see two balls which are green and purple in color on the grass.
In the background I can see a few plants which are purple and blue in color, the ground, a few trees, and the sky."""

# Step 1: Extract Entities Using DistilBERT NER
# Step 1: Extract Entities Using DistilBERT NER
entities = ner_pipeline(text)

# Print extracted entities
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")


Using device: cuda


# SPACY ONLY

## Example Code with SpaCy for NER and extracting object attributes:

In [None]:
import spacy

# Load the pre-trained SpaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = """
In this image I can see a dog which is black, brown and white in color laying on the ground.
"""

# Process the text with SpaCy NLP pipeline
doc = nlp(text)

# Initialize a list to hold the extracted objects with their attributes
extracted_objects = []

# Loop through the entities identified by SpaCy
for ent in doc.ents:
    entity_info = {}
    
    # Check the entity type and assign relevant attributes
    if ent.label_ == "ORG":
        entity_info["entity"] = ent.text
        entity_info["type"] = "Organization"
        # Check if there is a location mentioned for the organization
        for token in doc:
            if token.dep_ == "pobj" and token.head.text in ["in", "at"]:
                entity_info["location"] = token.text

    elif ent.label_ == "PRODUCT":
        entity_info["entity"] = ent.text
        entity_info["type"] = "Product"
        # Check if a price is mentioned after the product
        for token in doc:
            if token.text.startswith("$"):
                entity_info["price"] = token.text
    
    elif ent.label_ == "GPE":  # Geopolitical Entity (Country, City, etc.)
        entity_info["entity"] = ent.text
        entity_info["type"] = "Location"

    elif ent.label_ == "PERSON":
        entity_info["entity"] = ent.text
        entity_info["type"] = "Person"  

    elif ent.label_ == "OBJECT":
        entity_info["entity"] = ent.text
        entity_info["type"] = "Object"
        # Check for color and quantity descriptors
        for token in doc:
            if token.dep_ == "amod" and token.head.text == ent.text:
                entity_info["color"] = token.text
            elif token.dep_ == "nummod" and token.head.text == ent.text:
                entity_info["quantity"] = token.text
    
    # Add the object with its attributes to the list
    if entity_info:
        extracted_objects.append(entity_info)

# Display the extracted objects and their attributes
for obj in extracted_objects:
    print(obj)



AttributeError: 'spacy.tokens.token.Token' object has no attribute 'label_'

In [85]:
import spacy

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Function to extract objects from a paragraph and make them singular
def extract_objects_from_paragraph(paragraph):
    doc = nlp(paragraph)
    objects = []
    for sent in doc.sents:  # Iterate through each sentence in the paragraph
        sentence_objects = []
        for token in sent:
            # Check if the token is an object (direct, indirect, or prepositional)
            if token.dep_ in ("dobj", "iobj"):#, "pobj"):
                # Append the lemmatized form of the object (singular form)
                sentence_objects.append(token.lemma_)
        objects.append((sent.text, sentence_objects))
    return objects

# Example paragraph
paragraph = """
In this image I can see a dog which is black, brown and white in color laying on the ground.
I can also see two balls which are green and purple in color on the grass.
In the background I can see a few plants which are purple and blue in color, the ground, a few trees, and the sky.
"""

# Extract objects
objects = extract_objects_from_paragraph(paragraph)
for sentence, obj_list in objects:
    print(f"Sentence: {sentence}")
    print(f"Extracted objects: {obj_list}")
    print()


Sentence: 
In this image I can see a dog which is black, brown and white in color laying on the ground.

Extracted objects: ['dog']

Sentence: I can also see two balls which are green and purple in color on the grass.

Extracted objects: ['ball']

Sentence: In the background I can see a few plants which are purple and blue in color, the ground, a few trees, and the sky.

Extracted objects: ['plant']



In [87]:
import spacy

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Function to extract objects from a paragraph, make them singular, and combine them into one string
def extract_and_combine_objects(paragraph):
    doc = nlp(paragraph)
    all_objects = []
    for sent in doc.sents:  # Iterate through each sentence in the paragraph
        for token in sent:
            # Check if the token is an object (direct, indirect, or prepositional)
            if token.dep_ in ("dobj", "iobj", "pobj"):
                # Append the lemmatized form of the object (singular form)
                all_objects.append(token.lemma_)
    
    # Join all objects into a single string
    combined_objects = ' '.join(all_objects)
    return combined_objects

# Example paragraph
paragraph = """
She gave the books to her friends in the libraries. Later, she met with her professors to discuss the projects. 
The students were completing their assignments in the classrooms while the teachers graded exams.
"""

# Extract and combine objects into a single string
combined_objects = extract_and_combine_objects(paragraph)
print(f"Combined objects: {combined_objects}")


Combined objects: book friend library professor project assignment classroom exam


In [59]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I can see a brown box on the floor.")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [53]:

import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("I can see a dog which is black, brown and white in color laying on the ground.")

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities
for entity in doc.ents:
    print(entity.text, entity.label_)


Noun phrases: ['I', 'a dog', 'which', 'the ground']
Verbs: ['see', 'lay']


In [55]:
import spacy

# Load spaCy English model
nlp_spacy = spacy.load("en_core_web_sm")

# Sample text
text = """I can see a dog which is black, brown and white in color laying on the ground."""

# Process the text using spaCy
doc = nlp_spacy(text)

# Define a function to extract the objects and attributes
def extract_objects_and_attributes(doc):
    extracted_info = {}
    
    # Iterate over the tokens in the document
    for token in doc:
        if token.pos_ == "NOUN" or token.pos_ == "PROPN":  # Object candidates are nouns and proper nouns
            object_name = token.lemma_.lower()

            if object_name not in extracted_info:
                extracted_info[object_name] = {"color": [], "position": [], "quantity": None}
            
            # Extract adjectives (color) and quantities
            for child in token.children:
                if child.dep_ == "amod" and child.pos_ == "ADJ":  # Color adjectives
                    extracted_info[object_name]["color"].append(child.text)
                elif child.dep_ == "nummod":  # Quantities
                    extracted_info[object_name]["quantity"] = child.text
                elif child.dep_ == "prep":  # Prepositions indicating position
                    for grandchild in child.children:
                        if grandchild.dep_ == "pobj" and grandchild.pos_ == "NOUN":
                            extracted_info[object_name]["position"].append(grandchild.text)

    # Clean up results, removing unnecessary empty values
    for obj in extracted_info:
        if not extracted_info[obj]["color"]:
            extracted_info[obj]["color"] = None
        if not extracted_info[obj]["position"]:
            extracted_info[obj]["position"] = None
        if not extracted_info[obj]["quantity"]:
            extracted_info[obj]["quantity"] = 1  # Default to 1 if not specified

    return extracted_info

# Run the extraction function
extracted_info = extract_objects_and_attributes(doc)
for obj, attributes in extracted_info.items():
    print(f"Object: {obj}")
    print(f"Attributes: {attributes}")
    print()

Object: dog
Attributes: {'color': None, 'position': None, 'quantity': 1}

Object: color
Attributes: {'color': None, 'position': None, 'quantity': 1}

Object: ground
Attributes: {'color': None, 'position': None, 'quantity': 1}



In [6]:
import spacy

# Load spaCy English model
nlp_spacy = spacy.load("en_core_web_sm")

# Sample text
text = """In this image I can see a dog which is black, brown and white in color laying on the ground.
I can also see two balls which are green and purple in color on the grass.
In the background I can see a few plants which are purple and blue in color, the ground, a few trees, and the sky."""

# Process the text using spaCy
doc = nlp_spacy(text)

# Define a function to extract the objects and attributes
def extract_objects_and_attributes(doc):
    extracted_info = {}

    # Iterate over the tokens in the document
    for token in doc:
        # Object candidates are nouns and proper nouns
        if token.pos_ == "NOUN" or token.pos_ == "PROPN":  
            object_name = token.lemma_.lower()

            if object_name not in extracted_info:
                extracted_info[object_name] = {"color": [], "position": [], "quantity": None}

            # Extract adjectives (color) and quantities
            for child in token.children:
                if child.dep_ == "amod" and child.pos_ == "ADJ":  # Color adjectives
                    extracted_info[object_name]["color"].append(child.text)
                elif child.dep_ == "nummod":  # Quantities
                    extracted_info[object_name]["quantity"] = child.text
                elif child.dep_ == "prep" and child.pos_ == "NOUN":  # Positioning (e.g., "on the grass")
                    extracted_info[object_name]["position"].append(child.text)

    # Extracting more complex color phrases (like "black, brown and white in color")
    for ent in doc.ents:
        if ent.label_ == "COLOR":  # Handle color-based entities
            object_name = ent.text.lower()
            for key in extracted_info:
                if key in object_name:
                    extracted_info[key]["color"] = extracted_info[key].get("color", []) + [ent.text]

    # Clean up results, removing unnecessary empty values
    for obj in extracted_info:
        if not extracted_info[obj]["color"]:
            extracted_info[obj]["color"] = None
        if not extracted_info[obj]["position"]:
            extracted_info[obj]["position"] = None
        if not extracted_info[obj]["quantity"]:
            extracted_info[obj]["quantity"] = 1  # Default to 1 if not specified

    return extracted_info

# Run the extraction function
extracted_info = extract_objects_and_attributes(doc)
print(extracted_info)


{'image': {'color': None, 'position': None, 'quantity': 1}, 'dog': {'color': None, 'position': None, 'quantity': 1}, 'color': {'color': None, 'position': None, 'quantity': 1}, 'ground': {'color': None, 'position': None, 'quantity': 1}, 'ball': {'color': None, 'position': None, 'quantity': 'two'}, 'grass': {'color': None, 'position': None, 'quantity': 1}, 'background': {'color': None, 'position': None, 'quantity': 1}, 'plant': {'color': None, 'position': None, 'quantity': 'few'}, 'tree': {'color': ['few'], 'position': None, 'quantity': 1}, 'sky': {'color': None, 'position': None, 'quantity': 1}}


In [76]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

# Ensure you have the necessary NLTK data files
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker_tab')

text = """In this image I can see a dog which is black, brown and white in color laying on the ground.
I can also see two balls which are green and purple in color on the grass.
In the background I can see a few plants which are purple and blue in color, the ground, a few trees, and the sky."""

# Tokenize the text into sentences
sentences = nltk.sent_tokenize(text)

# Function to extract noun phrases
def extract_objects(sentence):
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)
    chunked = ne_chunk(pos_tags)
    iob_tagged = tree2conlltags(chunked)
    
    objects = []
    for word, pos, chunk in iob_tagged:
        if chunk == 'B-NP' or chunk == 'I-NP':
            objects.append(word)
    return objects

# Extract objects from each sentence
all_objects = []
for sentence in sentences:
    objects = extract_objects(sentence)
    all_objects.extend(objects)

# Print the extracted objects
print(all_objects)

[nltk_data] Downloading package punkt to /home/g22/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/g22/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/g22/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/g22/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/g22/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


[]


In [77]:
import spacy

# Install spaCy and download the English model
# !pip install spacy
# !python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

text = """In this image I can see a dog which is black, brown and white in color laying on the ground.
I can also see two balls which are green and purple in color on the grass.
In the background I can see a few plants which are purple and blue in color, the ground, a few trees, and the sky."""

# Process the text with spaCy
doc = nlp(text)

# Extract objects based on dependency parsing
objects = [chunk.text for chunk in doc.noun_chunks]

# Print the extracted objects
print(objects)

['this image', 'I', 'a dog', 'which', 'the ground', 'I', 'two balls', 'which', 'color', 'the grass', 'the background', 'I', 'a few plants', 'which', 'color', 'the ground', 'a few trees', 'the sky']
