## Assignment 4

### Build a Named Entity Recognition (NER) system for extracting entities from real-world text such as news articles or social media data, and measure its accuracy, precision, recall, and F1-score.

In [2]:
# One-time environment setup — run these commands in a terminal before executing the notebook:
#   pip install spacy scikit-learn pandas
#   python -m spacy download en_core_web_sm

In [4]:
import spacy
import pandas as pd

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [6]:
# Load the "en_core_web_sm" spaCy pipeline; every later cell that extracts
# entities goes through this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")
# Bare expression so the notebook displays the loaded pipeline object.
nlp

<spacy.lang.en.English at 0x254e35dc830>

In [7]:
# Hand-labelled evaluation set. Each record holds:
#   "text"     - the sentence to run through the NER pipeline
#   "entities" - the expected entities as (entity string, label) tuples
data = [
    {
        "text": "Elon Musk is the CEO of Tesla and SpaceX.",
        "entities": [("Elon Musk", "PERSON"), ("Tesla", "ORG"), ("SpaceX", "ORG")]
    },
    {
        "text": "Apple was founded by Steve Jobs in California.",
        "entities": [("Apple", "ORG"), ("Steve Jobs", "PERSON"), ("California", "GPE")]
    },
    {
        "text": "Google announced new AI features in India.",
        "entities": [("Google", "ORG"), ("India", "GPE")]
    },
    {
        "text": "Microsoft acquired OpenAI for billions of dollars.",
        "entities": [("Microsoft", "ORG"), ("OpenAI", "ORG")]
    }
]

# Bare expression so the notebook echoes the dataset.
data

[{'text': 'Elon Musk is the CEO of Tesla and SpaceX.',
  'entities': [('Elon Musk', 'PERSON'), ('Tesla', 'ORG'), ('SpaceX', 'ORG')]},
 {'text': 'Apple was founded by Steve Jobs in California.',
  'entities': [('Apple', 'ORG'),
   ('Steve Jobs', 'PERSON'),
   ('California', 'GPE')]},
 {'text': 'Google announced new AI features in India.',
  'entities': [('Google', 'ORG'), ('India', 'GPE')]},
 {'text': 'Microsoft acquired OpenAI for billions of dollars.',
  'entities': [('Microsoft', 'ORG'), ('OpenAI', 'ORG')]}]

In [8]:
def get_predicted_entities(text, pipeline=None):
    """Run an NER pipeline over ``text`` and return the entities it reports.

    Args:
        text: The raw string to analyse.
        pipeline: Optional callable that takes a string and returns a document
            object exposing ``.ents``, where each entity has ``.text`` and
            ``.label_`` attributes. When omitted, the notebook-level ``nlp``
            pipeline is used, so existing one-argument calls are unchanged.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        pipeline lists them in ``doc.ents``.
    """
    # Resolve the default at call time so the global `nlp` is only required
    # when the caller does not inject a pipeline of their own.
    model = nlp if pipeline is None else pipeline
    doc = model(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [9]:
# Show each sentence alongside the entities the pipeline extracts from it,
# with a blank line separating consecutive examples.
for sample in data:
    sentence = sample["text"]
    print("Text:", sentence)
    print("Predicted:", get_predicted_entities(sentence), end="\n\n")

Text: Elon Musk is the CEO of Tesla and SpaceX.
Predicted: [('Elon Musk', 'PERSON'), ('Tesla', 'ORG')]

Text: Apple was founded by Steve Jobs in California.
Predicted: [('Apple', 'ORG'), ('Steve Jobs', 'PERSON'), ('California', 'GPE')]

Text: Google announced new AI features in India.
Predicted: [('Google', 'ORG'), ('AI', 'GPE'), ('India', 'GPE')]

Text: Microsoft acquired OpenAI for billions of dollars.
Predicted: [('Microsoft', 'ORG'), ('OpenAI', 'ORG'), ('billions of dollars', 'MONEY')]



In [10]:
# Turn entity matching into two parallel binary vectors for scikit-learn.
# An entity counts as matched only when both its string and its label agree.
y_true, y_pred = [], []

for sample in data:
    gold = set(sample["entities"])
    predicted = set(get_predicted_entities(sample["text"]))

    # Every predicted entity is a positive call (y_pred = 1): it is a true
    # positive when it is also in the gold set, otherwise a false positive.
    y_pred.extend(1 for _ in predicted)
    y_true.extend(int(ent in gold) for ent in predicted)

    # Gold entities the model never produced are false negatives.
    missed = gold - predicted
    y_pred.extend(0 for _ in missed)
    y_true.extend(1 for _ in missed)

In [11]:
# Score the binary match vectors. y_true / y_pred never contain a (0, 0) pair
# (no true negatives are recorded), so accuracy here is TP / (TP + FP + FN).
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Report each metric rounded to two decimals.
for label, score in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("F1-score :", f1),
):
    print(label, round(score, 2))

Accuracy : 0.75
Precision: 0.82
Recall   : 0.9
F1-score : 0.86


In [12]:
# Collect one record per sentence with the entities predicted for it, then
# persist the table to CSV (without the index column).
results = [
    {
        "text": sample["text"],
        "predicted_entities": get_predicted_entities(sample["text"]),
    }
    for sample in data
]

df_results = pd.DataFrame(results)
df_results.to_csv("ner_extracted_entities.csv", index=False)

In [13]:
# Bundle the four scores under their display names and write them out as a
# single-row CSV (without the index column).
metrics = dict(
    zip(
        ("Accuracy", "Precision", "Recall", "F1-score"),
        (accuracy, precision, recall, f1),
    )
)

pd.DataFrame([metrics]).to_csv("ner_evaluation_metrics.csv", index=False)