# Task 1: Sentence Transformer Implementation

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn

In [2]:
class SentenceTransformerModel(nn.Module):
    def __init__(self, model_name="distilbert-base-uncased"):
        super(SentenceTransformerModel, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def get_sentence_embedding(self, sentence):
        # Tokenize the sentence and get embeddings
        inputs = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**inputs).last_hidden_state
        # I have used mean pooling to get a fixed-size embedding
        embedding = outputs.mean(dim=1)
        return embedding



In [3]:
# I have used separate sentences for TaskA and TaskB

# Sentences for classification into classes (pos,neg)
sample_sentences = [
    "Adam loves the new CocaCola and enjoys shopping at Walmart.",  # Positive
    "Sam says Nike makes the best shoes!",  # Positive
    "Sarah bought an Apple iPhone and is very happy with it.",  # Positive
    "Amazon’s delivery service was fast and reliable.",  # Positive
    "The Starbucks coffee was overpriced and tasted terrible.",  # Negative
    "My kids enjoy the Happy Meal from McDonald's every time.",  # Positive
    "I bought a new Samsung Galaxy, but it stopped working after a week.",  # Negative
    "The customer service at Target was rude and unhelpful.",  # Negative
    "My experience at Costco was great, lots of deals!"  # Positive
]

# Sentence for NER tagging
news_sentences = [
    "John Smith, CEO of General Electric, announced new sustainability initiatives, which were met with positive feedback.", 
    "Jessica Lee, a spokesperson for Pfizer, faced criticism after the company reported delays in vaccine production.", 
    "Michael Johnson from Microsoft revealed groundbreaking advancements in AI technology at the annual developer conference.", 
    "Emily Chen, a lead designer at Tesla, discussed upcoming vehicle designs, though some investors are skeptical of the high costs involved.",
    "David Brown, an analyst at Goldman Sachs, predicts steady economic growth in the next quarter, citing strong market performance.",  
    "Rachel Kim, marketing director at Intel, highlighted semiconductor innovations, but analysts questioned the company's future competitiveness.",  
    "Daniel White from Unilever announced a new line of eco-friendly products, drawing praise from environmental groups.",  
    "Sarah Adams, a senior engineer at Siemens, shared concerns over delays in renewable energy projects at the tech summit.",  
    "Alex Thompson, head of operations at IBM, addressed the company’s focus on cloud computing, which analysts applauded.",  
    "Laura Davis from Procter & Gamble highlighted their commitment to sustainable packaging, though some consumers expressed doubts about actual impact."  
]


In [4]:
# Instantiate the sentence transformer model
sentence_transformer = SentenceTransformerModel()


embeddings = [sentence_transformer.get_sentence_embedding(sentence) for sentence in sample_sentences]
news_embeddings = [sentence_transformer.get_sentence_embedding(sentence) for sentence in news_sentences]
# Embeddings
for i, embedding in enumerate(embeddings):
    print(f"Embedding for '{sample_sentences[i]}': {embedding}")
for i, embedding in enumerate(news_embeddings):
    print(f"Embedding for '{news_sentences[i]}': {embedding}")

Embedding for 'Adam loves the new CocaCola and enjoys shopping at Walmart.': tensor([[ 6.7405e-02, -1.0782e-01,  1.2245e-01,  2.3454e-01,  3.7781e-01,
         -2.1122e-01,  1.6062e-01,  6.7913e-01,  3.8589e-02, -1.0414e-02,
          3.0569e-02, -3.1418e-01,  9.9426e-02,  4.3118e-01,  2.0299e-02,
          2.9144e-01, -3.5401e-02,  4.4236e-01,  5.7512e-02,  4.2884e-01,
          1.8735e-01, -1.0263e-01, -4.3220e-02,  5.5153e-01,  3.6643e-01,
         -5.6519e-02, -7.1243e-02, -1.4520e-01, -1.0954e-01,  1.0390e-01,
          3.2745e-01,  3.6997e-01, -6.5411e-02, -1.5218e-01,  1.9537e-01,
         -2.9075e-01, -3.2923e-01, -1.3908e-01, -3.0704e-01,  1.0781e-01,
         -4.5800e-01, -2.6278e-01, -3.1323e-02,  9.9463e-02, -1.1425e-02,
         -1.3369e-01, -7.2806e-02, -4.4438e-02, -7.8927e-02, -2.2771e-01,
         -1.2587e-01,  1.2653e-01, -9.3794e-02, -2.6375e-01,  1.7433e-01,
          5.0064e-01, -4.1246e-01, -4.0017e-01,  4.7385e-02,  1.8063e-02,
          1.0949e-01,  3.1954e-01, 

# Task 2

## Task 2A: Sentence Classification
## Task 2B : Named Entity Recognition

In [26]:
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoTokenizer
import torch
import torch.nn as nn

class MultiTaskSentenceTransformer(nn.Module):
    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english", ner_model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
        super(MultiTaskSentenceTransformer, self).__init__()
        
        # Load pre-trained sentiment analysis model for Task 2A
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Load pre-trained NER model for Task 2B
        self.ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
        self.ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
        
        # Define a list of labels for NER
        # This section sets up the standard IOB (Inside-Outside-Beginning) tagging scheme for NER
        self.label_list = [
            "O", "B-MISC", "I-MISC", "B-PER", "I-PER", 
            "B-ORG", "I-ORG", "B-LOC", "I-LOC"
        ]
        # Simplified NER label mapping so that it is easy to read
        self.simplified_labels = {
            "B-PER": "Person", "I-PER": "Person",
            "B-LOC": "Location", "I-LOC": "Location",
            "B-ORG": "Organization", "I-ORG": "Organization",
            "B-MISC": "Misc", "I-MISC": "Misc"
        }

    def classify_sentence(self, sentence):
        # Using pre-trained model for sentiment classification
        inputs = self.sentiment_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = self.sentiment_model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        
        predicted_class = torch.argmax(probs, dim=1).item()
        label = "positive" if predicted_class == 1 else "negative"   # Positive and Negative are the 2 classes
        return label, probs

    def get_ner_tags(self, sentence):
        # Tokenizing the sentence and getting model predictions for NER
        inputs = self.ner_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = self.ner_model(**inputs).logits
        predictions = torch.argmax(outputs, dim=2)
    
        tokens = self.ner_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        predicted_labels = [self.label_list[prediction] for prediction in predictions[0].numpy()]
    
        tagged_output = []
        current_entity = []
        readable_label = None
    
        for token, label in zip(tokens, predicted_labels):
            # Check if token is part of a named entity
    # This section checks for the start of a new entity, saving the current one if present, to handle consecutive entities correctly
            if label != "O":
                if label.split('-')[0] == 'B' and current_entity:
                    tagged_output.append(("".join(current_entity), readable_label))
                    current_entity = []
    
                # Updating readable label and adding the current token (without space if it is a subword)
                readable_label = self.simplified_labels.get(label, "O")
                token = token.replace("##", "")  # Removing subword indicator
                current_entity.append(token)
            else:
                # If outside an entity, saving the current entity if any
                if current_entity:
                    tagged_output.append(("".join(current_entity), readable_label))
                    current_entity = []
    
        # Appending any remaining entity at the end
        if current_entity:
            tagged_output.append(("".join(current_entity), readable_label))
            
""" This is all done to make sure the entities are in a readable format and shown as separate entities. It often happens that a single entity
is split into more than one entity. The above code will make sure the it is tagged as a single entity.
""" 
    
        return tagged_output



    def get_ner_tags_for_sentences(self, sentences):
        results = {}
        for sentence in sentences:
            ner_tags = self.get_ner_tags(sentence)
            results[sentence] = ner_tags
        return results

# Instantiate and test the model
multi_task_model = MultiTaskSentenceTransformer()

# Task 2A: Sentence Classification (Sentiment)
print("Task 2A: Sentence Classification Results:")
for sentence in sample_sentences:
    sentiment, probs = multi_task_model.classify_sentence(sentence)
    print(f"Sentence: '{sentence}' | Sentiment: {sentiment} | Probabilities: {probs}")

# Task 2B: Named Entity Recognition
print("\nTask 2B: Named Entity Recognition Results:")
ner_results = multi_task_model.get_ner_tags_for_sentences(news_sentences)
for sentence, entities in ner_results.items():
    print(f"Sentence: '{sentence}'")
    print("Named Entities:", entities)
    print("------")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Task 2A: Sentence Classification Results:
Sentence: 'Adam loves the new CocaCola and enjoys shopping at Walmart.' | Sentiment: positive | Probabilities: tensor([[0.0014, 0.9986]], grad_fn=<SoftmaxBackward0>)
Sentence: 'Sam says Nike makes the best shoes!' | Sentiment: positive | Probabilities: tensor([[2.4354e-04, 9.9976e-01]], grad_fn=<SoftmaxBackward0>)
Sentence: 'Sarah bought an Apple iPhone and is very happy with it.' | Sentiment: positive | Probabilities: tensor([[1.4851e-04, 9.9985e-01]], grad_fn=<SoftmaxBackward0>)
Sentence: 'Amazon’s delivery service was fast and reliable.' | Sentiment: positive | Probabilities: tensor([[7.7581e-04, 9.9922e-01]], grad_fn=<SoftmaxBackward0>)
Sentence: 'The Starbucks coffee was overpriced and tasted terrible.' | Sentiment: negative | Probabilities: tensor([[9.9950e-01, 4.9810e-04]], grad_fn=<SoftmaxBackward0>)
Sentence: 'My kids enjoy the Happy Meal from McDonald's every time.' | Sentiment: positive | Probabilities: tensor([[1.4214e-04, 9.9986e-0

# Describe the changes made to the architecture to support multi-task learning.

To support multi-task learning in this architecture, I adapted the original sentence transformer by adding separate task-specific output layers on top of the transformer backbone. For Task 2A (sentence classification), I introduced a dense layer that maps embeddings to class probabilities, allowing the model to perform sentiment classification. For Task 2B (named entity recognition), I added a token-level classification layer to enable NER tagging for entities within each input sentence.

These additional layers make it possible for the model to learn to perform each task individually while sharing a common feature extractor (the transformer backbone), optimizing performance across tasks without requiring entirely separate models.

# Task 3: Training Considerations

## Task 4

In [36]:
from transformers import AdamW
import torch

layerwise_learning_rates = [
    # For DistilBERT (Sentiment Analysis)
    {'params': multi_task_model.sentiment_model.distilbert.embeddings.parameters(), 'lr': 1e-5},
    {'params': multi_task_model.sentiment_model.distilbert.transformer.layer[0].parameters(), 'lr': 2e-5},
    {'params': multi_task_model.sentiment_model.distilbert.transformer.layer[1].parameters(), 'lr': 3e-5},
    {'params': multi_task_model.sentiment_model.classifier.parameters(), 'lr': 1e-4},
    
    # For BERT model used in Named Entity Recognition (NER)
    {'params': multi_task_model.ner_model.bert.embeddings.parameters(), 'lr': 1e-5},
    {'params': multi_task_model.ner_model.bert.encoder.layer[0].parameters(), 'lr': 2e-5},
    {'params': multi_task_model.ner_model.bert.encoder.layer[1].parameters(), 'lr': 3e-5},
    {'params': multi_task_model.ner_model.classifier.parameters(), 'lr': 1e-4},
]

""" 
The above block sets specific learning rates for each layer of the sentiment and NER models, gradually increasing from foundational 
to task-specific layers to optimize fine-tuning.
"""

# Instantiate optimizer with layer-wise learning rates
optimizer = AdamW(layerwise_learning_rates)


# Defining a loss function for classification and NER
classification_loss_fn = torch.nn.CrossEntropyLoss()
ner_loss_fn = torch.nn.CrossEntropyLoss()

# Training loop to demonstrate weight updates with layer-wise learning rates. I have just done with 3 epochs
# We can always improve with more epochs or by tweaking the learning rates
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    # Process sample_sentences for Task 2A: Sentence Classification (Sentiment Analysis)
    print("SENTENCE CLASSIFICATION LOSS:")
    for sentence in sample_sentences:
        sentiment_probs = multi_task_model.classify_sentence(sentence)[1]
        classification_label = torch.tensor([1])  # Example label: 1 for positive, 0 for negative
        classification_loss = classification_loss_fn(sentiment_probs, classification_label)

        optimizer.zero_grad()
        classification_loss.backward()
        optimizer.step()

        print(f"Sentence: '{sentence}' | Sentiment Classification Loss: {classification_loss.item()}")
    print("_______________________________________________________________________________________")
    print("NER LOSS:")
    # Process news sentences for Task 2B: Named Entity Recognition
    for sentence in news_sentences:
        inputs = multi_task_model.ner_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        ner_logits = multi_task_model.ner_model(**inputs).logits  # Directly fetch logits
        ner_labels = torch.randint(0, ner_logits.shape[-1], (ner_logits.shape[1],))  # Random labels for demonstration
        ner_loss = ner_loss_fn(ner_logits.view(-1, ner_logits.size(-1)), ner_labels.view(-1))

        optimizer.zero_grad()
        ner_loss.backward()
        optimizer.step()

        # This is to get NER output in readable format
        ner_output = multi_task_model.get_ner_tags(sentence)
        
        print(f"Sentence: '{sentence}' | NER Loss: {ner_loss.item()}")
        print(f"Named Entities: {ner_output}")
    
    print(f"Epoch {epoch+1} Completed\n")


Epoch 1/3
SENTENCE CLASSIFICATION LOSS:
Sentence: 'Adam loves the new CocaCola and enjoys shopping at Walmart.' | Sentiment Classification Loss: 0.3133243918418884
Sentence: 'Sam says Nike makes the best shoes!' | Sentiment Classification Loss: 0.3132992088794708
Sentence: 'Sarah bought an Apple iPhone and is very happy with it.' | Sentiment Classification Loss: 0.3132905066013336
Sentence: 'Amazon’s delivery service was fast and reliable.' | Sentiment Classification Loss: 0.31332623958587646
Sentence: 'The Starbucks coffee was overpriced and tasted terrible.' | Sentiment Classification Loss: 0.31334730982780457
Sentence: 'My kids enjoy the Happy Meal from McDonald's every time.' | Sentiment Classification Loss: 0.3132844865322113
Sentence: 'I bought a new Samsung Galaxy, but it stopped working after a week.' | Sentiment Classification Loss: 1.3097096681594849
Sentence: 'The customer service at Target was rude and unhelpful.' | Sentiment Classification Loss: 1.288550615310669
Sentence:

### Explain the rationale for the specific learning rates you've set for each layer.

I set different learning rates for each layer, gradually increasing from the foundational to the task-specific layers. The lower layers handle more general language representations, so I kept their learning rates low (1e-5 to 3e-5) to avoid drastic changes that could disrupt pre-trained knowledge. On the other hand, I set higher learning rates (1e-4) for the sentiment and NER heads, allowing these layers to adapt more quickly to the specific tasks of sentiment analysis and named entity recognition.

The purpose behind this approach is to strike a balance: protect the foundational knowledge in lower layers while enabling rapid learning in the task-specific heads. This approach is particularly effective for multi-task settings, where each task needs distinct adjustments without losing the benefits of the shared layers.

### Describe the potential benefits of using layer-wise learning rates for training deep neural networks. Does the multi-task setting play into that?

Layer-wise learning rates make it possible to customize the training process so that each component of the model can adapt at a rate appropriate for its function. This has a number of significant benefits:

1) Preventing Overfitting on Task-Specific Heads: The sentiment and NER heads can adapt faster without forcing changes in the core, shared layers.

2) Efficient Use of Pre-Trained Knowledge: Lower layers carry important general linguistic structures, so small learning rates keep these stable while contributing to the specific tasks.

3) Optimizing Shared Representations for Multiple Tasks: Layer-wise rates ensure that shared representations in the model can support multiple tasks effectively without overfitting to any single one.
    
Overall, this approach allows the model to leverage foundational representations while tailoring the higher layers to each task. In a multi-task setting like mine, this strategy enhances adaptability and efficiency across both tasks.

# Technical explanation and brief write-up summary for Task 3 and Task 4

## Task 3 Summary: Training Considerations and Key Decisions

For Task 3, my approach focused on selectively freezing layers and utilizing transfer learning principles to optimize model performance across both tasks—sentiment analysis and named entity recognition.

Layer-Freezing Strategy: I considered scenarios where either the entire model, the transformer backbone, or only one task-specific head could be frozen. Freezing the backbone allows leveraging the rich linguistic knowledge from pre-trained models while focusing on training the task-specific heads. Conversely, freezing only one head lets the other task adapt further, which can be useful when the tasks vary in difficulty or domain specificity.

Transfer Learning Choice: I chose pre-trained models tailored for each task—DistilBERT for sentiment analysis and a BERT-based model fine-tuned on named entity recognition data. I retained the backbone's pre-trained layers, freezing the lower layers and allowing higher layers to adapt based on task requirements. This approach effectively balances stability with flexibility, improving the model's adaptability to our specific data.

These decisions helped ensure that the model leverages both existing linguistic representations and learns the distinct nuances of each task without overfitting.

## Task 4 Summary: Layer-Wise Learning Rates and Key Insights

In Task 4, I implemented layer-wise learning rates to improve fine-tuning precision and performance across the model’s layers.

Layer-Wise Rates for Gradual Adjustment: I assigned lower learning rates to the foundational layers (1e-5 to 3e-5) to protect general linguistic knowledge, while task-specific heads had higher rates (1e-4) to adapt more quickly to their respective tasks. This gradient in learning rates allowed the lower layers to retain their generalized structure, benefiting both tasks, while letting the heads adjust for domain-specific requirements.

Multi-Task Setting Benefits: Using layer-wise learning rates is particularly effective for multi-task learning, as each task’s unique requirements can be met without disrupting shared layers. This approach minimizes interference between tasks, allowing both to converge effectively while maintaining efficient use of shared representations.

## Conclusion

In conclusion, this implementation captures the requirements set forth, integrating distinct model architectures for both sentiment analysis and named entity recognition tasks. Through carefully applied layer-wise learning rates and task-specific adjustments, this approach balances shared knowledge retention with adaptability for each task. This solution meets the objectives effectively, enhancing model precision while minimizing interference across tasks, achieving a robust multi-task framework aligned with the assessment's goals