# CS 585 - HW 4 - EDUARDO GALEOTE - A20552496

#### PROBLEM 1 – Reading the data in CoNLL format

In [1]:
def read_conll_format(file_path):
    """Read a tab-separated CoNLL-style file into token and tag sequences.

    Every non-blank line describes one token as tab-separated columns: the
    first column is taken as the token and the last column as its tag, so
    files carrying extra columns between the two are accepted as well.
    Blank lines separate sequences; runs of blank lines never produce empty
    sequences.

    Args:
        file_path: Path of the file to read (opened as UTF-8 text).

    Returns:
        A tuple ``(sequences, tags)`` of two lists with the same length.
        ``sequences[i]`` is the list of tokens of the i-th sequence and
        ``tags[i]`` is the list of tags aligned with those tokens.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated
            columns. The message names the file and the 1-based line number.
    """
    sequences = []
    tags = []

    current_sequence = []
    current_tags = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()

            if not line:
                # A blank line closes the sequence collected so far (if any)
                if current_sequence:
                    sequences.append(current_sequence)
                    tags.append(current_tags)
                    current_sequence = []
                    current_tags = []
                continue

            columns = line.split('\t')
            if len(columns) < 2:
                # Fail with the position of the bad line instead of an
                # opaque tuple-unpacking error
                raise ValueError(
                    f"{file_path}: line {line_number}: expected "
                    f"'token<TAB>tag', got {line!r}"
                )

            # Token is the first column, tag is the last one
            current_sequence.append(columns[0])
            current_tags.append(columns[-1])

    # The file may end without a trailing blank line
    if current_sequence:
        sequences.append(current_sequence)
        tags.append(current_tags)

    return sequences, tags


In [2]:
# Locations of the two CoNLL-formatted splits
train_file_path = 'train.tsv'
test_file_path = 'test.tsv'

# Parse each split into parallel lists of token sequences and tag sequences
train_token_sequences, train_tag_sequences = read_conll_format(train_file_path)
test_token_sequences, test_tag_sequences = read_conll_format(test_file_path)

# Report how many sequences each split contains
num_train_sequences = len(train_token_sequences)
num_test_sequences = len(test_token_sequences)
print(f"Number of sequences in the training dataset: {num_train_sequences}")
print(f"Number of sequences in the testing dataset: {num_test_sequences}")

# Show the first training sequence, one "token (tag)" pair per line
print("\nTokens and tags of the first sequence in the training dataset:")
first_tokens = train_token_sequences[0]
first_tags = train_tag_sequences[0]
for position in range(min(len(first_tokens), len(first_tags))):
    print(f"{first_tokens[position]} ({first_tags[position]})")


Number of sequences in the training dataset: 5432
Number of sequences in the testing dataset: 940

Tokens and tags of the first sequence in the training dataset:
Identification (O)
of (O)
APC2 (O)
, (O)
a (O)
homologue (O)
of (O)
the (O)
adenomatous (B-Disease)
polyposis (I-Disease)
coli (I-Disease)
tumour (I-Disease)
suppressor (O)
. (O)


#### PROBLEM 2 – Data Discovery

In [3]:
from collections import Counter

# Collect every tag of every training sequence into one flat list
flat_tags = []
for sequence_tags in train_tag_sequences:
    flat_tags.extend(sequence_tags)

# Tally how often each tag occurs
tag_counts = Counter(flat_tags)

# Report the tally, one "tag: count" line per tag (first-seen order)
print("Count of each tag in the training data:")
for label in tag_counts:
    print(f"{label}: {tag_counts[label]}")


Count of each tag in the training data:
O: 124819
B-Disease: 5145
I-Disease: 6122


In [4]:
def extract_disease_phrases(token_sequences, tag_sequences):
    """Collect the disease mentions marked by B-Disease/I-Disease tags.

    A mention starts at a token tagged 'B-Disease' and is extended by each
    directly following token tagged 'I-Disease'. Any other tag, a new
    'B-Disease', or the end of the sequence closes the open mention. An
    'I-Disease' token with no open mention is ignored.

    Args:
        token_sequences: Iterable of token lists, one per sequence.
        tag_sequences: Iterable of tag lists aligned with token_sequences.

    Returns:
        List of mentions in order of appearance, each one the mention's
        tokens joined by a single space.
    """
    collected = []

    for words, labels in zip(token_sequences, tag_sequences):
        span = []
        for word, label in zip(words, labels):
            # An inside-tag only counts when a mention is currently open
            if label == 'I-Disease' and span:
                span.append(word)
                continue

            # Every other case ends the open mention, if there is one
            if span:
                collected.append(' '.join(span))
                span = []

            # A begin-tag opens a fresh mention with the current token
            if label == 'B-Disease':
                span.append(word)

        # A mention may run up to the last token of the sequence
        if span:
            collected.append(' '.join(span))

    return collected

# Gather every disease mention found in the training split
disease_phrases = extract_disease_phrases(train_token_sequences, train_tag_sequences)

# Tally identical mentions
disease_phrase_counts = Counter(disease_phrases)

# Print the 20 most frequent mentions as "phrase: count"
print("\nCommon diseases that appear together:")
top_phrases = disease_phrase_counts.most_common(20)
for entry in top_phrases:
    print(f"{entry[0]}: {entry[1]}")



Common diseases that appear together:
DM: 120
DMD: 109
APC: 91
ALD: 86
PWS: 75
WAS: 63
myotonic dystrophy: 57
G6PD deficiency: 57
HD: 55
PKU: 52
aniridia: 50
Duchenne muscular dystrophy: 44
breast cancer: 42
VHL: 40
FAP: 39
cancer: 37
tumor: 37
A - T: 37
Tay - Sachs disease: 35
TSD: 35


#### PROBLEM 3 – Building Features

In [5]:
def token_features(token_sequence, tag_sequence, position):
    """Build the string features describing one token within its sequence.

    Args:
        token_sequence: List of tokens forming the sequence.
        tag_sequence: Tags of the sequence; not read by this function.
        position: Index of the token to describe.

    Returns:
        List of five "name=value" strings, in this order: the lowercased
        word, its last three characters, the lowercased previous word (or
        "BOS" for the first token), the lowercased next word (or "EOS" for
        the last token), and "NUM"/"NONNUM" depending on whether the word
        consists only of digits.
    """
    word = token_sequence[position].lower()
    last_index = len(token_sequence) - 1

    # Left neighbour, with a sentinel at the start of the sequence
    if position == 0:
        left_neighbour = "BOS"
    else:
        left_neighbour = token_sequence[position - 1].lower()

    # Right neighbour, with a sentinel at the end of the sequence
    if position == last_index:
        right_neighbour = "EOS"
    else:
        right_neighbour = token_sequence[position + 1].lower()

    # Digit-only words are flagged as numeric
    if word.isdigit():
        numeric_flag = "NUM"
    else:
        numeric_flag = "NONNUM"

    return [
        "word=" + word,
        "suffix=" + word[-3:],
        "prev_word=" + left_neighbour,
        "next_word=" + right_neighbour,
        "is_numeric=" + numeric_flag,
    ]

# Featurize every token of every training sequence
train_features = [
    [token_features(tokens, tags, i) for i in range(len(tokens))]
    for tokens, tags in zip(train_token_sequences, train_tag_sequences)
]

# Featurize every token of every test sequence
test_features = [
    [token_features(tokens, tags, i) for i in range(len(tokens))]
    for tokens, tags in zip(test_token_sequences, test_tag_sequences)
]

# Features of the first 3 tokens of the first training sequence
first_tokens = train_token_sequences[0]
first_tags = train_tag_sequences[0]
first_sequence_features = [
    token_features(first_tokens, first_tags, index) for index in (0, 1, 2)
]

# Print them, numbering the tokens from 1
print("Features for the first 3 tokens of the first sequence in train:")
for offset in range(len(first_sequence_features)):
    print(f"Token {offset + 1} features: {first_sequence_features[offset]}")

Features for the first 3 tokens of the first sequence in train:
Token 1 features: ['word=identification', 'suffix=ion', 'prev_word=BOS', 'next_word=of', 'is_numeric=NONNUM']
Token 2 features: ['word=of', 'suffix=of', 'prev_word=identification', 'next_word=apc2', 'is_numeric=NONNUM']
Token 3 features: ['word=apc2', 'suffix=pc2', 'prev_word=of', 'next_word=,', 'is_numeric=NONNUM']


#### PROBLEM 4 – Training a CRF model

In [6]:
import pycrfsuite
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

# Per-tag classification report computed over flattened tag sequences
def bio_classification_report(y_true, y_pred):
    """Build a classification report comparing gold and predicted tags.

    Both inputs are flattened to one label per token and binarized with a
    LabelBinarizer fitted on the gold labels. The report rows are ordered
    by the part of the tag after the first '-', then by the part before it.

    Args:
        y_true: Iterable of gold tag sequences.
        y_pred: Iterable of predicted tag sequences, aligned with y_true.

    Returns:
        The value returned by sklearn's classification_report, called with
        zero_division=0.
    """
    flat_true = [tag for sequence in y_true for tag in sequence]
    flat_pred = [tag for sequence in y_pred for tag in sequence]

    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform(flat_true)
    pred_matrix = binarizer.transform(flat_pred)

    def entity_then_prefix(tag):
        # "B-Disease" -> ["Disease", "B"]; "O" -> ["O"]
        return tag.split('-', 1)[::-1]

    ordered_tags = sorted(set(binarizer.classes_), key=entity_then_prefix)

    # Column index of every class in the binarized matrices
    column_of = dict(zip(binarizer.classes_, range(len(binarizer.classes_))))

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=[column_of[tag] for tag in ordered_tags],
        target_names=ordered_tags,
        zero_division=0
    )

# Hand every training sequence (features plus gold tags) to a CRF trainer
trainer = pycrfsuite.Trainer(verbose=False)
for feature_seq, tag_seq in zip(train_features, train_tag_sequences):
    trainer.append(feature_seq, tag_seq)

# Training hyper-parameters
crf_params = {
    'c1': 1.0,                             # L1 regularization coefficient
    'c2': 1e-3,                            # L2 regularization coefficient
    'max_iterations': 50,                  # upper bound on training iterations
    'feature.possible_transitions': True,  # include possible transitions
}
trainer.set_params(crf_params)

# Fit the model and write it to disk
trainer.train('bio_crf.model')

# Open the model that was just written
tagger = pycrfsuite.Tagger()
tagger.open('bio_crf.model')

# Predict a tag sequence for every test sequence
predicted_tag_sequences = []
for feature_seq in test_features:
    predicted_tag_sequences.append(tagger.tag(feature_seq))

# Token-level evaluation against the gold test tags
report = bio_classification_report(test_tag_sequences, predicted_tag_sequences)
print(report)


              precision    recall  f1-score   support

   B-Disease       0.86      0.69      0.77       960
   I-Disease       0.87      0.73      0.79      1087
           O       0.98      0.99      0.99     22450

   micro avg       0.97      0.97      0.97     24497
   macro avg       0.90      0.80      0.85     24497
weighted avg       0.97      0.97      0.97     24497
 samples avg       0.97      0.97      0.97     24497



#### PROBLEM 5 - Inspecting the trained model

In [7]:
# Open the trained model from disk
tagger = pycrfsuite.Tagger()
tagger.open('bio_crf.model')

# Learned weights of the transitions between the 3 tag types
known_tags = ("B-Disease", "I-Disease", "O")
transitions = tagger.info().transitions
print("Transition weights:")
for tag_pair, weight in transitions.items():
    source_tag, target_tag = tag_pair
    if source_tag not in known_tags or target_tag not in known_tags:
        continue
    print(f"Transition from {source_tag} to {target_tag}: {weight}")

# Learned weights of every state feature whose attribute mentions "is_numeric"
state_features = tagger.info().state_features
print("\nState feature weights for 'is_numeric':")
for feature_key, weight in state_features.items():
    attribute, label = feature_key
    if "is_numeric" not in attribute:
        continue
    print(f"Feature {attribute}, Label {label}: {weight}")


Transition weights:
Transition from O to O: 2.342342
Transition from O to B-Disease: 0.240565
Transition from O to I-Disease: -7.488606
Transition from B-Disease to O: -2.062396
Transition from B-Disease to B-Disease: -5.738875
Transition from B-Disease to I-Disease: 1.754811
Transition from I-Disease to O: -1.264343
Transition from I-Disease to B-Disease: -3.704388
Transition from I-Disease to I-Disease: 2.72729

State feature weights for 'is_numeric':
Feature is_numeric=NONNUM, Label O: -0.008705
Feature is_numeric=NONNUM, Label B-Disease: -0.195014
Feature is_numeric=NONNUM, Label I-Disease: -1.257853
Feature is_numeric=NUM, Label O: 2.791767
Feature is_numeric=NUM, Label I-Disease: 0.789793


#### PROBLEM 6 – Document level performance

In [8]:
def aggregate_to_document_level(tags):
    """
    Collapse the token-level tags of one document into a single binary label.

    Args:
    tags (list of str): Tags of the document's tokens, in order.

    Returns:
    int: 1 when "B-Disease" occurs among the tags, 0 otherwise.
    """
    contains_mention = "B-Disease" in tags
    return int(contains_mention)

# Collapse gold and predicted test tag sequences into one 0/1 label per document
true_doc_labels = list(map(aggregate_to_document_level, test_tag_sequences))
predicted_doc_labels = list(map(aggregate_to_document_level, predicted_tag_sequences))

# Score the document-level labels
from sklearn.metrics import precision_score, recall_score

doc_precision = precision_score(true_doc_labels, predicted_doc_labels)
doc_recall = recall_score(true_doc_labels, predicted_doc_labels)

# Report both scores
for metric_name, metric_value in (("Precision", doc_precision), ("Recall", doc_recall)):
    print(f"Document-level {metric_name}: {metric_value}")


Document-level Precision: 0.9682875264270613
Document-level Recall: 0.849721706864564
