In [46]:
import numpy as np
from itertools import product
from collections import Counter
import tensorflow as tf

# Define the Indexer class
class Indexer:
    def __init__(self, labels):
        self.objs_to_ints = {label: i for i, label in enumerate(labels)}
        self.ints_to_objs = labels

    def get_index(self, obj, add=True):
        if obj not in self.objs_to_ints:
            if add:
                self.objs_to_ints[obj] = len(self.ints_to_objs)
                self.ints_to_objs.append(obj)
            else:
                return None
        return self.objs_to_ints[obj]

    def get_object(self, idx):
        return self.ints_to_objs[idx]


# Define feature extraction functions
def extract_features(sentence):
    features_by_pair = []

    for i in range(len(sentence) - 1):
        features = []

        # Feature function for specific tag transitions (e.g., B-PER to I-PER)
        if sentence[i + 1][1][0] == 'I' and sentence[i][1][0] == 'B' and sentence[i][1][2:] == sentence[i + 1][1][2:]:
            features.append(1)
        else:
            features.append(0)

        # Feature function for current tag being the same as the previous tag
        features.append(int(sentence[i][1] == sentence[i + 1][1]))

        # Feature function for transitions between entity types
        entity_types = ['PER', 'LOC', 'ORG', 'MISC', 'O']
        tag_pairs = product(entity_types, repeat=2)
        for prev_entity, curr_entity in tag_pairs:
            if len(sentence[i][1]) > 1 and len(sentence[i + 1][1]) > 1 and sentence[i][1][2:] == prev_entity and sentence[i + 1][1][2:] == curr_entity:
                features.append(1)
            elif len(sentence[i][1]) > 1 and len(sentence[i + 1][1]) == 1 and sentence[i][1][2:] == prev_entity and sentence[i + 1][1] == curr_entity:
                features.append(1)
            elif len(sentence[i][1]) == 1 and len(sentence[i + 1][1]) > 1 and sentence[i][1] == prev_entity and sentence[i + 1][1][2:] == curr_entity:
                features.append(1)
            elif len(sentence[i][1]) == 1 and len(sentence[i + 1][1]) == 1 and sentence[i][1] == prev_entity and sentence[i + 1][1] == curr_entity:
                features.append(1)
            else:
                features.append(0)

        features_by_pair.append(features)

    return features_by_pair

# Function to read CoNLL 2003 data
def read_conll_2003(file_path):
    sentences = []
    current_sentence = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == '':
                if len(current_sentence) > 1:  # Check if sentence length is greater than 1
                    sentences.append(current_sentence)
                current_sentence = []
            else:
                parts = line.split(' ')
                word = parts[0]
                tag = parts[-1]
                current_sentence.append((word, tag))

    return sentences

# Define the LinearChainCRF class
class LinearChainCRF:
    def __init__(self, num_labels, feature_size):
        self.num_labels = num_labels
        self.feature_size = feature_size

        # Define model parameters (weights)
        self.weights = tf.Variable(tf.random.normal([1, feature_size]))

    def compute_score(self, features):
        # Calculate the unnormalized scores using TensorFlow
        scores = tf.matmul(self.weights, tf.transpose(features))
        return scores

    def forward(self, features):
        # Forward algorithm to calculate the log partition function
        scores = self.compute_score(features)
        scores = tf.transpose(scores)

        alphas = [scores[0]]
        for t in range(1, tf.shape(scores)[0]):
            alpha_t = alphas[t - 1] + tf.reduce_logsumexp(scores, axis=1)
            alphas.append(alpha_t + scores[t])

        log_partition = tf.reduce_logsumexp(alphas[-1])

        return log_partition

    def viterbi_decode(self, features):
        # Viterbi decoding to find the best sequence
        scores = self.compute_score(features)
        scores = tf.transpose(scores)

        viterbi_scores = [scores[0]]
        backpointers = []

        for t in range(1, tf.shape(scores)[0]):
            viterbi_t = viterbi_scores[t - 1] + scores
            backpointer_t = tf.argmax(viterbi_t, axis=1)
            viterbi_t = tf.reduce_max(viterbi_t, axis=1)

            viterbi_scores.append(viterbi_t)
            backpointers.append(backpointer_t)

        best_last_label = tf.argmax(viterbi_scores[-1])
        best_sequence = [best_last_label]

        for backpointers_t in reversed(backpointers):
            best_last_label = tf.gather(backpointers_t, best_last_label)
            best_sequence.insert(0, best_last_label)

        return best_sequence


    def negative_log_likelihood(self, features, labels):
        # Calculate the negative log-likelihood using TensorFlow
        scores = self.compute_score(features)

        # Flatten the scores tensor
        flat_scores = tf.reshape(scores, [-1])


        # labeled_scores = tf.gather(flat_scores, indices)

        # Calculate the log partition function using the forward algorithm
        log_partition = self.forward(features)

        log_likelihood = tf.reduce_sum(flat_scores) - log_partition
        return -log_likelihood

    # ... (other methods)




    def train_step(self, features, labels, optimizer):
        with tf.GradientTape() as tape:
            loss = self.negative_log_likelihood(features, labels)

        gradients = tape.gradient(loss, [self.weights])
        optimizer.apply_gradients(zip(gradients, [self.weights]))


        return loss


    def save_weights(self, checkpoint_path):
        # Save the model weights using tf.train.Checkpoint
        checkpoint = tf.train.Checkpoint(model=self)
        checkpoint.save(checkpoint_path)

    def load_weights(self, checkpoint_path):
        # Restore the model weights using tf.train.Checkpoint
        checkpoint = tf.train.Checkpoint(model=self)
        checkpoint.restore(checkpoint_path)

# Main function to prepare data, extract features, and train the model
def prepare_data_and_train_crf(train_file_path, num_epochs=10):
    # Read CoNLL 2003 data
    train_sentences = read_conll_2003(train_file_path)

    # Extract features for each sentence
    features_list = [extract_features(sentence) for sentence in train_sentences]

    # Convert features to TensorFlow constants
    features_tf = [tf.constant(features, dtype=tf.float32) for features in features_list]

    labels_1 = ['I-LOC', 'B-ORG', 'O', 'B-PER', 'I-PER', 'I-MISC', 'B-MISC', 'I-ORG', 'B-LOC']
    indexer = Indexer(labels_1)

    # Initialize CRF model
    num_labels = 9  # Replace with the actual number of labels
    feature_size = 27
    crf = LinearChainCRF(num_labels, feature_size)

    # Initialize optimizer
    optimizer = tf.optimizers.Adam(learning_rate=0.001)





    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0

        for features, sentence in zip(features_tf, train_sentences):

            labels = [indexer.get_index(tag) for _, tag in sentence]
            labels = tf.constant(labels, dtype=tf.int32)
            loss = crf.train_step(features, labels, optimizer)
            total_loss += loss


        print(f"Epoch {epoch + 1}, Loss: {total_loss.numpy()}")



# Example usage
train_file_path = '/content/train.txt'
prepare_data_and_train_crf(train_file_path)


Epoch 1, Loss: -311363.1875
Epoch 2, Loss: -1187892.125
Epoch 3, Loss: -2062892.0
Epoch 4, Loss: -2936997.25
Epoch 5, Loss: -3811049.0
Epoch 6, Loss: -4685089.5
Epoch 7, Loss: -5559133.0
Epoch 8, Loss: -6433126.0
Epoch 9, Loss: -7307153.0
Epoch 10, Loss: -8181192.0
