### Named Entity Recognition of a Corpus
The purpose of this project is to analyze the Georgetown University Multilayer (**GUM**) Corpus, taken from this [repo](https://github.com/nluninja/nlp_datasets/tree/main/GUM), whose files contain two columns:

- token
- ner_tag

This [corpus](https://github.com/amir-zeldes/gum) contains English texts from twelve written and spoken text types:
- interviews
- news
- travel guides
- how-to guides
- academic writing
- biographies
- fiction
- online forum discussions
- spontaneous face to face conversations
- political speeches
- textbooks
- vlogs

Our goal is to correctly assign each token to one of the 23 classes of **ner_tag**, using a **BiLSTM** and a **BERT** model.

### 0. Libraries

In [2]:
# Importing
import re
import urllib
import pickle
import os

# Preprocessing
import nltk
import spacy
from spacy import displacy
from math import nan
import random
import pandas as pd
import numpy as np


# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns
import en_core_web_sm
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS


# EDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation,NMF


# Metrics
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay


# BILSTM
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# BERT
from transformers import BertTokenizerFast,BertForTokenClassification


import torch
from torch.utils.data import DataLoader
from torch.optim import SGD, Adam


from Functions.importing import load_conll_data
from Functions.preprocessing import flatten_list, display_topics, remove_seq_padding, from_encode_to_literal_labels
from Functions.model import load_glove_embedding_matrix, bilstm
from Functions.plot import plot_confusion_matrix


  from .autonotebook import tqdm as notebook_tqdm


### 1. Importing

In [None]:
# Folder holding the GUM CoNLL splits (machine-specific absolute path).
data_dir = '/Users/gabrielecola/NER_2/data'

# Training split: per-sentence tokens, per-sentence tags and the label list.
raw_train, ner_train, output_labels = load_conll_data(
    'gum-train.conll',
    dir_path=data_dir,
    only_tokens=True,
)
# Test split: the third return value is not needed and is discarded.
raw_test, ner_test, _ = load_conll_data(
    'gum-test.conll',
    dir_path=data_dir,
    only_tokens=True,
)

In [None]:
# One row per training sentence, its tokens joined by single spaces.
sentence_data = pd.DataFrame(
    {'Sentence': [' '.join(str(token) for token in tokens) for tokens in raw_train]}
)
sentence_data

### 2. Pre-processing

In [None]:
# Collapse the per-sentence lists so that each row of the frame pairs one
# token with its tag.
train_new = flatten_list(raw_train)
ner_new = flatten_list(ner_train)
ner_data = pd.DataFrame({'Word': train_new, 'Tag': ner_new})
ner_data

### 3. EDA

In [None]:
# String form of the flat token list (not referenced again in the visible cells).
t = str(train_new)
# The whole training corpus as one space-separated string.
testo = " ".join(train_new)
# WordCloud's built-in stop words plus a few frequent, low-information words.
stopwords = ['many', 'one', 'will', 'may', 'see', 'make', 'well'] + list(STOPWORDS)

# Configure the cloud and build it from the corpus text in one chain;
# generate() returns the WordCloud instance itself.
testo2 = WordCloud(
    background_color='black',
    max_words=2000,
    stopwords=stopwords,
    width=1600,
    height=800,
).generate(testo)

# Render the cloud without axes or margins.
plt.figure(figsize=(15, 15))
plt.title('WordCloud')
plt.imshow(testo2, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Number of tokens carrying each NER tag, most frequent first.
ner_data['Tag'].value_counts()

In [None]:
# Horizontal bar chart of tag frequencies, bars sorted from the most to the
# least frequent tag.
plt.figure(figsize=(6, 6))
ax = sns.countplot(
    data=ner_data,
    y='Tag',
    order=ner_data['Tag'].value_counts().index,
)
ax.set_title('The count of each of the 23 classes')
plt.show()

### 4. Topic Modelling

#### 4.1 Pre-process The dataset

In [None]:
# Each training sentence becomes one "document" (tokens joined by spaces)
# for the topic model.
data_topic = pd.DataFrame(
    {'Sentence': [' '.join(str(token) for token in tokens) for tokens in raw_train]}
)
data_topic

In [None]:
# Build the document-term matrix used by the topic model.
# CountVectorizer converts a collection of text documents into a matrix of
# token counts: one row per document (here, per sentence), one column per
# vocabulary token, each cell holding the count of that token in that document.
# English stop words are excluded from the vocabulary.
cv = CountVectorizer(stop_words='english')
dtm = cv.fit_transform(data_topic['Sentence'])
# Vocabulary terms, in the same order as the columns of `dtm`.
dtm_feature_names = cv.get_feature_names_out()

#### 4.2 LDA

Latent Dirichlet Allocation (LDA) is used for **topic modeling**: it is a generative **probabilistic** model that assumes each **document** is a mixture of **topics** and each topic is a mixture of words. \
The model learns the topics from the data and can be used to infer the topic distribution of new documents. \
Furthermore, scikit-learn's `LatentDirichletAllocation` fits the model with variational Bayes (a variational form of the Expectation-Maximization algorithm).





In [None]:
# Fit a 12-topic LDA model on the document-term matrix; random_state is
# fixed so the run is reproducible.
# NOTE(review): 12 presumably mirrors the twelve text types of the corpus — confirm.
LDA = LatentDirichletAllocation(n_components=12,random_state=42) # The number of topics is a hyperparameter that needs to be set before training the model
LDA.fit(dtm)

### 5. Feature Engineering

#### 5.1 Token Ordinal Encoding

An ordinal encoding involves mapping each unique label to an integer value.

In [None]:
# Word-level integer index. The Keras Tokenizer lower-cases tokens by
# default, and it is fitted on train and test tokens together.
token_tokenizer = Tokenizer()
token_tokenizer.fit_on_texts(raw_train + raw_test)

# Persist the fitted tokenizer to disk.
with open('/Users/gabrielecola/NER_2/tokenizer.pkl', 'wb') as file:
    pickle.dump(token_tokenizer, file)

# Sentences as lists of token ids.
train_sequences, test_sequences = (
    token_tokenizer.texts_to_sequences(split) for split in (raw_train, raw_test)
)

# Tag <-> id lookup tables; ids follow the order of `output_labels`.
tag2idx = {tag: idx for idx, tag in enumerate(output_labels)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Tag sequences as lists of tag ids.
ner_train_sequences, ner_test_sequences = (
    [[tag2idx[tag] for tag in sentence] for sentence in split]
    for split in (ner_train, ner_test)
)

# Number of distinct tokens seen while fitting the tokenizer.
vocabulary_size = len(token_tokenizer.word_counts)

In [None]:
# Sanity check on one test sentence: raw tokens, their ids, and the
# id -> word mapping for every id in the sentence.
print(raw_test[5])
print(test_sequences[5])
for token_id in test_sequences[5]:
    print(f'{token_id} : {token_tokenizer.index_word[token_id]}')

#### 5.2 Sequence Padding

In [None]:
# Every sequence is forced to exactly `max_sequence_len` items: longer
# sentences lose their tail (truncating='post'), shorter ones are padded
# at the end (padding='post').
max_sequence_len = 50

# Token ids, padded with the default fill value.
X_train, X_test = (
    pad_sequences(
        sequences,
        maxlen=max_sequence_len,
        padding='post',
        truncating='post',
    )
    for sequences in (train_sequences, test_sequences)
)

# Tag ids, padded with the id of the 'O' tag and then one-hot encoded.
Y_train, Y_test = (
    to_categorical(
        pad_sequences(
            sequences,
            maxlen=max_sequence_len,
            value=tag2idx['O'],
            padding='post',
            truncating='post',
        ),
        num_classes=len(output_labels),
        dtype='int32',
    )
    for sequences in (ner_train_sequences, ner_test_sequences)
)

#### 5.3 Train/Test Split

In [None]:
# Make sure all four splits are NumPy arrays before feeding them to the model.
X_train, Y_train, X_test, Y_test = (
    np.array(split) for split in (X_train, Y_train, X_test, Y_test)
)

In [None]:
# Shapes of the inputs and targets, train first and then test.
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

### 6. Model: BILSTM

#### 6.1 Glove: Word Embedding

In [None]:
# Switch for initialising the embedding layer from pre-trained GloVe vectors.
USE_GLOVE = True
glove_matrix = None

if USE_GLOVE:
    # 100-dimensional GloVe 6B vectors (machine-specific absolute path).
    embedding_dim = 100
    glove_embedding_path = '/Users/gabrielecola/NER_2/glove.6B/glove.6B.100d.txt'
    glove_matrix = load_glove_embedding_matrix(
        glove_embedding_path,
        token_tokenizer.word_index,
        embedding_dim,
    )

In [None]:
# Build the BiLSTM tagger.
model = bilstm(
    vocabulary_size + 1,   # vocabulary size + 1 for padding
    max_sequence_len,      # maximum length of the sequences
    drop=0.6,              # dropout rate
    hidden_cells=200,      # number of hidden cells
    n_classes=len(output_labels),
    use_glove=USE_GLOVE,
    glove_matrix=glove_matrix,
)

# Stop once the training loss has not improved for 2 consecutive epochs
# (no validation data is passed to fit, so only training loss is monitored).
es = EarlyStopping(monitor='loss', patience=2, mode="auto")
history = model.fit(
    X_train,
    Y_train,
    batch_size=10,
    epochs=100,
    verbose=2,
    callbacks=[es],
)

# NOTE(review): the '.h1' suffix looks like a typo for '.h5'; recent Keras
# releases reject unknown extensions. Confirm what loads this path before
# renaming it.
model.save('/Users/gabrielecola/NER_2/bilstm.h1')

In [None]:
# Per-epoch training curves recorded by fit().
loss = history.history['loss']
accuracy = history.history['accuracy']

# Validation curves are only present when fit() received validation data.
val_loss = history.history.get('val_loss')
val_accuracy = history.history.get('val_accuracy')

plt.figure(figsize=(10, 6))

# (values, legend label, colour, marker, optional?) in drawing order.
# Optional curves are skipped when missing or empty.
curves = (
    (loss, 'Training Loss', 'red', 'o', False),
    (val_loss, 'Validation Loss', 'orange', 'x', True),
    (accuracy, 'Training Accuracy', 'blue', 'o', False),
    (val_accuracy, 'Validation Accuracy', 'green', 'x', True),
)
for values, label, color, marker, optional in curves:
    if optional and not values:
        continue
    plt.plot(values, label=label, color=color, marker=marker)

plt.title('Training Loss and Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.legend()
plt.show()


In [None]:
# Evaluate the BiLSTM on the training and the test set and print a
# per-class classification report for each.
batch_size = 10
datasets = [('Training Set', X_train, Y_train), ('Test Set', X_test, Y_test)]
for title, X, Y in datasets:
    # Class probabilities -> predicted class id for every position.
    Y_pred_lstm = model.predict(X, batch_size=batch_size)
    Y_pred_lstm = np.argmax(Y_pred_lstm, axis=-1)
    # One-hot targets -> class id for every position.
    Y = np.argmax(Y, axis=-1)

    # Drop the padded positions from both the targets and the predictions.
    Y, Y_pred = remove_seq_padding(X, Y, Y_pred_lstm)

    # Decode the *unpadded* predictions. The previous code passed the padded
    # `Y_pred_lstm` here, discarding the result of remove_seq_padding.
    let_y_true_lstm, let_y_pred_lstm = from_encode_to_literal_labels(Y, Y_pred, idx2tag)

    # classification_report expects flat label lists, not lists of sentences.
    single_list_true_lstm = [tag for sentence in let_y_true_lstm for tag in sentence]
    single_list_pred_lstm = [tag for sentence in let_y_pred_lstm for tag in sentence]

    print(title)
    # scikit-learn's signature is (y_true, y_pred); the reversed order used
    # before swaps precision and recall in the report.
    print(classification_report(single_list_true_lstm, single_list_pred_lstm))

In [None]:
# The 23 NER tags used as confusion-matrix labels: B-/I- prefixes for the
# eleven entity types, plus 'O'.
classes= ['B-abstract','B-animal','B-event','B-object','B-organization','B-person','B-place','B-plant','B-quantity',
          'B-substance','B-time','I-abstract','I-animal','I-event','I-object','I-organization','I-person','I-place',
          'I-plant','I-quantity','I-substance','I-time','O']


# calculated on test set (the flat lists hold the values of the last
# evaluated dataset, which is the test set)
# NOTE(review): predictions are passed before true labels here; check the
# parameter order of Functions.plot.plot_confusion_matrix — if it expects
# (y_true, y_pred) the matrix is transposed.
plot_confusion_matrix(single_list_pred_lstm, single_list_true_lstm, classes)

### 7. Model: BERT

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW

In [None]:
# Pre-trained cased BERT tokenizer ("fast" implementation, which provides
# the word_ids() mapping used to align labels with sub-tokens).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

class NERDataset(Dataset):
    """Map-style dataset producing BERT inputs for token classification.

    Each item is one pre-tokenised sentence encoded to a fixed length, with
    one label id per sub-token position. Only the first sub-token of every
    word carries the word's label; all other positions (special tokens,
    padding, continuation sub-tokens) are set to -100 and are filtered out
    at evaluation time.

    Tag names are converted to ids through the module-level ``tag2idx``.

    Args:
        sentences: sequence of sentences, each a list of word strings.
        labels: sequence of tag-name lists, parallel to ``sentences``.
        tokenizer: a fast tokenizer whose encoding exposes ``word_ids()``.
        max_len: length every encoded sentence is padded/truncated to.
    """

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Encode sentence ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``, each
            a 1-D tensor of length ``max_len``.
        """
        sentence = self.sentences[idx]
        label_ids = [tag2idx[tag] for tag in self.labels[idx]]

        encoding = self.tokenizer(sentence,
                                  truncation=True,
                                  padding='max_length',
                                  max_length=self.max_len,
                                  is_split_into_words=True,
                                  return_tensors='pt')

        # Start with every position ignored.
        labels_enc = np.full(self.max_len, -100, dtype=int)

        previous_word_idx = None
        for position, word_idx in enumerate(encoding.word_ids(batch_index=0)):
            if word_idx is None:
                # Special or padding token: stays ignored.
                continue
            if word_idx != previous_word_idx:
                # First sub-token of a word. Index the labels by the word's
                # own position rather than by a running counter: a word that
                # produces no sub-tokens would otherwise shift every label
                # that follows it.
                labels_enc[position] = label_ids[word_idx]
            previous_word_idx = word_idx

        return {
            # squeeze(0) removes only the batch axis added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels_enc, dtype=torch.long)
        }

# Encoding length (in sub-tokens) and mini-batch size.
MAX_LEN = 128
BATCH_SIZE = 2

# Wrap both splits in the BERT dataset.
train_dataset = NERDataset(
    sentences=raw_train,
    labels=ner_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
test_dataset = NERDataset(
    sentences=raw_test,
    labels=ner_test,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Training batches are reshuffled every epoch; test batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# ---------------------
# Model: BERT for NER
# ---------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Cased BERT with a token-classification head, one output per NER tag.
# This rebinds `model`, which previously referred to the BiLSTM.
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(output_labels),
)
model.to(device)

# NOTE(review): the AdamW exported by `transformers` is deprecated in favour
# of `torch.optim.AdamW` — confirm the installed version still provides it.
optimizer = AdamW(model.parameters(), lr=5e-5)

# ---------------------
# Training (a single epoch, as an example)
# ---------------------
model.train()

for epoch in range(1):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move the three tensors of the batch to the training device.
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )

        # Passing `labels` makes the model return the loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch over the epoch.
    print(f'Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}')

# ---------------------
# Evaluation
# ---------------------
model.eval()

# Flat lists of gold and predicted label ids over all labelled positions.
true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Highest-scoring class at every position.
        logits = model(input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=2)

        # Keep only labelled positions (label != -100). Boolean indexing
        # walks the batch row by row, so sentence order is preserved.
        mask = labels != -100
        true_labels.extend(labels[mask].cpu().numpy())
        pred_labels.extend(preds[mask].cpu().numpy())

# =====================
# 6. Confusion Matrix & Report
# =====================
# Pin the label set to every class id. Without `labels=`, scikit-learn
# derives the classes from the data, and `target_names` / `display_labels`
# stop matching as soon as one tag is absent from both the gold and the
# predicted labels. Ids follow the order of `output_labels`.
all_label_ids = list(range(len(output_labels)))

print("Classification Report - BERT")
print(classification_report(
    true_labels,
    pred_labels,
    labels=all_label_ids,
    target_names=output_labels,
    zero_division=0,  # classes with no samples/predictions score 0 without warnings
))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(true_labels, pred_labels, labels=all_label_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=output_labels)
fig, ax = plt.subplots(figsize=(20, 20))
disp.plot(ax=ax, cmap='Blues', xticks_rotation='vertical')
plt.title('Confusion Matrix - BERT Model')
plt.show()