In [None]:
import nltk
import pandas as pd
nltk.download('popular', quiet=True)
import demoji
from wordcloud import WordCloud
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import bigrams
from nltk import FreqDist
import spacy
import string
nltk.download("wordnet", quiet=True)
nltk.download("stopwords", quiet=True)
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from collections import defaultdict
import pandas as pd
nlp = spacy.load("en_core_web_sm")


## 1. Perform necessary data preprocessing, e.g. removing punctuation and stop words, stemming, lemmatizing. You may use the outputs from previous weekly assignments. (10 points)


In [None]:
import os
import glob
def collect_data():
    text_file_pattern = "*.txt"  # You can adjust the pattern to match your file extensions
    text_files = glob.glob(os.path.join("../nhs/content", text_file_pattern))
    data = {}
    for file_path in text_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            file_name = os.path.basename(file_path)
            file_content = file.read()
            data[file_name] = file_content
    return data


In [None]:
corpus = collect_data()
text = ""
for data in corpus:
    text += " " + data


In [None]:
def remove_punctuation(text):
    # Create a translation table to remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    # Use translate method to remove punctuation
    cleaned_text = text.translate(translator)
    return cleaned_text

def remove_stop_words(text):
    nltk_stopwords = stopwords.words('english')
    spacy_stopwords = nlp.Defaults.stop_words

    stop_words = (*nltk_stopwords, *spacy_stopwords, "NHStxt")

    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    return " ".join(tokens)

def apply_lemmitization(text):
    tag_map = defaultdict(lambda : wordnet.NOUN)
    tag_map['V'] = wordnet.VERB
    tag_map['A'] = wordnet.ADJ
    tag_map['R'] = wordnet.ADJ

    lemmitizer = WordNetLemmatizer()
    lemmitized_result = ""
    tokens = word_tokenize(text)
    for token, tag in pos_tag(tokens):
        lemma = lemmitizer.lemmatize(token, tag_map[tag[0]])
        lemmitized_result = lemmitized_result + " " + lemma
    return lemmitized_result

def remove_emoji_and_smart_quotes(text):
    # replacing emojis with description
    text = demoji.replace_with_desc(text)
    #Removing smart quotes
    return text.replace(""", "\"").replace(""","\"")


In [None]:
def data_preprocessing(text):
    text = remove_emoji_and_smart_quotes(text)
    text = remove_punctuation(text)
    text = remove_stop_words(text)
    text = apply_lemmitization(text)
    return text

def apply_data_preprocessing_to_corpus(corpus):
    new_corpus = {}
    for idx, key in enumerate(corpus.keys()):
        new_corpus[key] = data_preprocessing(corpus[key])
        print(f"idx: {idx}")
    return new_corpus


In [None]:
processed_text = data_preprocessing(text)
with open('week11_1.txt', 'w') as file:
    file.write(f'{processed_text}')


In [None]:
data = []
for filename in corpus.keys():
    data.append({
        "label": filename.removesuffix("  NHS.txt"),
        "data": data_preprocessing(corpus[filename])
    })
df = pd.DataFrame(data)

In [None]:
data = []
import textwrap
indexes = {k: v for v, k in enumerate(sorted(corpus.keys()))}
for filename in corpus.keys():
    text = data_preprocessing(corpus[filename])
    lines = textwrap.wrap(text, 100, break_long_words=False)
    for line in lines:
        data.append({
            "label": filename.removesuffix("  NHS.txt"),
            "data": line,
            "idx": indexes[filename]
        })
df = pd.DataFrame(data)

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [None]:
dataset.push_to_hub("shireesh-uop/nhs_classification")

In [None]:
!pip -q install datasets
!pip -q install transformers[torch]
!pip -q install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h

### 2. For the binary classification problem you came up previously, build your own model by combining BERT with a classifier.  (30 points)


In [None]:
from datasets import load_dataset
dataset = load_dataset("shireesh-uop/nhs_classification")

In [None]:
df=dataset["train"].to_pandas()
df

In [None]:
indexes = {}
for i in df.label.unique():
  indexes[i] = df[df.label == i][:1]["idx"].item()

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
tokenizer("Attention is all you need")

{'input_ids': [101, 1335, 5208, 2116, 1110, 1155, 1128, 1444, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
def tokenize_data(example):
    return tokenizer(example['data'], padding='max_length', truncation=True,
                     return_tensors="pt")

def transform_labels(label):
    label = label['label']
    num = label["idx"]
    return {'labels': num}


dataset = dataset.map(tokenize_data, batched=True)

remove_columns = ['label']
dataset = dataset.remove_columns(remove_columns)


In [None]:
from transformers import TrainingArguments
import accelerate
# Batch size per GPU for training
per_device_train_batch_size = 20

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

training_args = TrainingArguments("test_trainer",
                                  num_train_epochs=30,
                                  hub_strategy="checkpoint",
                                  save_steps=2000,
                                  per_device_train_batch_size=per_device_train_batch_size)
device_map = {"": 0}

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("shireesh-uop/nhs_classification", num_labels=978)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig
config = AutoConfig.from_pretrained("bert-base-cased", num_labels=978, device_map=device_map)
model_untrained = AutoModelForSequenceClassification.from_config(config)

In [None]:
dataset = dataset["train"]

In [None]:
start = int(len(dataset)*.9)
end = int(len(dataset) *0.1)
dataset = dataset.rename_column('data', 'sentence')
dataset = dataset.rename_column('idx', 'labels')

In [None]:
train_dataset = dataset.shuffle(seed=10).select(range(start))
eval_dataset = dataset.shuffle(seed=10).select(range(start, end+start))

In [None]:
from transformers import Trainer
import torch
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# move model to tpu
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [None]:
train_dataset

Dataset({
    features: ['sentence', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 24411
})

In [None]:
for i in train_dataset:
    print(i)
    break

{'sentence': 'spot close Almost spot form round slightly oval blister 1 spot appear flatterThe blister pink shiny', 'labels': 764, 'input_ids': [101, 3205, 1601, 8774, 3205, 1532, 1668, 2776, 13102, 171, 7276, 1200, 122, 3205, 2845, 3596, 2083, 1942, 4638, 171, 7276, 1200, 5325, 13388, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### 3. Train your own model by fine-tuning BERT. And save your model and use it to classify sentences (50 points)



In [106]:
trainer.train()

Step,Training Loss
500,4.4847
1000,3.9345


Step,Training Loss
500,4.4847
1000,3.9345
1500,3.3233
2000,2.8634
2500,2.6228
3000,2.1661
3500,2.0693
4000,1.8228
4500,1.6272
5000,1.479


KeyboardInterrupt: ignored

In [108]:

trainer.save_model("./model")
model.push_to_hub("shireesh-uop/nhs_classification")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shireesh-uop/nhs_classification/commit/1a8c0930c04222525402ff3fd6232a25efb60e0c', commit_message='Upload BertForSequenceClassification', commit_description='', oid='1a8c0930c04222525402ff3fd6232a25efb60e0c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
model = model.from_pretrained("./model")

In [107]:
"""
before training
{'eval_loss': 6.946279525756836,
 'eval_accuracy': 0.0003687315634218289,
 'eval_runtime': 28.4205,
 'eval_samples_per_second': 95.424,
 'eval_steps_per_second': 11.928}
 """

trainer.evaluate()

Step,Training Loss,Validation Loss,Accuracy
500,4.4847,,
1000,3.9345,,
1500,3.3233,,
2000,2.8634,,
2500,2.6228,,
3000,2.1661,,
3500,2.0693,,
4000,1.8228,,
4500,1.6272,,
5000,1.479,,


{'eval_loss': 2.526455879211426, 'eval_accuracy': 0.5339233038348082}

### 4. Summarize what you have learned and discovered from Task 1-3. (10 points)


- I see that thee model is converging towards higher accuracy but needs more training time.
- this would have been faster with SVM or ANN, but they will have a limit on how accurate they can be.

So, models need to be chosen based on the use case.



