In [1]:
import warnings
import nltk
warnings.filterwarnings("ignore")
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import gutenberg as cg
# sentence tokeniser
from nltk.tokenize import sent_tokenize as st 
# word tokeniser
from nltk.tokenize import word_tokenize as wt 
from nltk.book import FreqDist
from collections import Counter
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.corpus import wordnet

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


This notebook develops a preprocessing pipeline for **TEXT NORMALISATION**.

**Steps of text normalisation**

- lowercase
- removal of stopwords
- remove punctuation
- remove numbers
- tokenisation
- lemmatisation/stemming

Take in raw text

In [3]:
# Sample passage (a summary of a conference presentation) that serves as the
# raw input for every normalisation experiment in the cells below.
raw_text="""During the conference, within the machine learning session, Xiu Tang, Sai Wu, Gang Chen, Ke Chen, and Lidan Shou of College of Computer Science and Technology, Zhejiang University, Zhejiang, China, gave a presentation on the “Learning to Label with Active Learning and Reinforcement Learning”.  They pointed out that it is financially training data labelling in domain-specific learning applications, which relies on the intelligence from domain experts. They proposed a learning-to-label (L2L) framework leveraging active learning and reinforcement learning to iteratively select data to label for Name Entity Recognition (NER) task. They pointed out that, neural models are built on top of few open datasets with well-defined labels, such as; ImageNet, Coco, and Wikipedia dataset, which cannot be directly applied to new domain. This means that neural model using domain-specific labels must be trained, but this is quite expensive. To address the problem of lacking labeled training data, they proposed the use of transfer learning technique which establishes a model on the source domain and transfers the knowledge to a target domain, but with an extension by exploiting active learning and reinforcement learning techniques. By use of a label approach, L2L, which consists of two models, a transfer learning model and an active learning model designed by a reinforcement learning process named T-model and A-model, respectively, to rank the data for labelling for the Name Entity Recognition (NER) application. The L2L architecture consisted of three main components which are: NER model, multi-granularity attention, and learning to rank. The proposed architecture reduced the required number of labels for training a domain-specific neural model. 
The idea of this model is to first transfer a learning model from a source domain to a target domain, and then apply the active learning to gradually improve the performance of the model using as few labeled data in the target domain as possible. Their experimental results showed that their approach is more effective than strong previous methods using heuristics and reinforcement learning. With the same number of labeled data, their approach improved the accuracy of NER by 11.91%. Moreover, this approach is superior to state-of-the-art learning- to-label method, with an improvement of accuracy by 6.49%."""
# Echo the passage so the untouched input is visible next to the later outputs.
print(raw_text)

During the conference, within the machine learning session, Xiu Tang, Sai Wu, Gang Chen, Ke Chen, and Lidan Shou of College of Computer Science and Technology, Zhejiang University, Zhejiang, China, gave a presentation on the “Learning to Label with Active Learning and Reinforcement Learning”.  They pointed out that it is financially training data labelling in domain-specific learning applications, which relies on the intelligence from domain experts. They proposed a learning-to-label (L2L) framework leveraging active learning and reinforcement learning to iteratively select data to label for Name Entity Recognition (NER) task. They pointed out that, neural models are built on top of few open datasets with well-defined labels, such as; ImageNet, Coco, and Wikipedia dataset, which cannot be directly applied to new domain. This means that neural model using domain-specific labels must be trained, but this is quite expensive. To address the problem of lacking labeled training data, they pr

In [4]:
# Explore the part-of-speech tags of the raw text: it is word-tokenised and
# tagged as-is, before any lowercasing or stop-word removal.
print(pos_tag(wt(raw_text)))

[('During', 'IN'), ('the', 'DT'), ('conference', 'NN'), (',', ','), ('within', 'IN'), ('the', 'DT'), ('machine', 'NN'), ('learning', 'NN'), ('session', 'NN'), (',', ','), ('Xiu', 'NNP'), ('Tang', 'NNP'), (',', ','), ('Sai', 'NNP'), ('Wu', 'NNP'), (',', ','), ('Gang', 'NNP'), ('Chen', 'NNP'), (',', ','), ('Ke', 'NNP'), ('Chen', 'NNP'), (',', ','), ('and', 'CC'), ('Lidan', 'NNP'), ('Shou', 'NNP'), ('of', 'IN'), ('College', 'NNP'), ('of', 'IN'), ('Computer', 'NNP'), ('Science', 'NNP'), ('and', 'CC'), ('Technology', 'NNP'), (',', ','), ('Zhejiang', 'NNP'), ('University', 'NNP'), (',', ','), ('Zhejiang', 'NNP'), (',', ','), ('China', 'NNP'), (',', ','), ('gave', 'VBD'), ('a', 'DT'), ('presentation', 'NN'), ('on', 'IN'), ('the', 'DT'), ('“', 'NN'), ('Learning', 'NNP'), ('to', 'TO'), ('Label', 'NNP'), ('with', 'IN'), ('Active', 'NNP'), ('Learning', 'NNP'), ('and', 'CC'), ('Reinforcement', 'NNP'), ('Learning', 'NNP'), ('”', 'NNP'), ('.', '.'), ('They', 'PRP'), ('pointed', 'VBD'), ('out', 'RP')

Function definitions for text normalisation

In [5]:
def wordlowercase(raw_text):
    """Return a lowercase copy of ``raw_text``.

    Args:
        raw_text: The string to convert.

    Returns:
        The result of calling ``.lower()`` on ``raw_text``.
    """
    lowered = raw_text.lower()
    return lowered

def stopwordremove2(sentence):
    """Drop English stop words from a whitespace-delimited string.

    The input is split on whitespace and each chunk is compared verbatim
    against NLTK's English stop-word list, so a chunk that carries extra
    characters (for example attached punctuation) is kept because it does
    not match exactly.

    Args:
        sentence: The text to filter.

    Returns:
        The surviving chunks, in their original order, joined by single spaces.
    """
    english_stopwords = set(stopwords.words("english"))
    kept_words = []
    for word in sentence.split():
        if word in english_stopwords:
            continue
        kept_words.append(word)
    return " ".join(kept_words)


def word_tokenise2(removed_stopwords_text):
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Args:
        removed_stopwords_text: The string to tokenise.

    Returns:
        Whatever ``word_tokenize`` (imported here as ``wt``) returns for the text.
    """
    word_tokens = wt(removed_stopwords_text)
    return word_tokens


def stemmer_porter(word_tokens):
    """Stem every token with NLTK's Porter stemmer.

    Args:
        word_tokens: Iterable of word strings.

    Returns:
        A list holding the stem of each token, in input order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, word_tokens))

'''Individual Lemmatisation'''
def lemmtiser_verb(sentence):
    """Lemmatise every token as if it were a verb.

    Args:
        sentence: Iterable of word tokens. Each item yielded by iterating
            over it is lemmatised separately.

    Returns:
        A list with the WordNet lemma of each token using POS ``"v"``,
        in input order.
    """
    lemmatiser = WordNetLemmatizer()
    verb_lemmas = []
    for token in sentence:
        verb_lemmas.append(lemmatiser.lemmatize(token, "v"))
    return verb_lemmas

def lemmtiser_adjective(sentence):
    """Lemmatise every token as if it were an adjective.

    Args:
        sentence: Iterable of word tokens. Each item yielded by iterating
            over it is lemmatised separately.

    Returns:
        A list with the WordNet lemma of each token using POS ``"a"``,
        in input order.
    """
    lemmatiser = WordNetLemmatizer()

    def as_adjective(token):
        # Force the adjective reading for every token.
        return lemmatiser.lemmatize(token, "a")

    return [as_adjective(token) for token in sentence]
def lemmtiser_noun(sentence):
    """Lemmatise every token as if it were a noun.

    Args:
        sentence: Iterable of word tokens. Each item yielded by iterating
            over it is lemmatised separately.

    Returns:
        A list with the WordNet lemma of each token using POS ``"n"``,
        in input order.
    """
    lemmatiser = WordNetLemmatizer()
    noun_lemmas = list(lemmatiser.lemmatize(token, "n") for token in sentence)
    return noun_lemmas

### Stemming approach

In [6]:
# Stemming pipeline: lowercase -> stop-word removal -> tokenisation -> Porter stemming.

# lowercased text
word_lowered = wordlowercase(raw_text)
# removal of stopwords
stop_wording = stopwordremove2(word_lowered)

# tokenisation
tokens = word_tokenise2(stop_wording)

# Stemming
stemmed = stemmer_porter(tokens)

# Frequency of occurrence of each stem
fdist3 = FreqDist(stemmed)

# Print every intermediate stage so the effect of each step can be compared.
print(f"\nLowercase text\n{word_lowered}")
print(f"\nstop word removal:\n{stop_wording}")
print(f"\nTokenised lowercase text:\n{tokens}")
print(f"\nStemmed lowercase text:\n{stemmed}")
print(f"\nStemmed lowercase sentence:\n{' '.join(stemmed)}")
# Note: the tagger is applied to the stems here, not to the original tokens.
print(f"\nPOS tagging lowercase text:\n{pos_tag(stemmed)}")
print(f"\nFrequency of occurrence: \n{fdist3.most_common()}")
# Counter serves the same function as FreqDist; kept as an optional alternative.
# print(f"\nFrequency of occurrence with counter: \n{Counter(fdist3)}")


Lowercase text
during the conference, within the machine learning session, xiu tang, sai wu, gang chen, ke chen, and lidan shou of college of computer science and technology, zhejiang university, zhejiang, china, gave a presentation on the “learning to label with active learning and reinforcement learning”.  they pointed out that it is financially training data labelling in domain-specific learning applications, which relies on the intelligence from domain experts. they proposed a learning-to-label (l2l) framework leveraging active learning and reinforcement learning to iteratively select data to label for name entity recognition (ner) task. they pointed out that, neural models are built on top of few open datasets with well-defined labels, such as; imagenet, coco, and wikipedia dataset, which cannot be directly applied to new domain. this means that neural model using domain-specific labels must be trained, but this is quite expensive. to address the problem of lacking labeled traini

### Lemmatisation approach version-1

In [7]:
# Lemmatisation-1 approach: lowercase -> stop-word removal -> tokenisation ->
# lemmatisation with one fixed part of speech applied to every token.

# lowercased text
word_lowered = wordlowercase(raw_text)

# removal of stopwords
stop_wording = stopwordremove2(word_lowered)

# tokenisation
tokens = word_tokenise2(stop_wording)

# Lemmatisation: exactly one of the three variants is active at a time;
# swap the commented lines to treat every token as an adjective or a noun.
lemma = lemmtiser_verb(tokens) #verbs
# lemma = lemmtiser_adjective(tokens) # adjectives
# lemma = lemmtiser_noun(tokens) # nouns


# Frequency of occurrence of each lemma
fdist3 = FreqDist(lemma)


# Print every intermediate stage so the effect of each step can be compared.
print(f"\nLowercase text\n{word_lowered}")
print(f"\nstop word removal:\n{stop_wording}")
print(f"\nTokenised lowercase text:\n{tokens}")
print(f"\nLemmatised lowercase text:\n{lemma}")
print(f"\nLemmatised lowercase sentence:\n{' '.join(lemma)}")
# Note: the tagger is applied to the lemmas here, not to the original tokens.
print(f"\nPOS tagging lowercase text:\n{pos_tag(lemma)}")
print(f"\nFrequency of occurrence: \n{fdist3.most_common()}")
# Optional alternative using collections.Counter on the same distribution.
# print(f"\nFrequency of occurrence with counter: \n{Counter(fdist3)}")


Lowercase text
during the conference, within the machine learning session, xiu tang, sai wu, gang chen, ke chen, and lidan shou of college of computer science and technology, zhejiang university, zhejiang, china, gave a presentation on the “learning to label with active learning and reinforcement learning”.  they pointed out that it is financially training data labelling in domain-specific learning applications, which relies on the intelligence from domain experts. they proposed a learning-to-label (l2l) framework leveraging active learning and reinforcement learning to iteratively select data to label for name entity recognition (ner) task. they pointed out that, neural models are built on top of few open datasets with well-defined labels, such as; imagenet, coco, and wikipedia dataset, which cannot be directly applied to new domain. this means that neural model using domain-specific labels must be trained, but this is quite expensive. to address the problem of lacking labeled traini

### Lemmatisation approach version-2: using POS tagging

In [11]:
# Lookup from a two-character POS-tag prefix to the WordNet part-of-speech
# constant handed to the lemmatiser. The lemmatisation-2 cell looks tags up by
# their first two characters, so each key covers every tag that starts with it.
mapped_pos = {
    # Nouns
    'NN':wordnet.NOUN,
    # Verbs
    'VB':wordnet.VERB,
    # Adjectives
    'JJ':wordnet.ADJ,
    # Adverbs
    'RB':wordnet.ADV,
}

In [12]:
# Lemmatisation-2 approach: lemmatise each token with the WordNet part of
# speech that matches its own POS tag, instead of forcing one POS on all tokens.

lemmaword = WordNetLemmatizer()
# lowercased text
word_lowered = wordlowercase(raw_text)

# removal of stopwords
stop_wording = stopwordremove2(word_lowered)

# tokenisation
tokens = word_tokenise2(stop_wording)

# tagging the tokens: a list of (token, tag) pairs
pos_tagged_tokens = pos_tag(tokens)

# Keep and lemmatise only the tokens whose tag prefix appears in mapped_pos.
# Proper nouns (NNP / NNPS) are excluded explicitly because their two-character
# prefix "NN" would otherwise match the common-noun entry.
# NOTE(review): tokens with any other tag (punctuation, numbers, prepositions,
# ...) are dropped rather than passed through unchanged - confirm this is the
# intended filtering.
normalised_sequence = [
    lemmaword.lemmatize(token, pos=mapped_pos[tag[:2]])
    for token, tag in pos_tagged_tokens
    if tag not in ("NNP", "NNPS") and tag[:2] in mapped_pos
]

# Frequency of occurrence of each lemma
fdist3 = FreqDist(normalised_sequence)


# Print every intermediate stage so the effect of each step can be compared.
print(f"\nLowercase text\n{word_lowered}")
print(f"\nstop word removal:\n{stop_wording}")
print(f"\nTokenised lowercase text:\n{tokens}")
print(f"\nTagged tokens:\n{pos_tagged_tokens}")
print(f"\nLemmatised words:\n{normalised_sequence}")
print(f"\nLemmatised lowercase sentence:\n{' '.join(normalised_sequence)}")
print(f"\nPOS tagging of lemmatised words:\n{pos_tag(normalised_sequence)}")
print(f"\nFrequency of occurrence: \n{fdist3.most_common()}")


Lowercase text
during the conference, within the machine learning session, xiu tang, sai wu, gang chen, ke chen, and lidan shou of college of computer science and technology, zhejiang university, zhejiang, china, gave a presentation on the “learning to label with active learning and reinforcement learning”.  they pointed out that it is financially training data labelling in domain-specific learning applications, which relies on the intelligence from domain experts. they proposed a learning-to-label (l2l) framework leveraging active learning and reinforcement learning to iteratively select data to label for name entity recognition (ner) task. they pointed out that, neural models are built on top of few open datasets with well-defined labels, such as; imagenet, coco, and wikipedia dataset, which cannot be directly applied to new domain. this means that neural model using domain-specific labels must be trained, but this is quite expensive. to address the problem of lacking labeled traini