In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
doc = nlp("this is happening in 1998 !!! :)") #contains lexical entities (token) with properties of each
print(doc)

this is happening in 1998 !!! :)


In [3]:
for token in doc:
    print(token.idx, ' | ',token.text, '--> ', token.pos_, '--- stop?', token.is_stop, '--- lemma:', token.lemma_)

0  |  this -->  PRON --- stop? True --- lemma: this
5  |  is -->  AUX --- stop? True --- lemma: be
8  |  happening -->  VERB --- stop? False --- lemma: happen
18  |  in -->  ADP --- stop? True --- lemma: in
21  |  1998 -->  NUM --- stop? False --- lemma: 1998
26  |  ! -->  PUNCT --- stop? False --- lemma: !
27  |  ! -->  PUNCT --- stop? False --- lemma: !
28  |  ! -->  PUNCT --- stop? False --- lemma: !
30  |  :) -->  PUNCT --- stop? False --- lemma: :)


In [4]:
complete_text = (
     "Gus Proto is a Python developer currently"
     " working for a London-based Fintech company. He is"
     " interested in learning Natural Language Processing."
     " There is a developer conference happening on 21 July"
     ' 2019 in London. It is titled "Applications of Natural'
     ' Language Processing". There is a helpline number'
     " available at +44-1234567891. Gus is helping organize it."
     " He keeps organizing local Python meetups and several"
     " internal talks at his workplace. Gus is also presenting"
     ' a talk. The talk will introduce the reader about "Use'
     ' cases of Natural Language Processing in Fintech".'
     " Apart from his work, he is very passionate about music."
     " Gus is learning to play the Piano. He has enrolled"
     " himself in the weekend batch of Great Piano Academy."
     " Great Piano Academy is situated in Mayfair or the City"
     " of London and has world-class piano instructors."
)

In [5]:
complete_doc = nlp(complete_text)
print(complete_doc)

Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing. There is a developer conference happening on 21 July 2019 in London. It is titled "Applications of Natural Language Processing". There is a helpline number available at +44-1234567891. Gus is helping organize it. He keeps organizing local Python meetups and several internal talks at his workplace. Gus is also presenting a talk. The talk will introduce the reader about "Use cases of Natural Language Processing in Fintech". Apart from his work, he is very passionate about music. Gus is learning to play the Piano. He has enrolled himself in the weekend batch of Great Piano Academy. Great Piano Academy is situated in Mayfair or the City of London and has world-class piano instructors.


In [6]:
#Sentences detection
sentences = list(complete_doc.sents)
print("There are",len(sentences),"sentences")
#for sentence in sentences:
#     print(f"{sentence[:5]}...")

There are 13 sentences


# Lower casing

In [7]:
def lowercase_sentences(text):
    doc = nlp(text)

    modified_text = ""

    for sent in doc.sents:
        modified_text += sent.text.lower()+ " " 

    return modified_text


In [8]:

complete_text = lowercase_sentences(complete_text)
print(complete_text)

gus proto is a python developer currently working for a london-based fintech company. he is interested in learning natural language processing. there is a developer conference happening on 21 july 2019 in london. it is titled "applications of natural language processing". there is a helpline number available at +44-1234567891. gus is helping organize it. he keeps organizing local python meetups and several internal talks at his workplace. gus is also presenting a talk. the talk will introduce the reader about "use cases of natural language processing in fintech". apart from his work, he is very passionate about music. gus is learning to play the piano. he has enrolled himself in the weekend batch of great piano academy. great piano academy is situated in mayfair or the city of london and has world-class piano instructors. 


# Removal of Punctuations

In [9]:
def remove_punctuation(text):
    doc = nlp(text)

    modified_tokens = [token.text for token in doc if not token.is_punct]

    modified_text = " ".join(modified_tokens)

    return modified_text

In [10]:
complete_text = remove_punctuation(complete_text)
print(complete_text)

gus proto is a python developer currently working for a london based fintech company he is interested in learning natural language processing there is a developer conference happening on 21 july 2019 in london it is titled applications of natural language processing there is a helpline number available at +44 1234567891 gus is helping organize it he keeps organizing local python meetups and several internal talks at his workplace gus is also presenting a talk the talk will introduce the reader about use cases of natural language processing in fintech apart from his work he is very passionate about music gus is learning to play the piano he has enrolled himself in the weekend batch of great piano academy great piano academy is situated in mayfair or the city of london and has world class piano instructors


# Removal of Stopwords

In [11]:
def remove_stopwords(text):
    doc = nlp(text)

    modified_tokens = [token.text for token in doc if not token.is_stop]

    modified_text = " ".join(modified_tokens)

    return modified_text

In [12]:
complete_text = remove_stopwords(complete_text)
print(complete_text)

gus proto python developer currently working london based fintech company interested learning natural language processing developer conference happening 21 july 2019 london titled applications natural language processing helpline number available +44 1234567891 gus helping organize keeps organizing local python meetups internal talks workplace gus presenting talk talk introduce reader use cases natural language processing fintech apart work passionate music gus learning play piano enrolled weekend batch great piano academy great piano academy situated mayfair city london world class piano instructors


# Removal of Frequent words

In [13]:
from collections import Counter
def remove_frequent_words(text, num_most_common=10):
    # Parse the input text using spaCy
    doc = nlp(text)

    words = [token.text for token in doc]
    word_freq = Counter(words)
    print(word_freq)

    most_common_words = [word for word, freq in word_freq.most_common(num_most_common)]

    modified_tokens = [token.text for token in doc if token.text not in most_common_words]

    modified_text = " ".join(modified_tokens)

    return modified_text, most_common_words

In [14]:
complete_text, most_common_words = remove_frequent_words(complete_text)
print(most_common_words)
print(complete_text)

Counter({'gus': 4, 'piano': 4, 'london': 3, 'natural': 3, 'language': 3, 'processing': 3, 'python': 2, 'developer': 2, 'fintech': 2, 'learning': 2, 'talk': 2, 'great': 2, 'academy': 2, 'proto': 1, 'currently': 1, 'working': 1, 'based': 1, 'company': 1, 'interested': 1, 'conference': 1, 'happening': 1, '21': 1, 'july': 1, '2019': 1, 'titled': 1, 'applications': 1, 'helpline': 1, 'number': 1, 'available': 1, '+44': 1, '1234567891': 1, 'helping': 1, 'organize': 1, 'keeps': 1, 'organizing': 1, 'local': 1, 'meetups': 1, 'internal': 1, 'talks': 1, 'workplace': 1, 'presenting': 1, 'introduce': 1, 'reader': 1, 'use': 1, 'cases': 1, 'apart': 1, 'work': 1, 'passionate': 1, 'music': 1, 'play': 1, 'enrolled': 1, 'weekend': 1, 'batch': 1, 'situated': 1, 'mayfair': 1, 'city': 1, 'world': 1, 'class': 1, 'instructors': 1})
['gus', 'piano', 'london', 'natural', 'language', 'processing', 'python', 'developer', 'fintech', 'learning']
proto currently working based company interested conference happening 2

# Removal of Rare words

In [15]:
def remove_rare_words(text, min_frequency=2):
    doc = nlp(text)

    words = [token.text for token in doc]
    word_freq = Counter(words)

    rare_words = [word for word, freq in word_freq.items() if freq < min_frequency]

    modified_tokens = [token.text for token in doc if token.text not in rare_words]

    modified_text = " ".join(modified_tokens)

    return modified_text, rare_words

In [16]:
complete_text, rare_words = remove_frequent_words(complete_text)
print(rare_words)
print(complete_text)

Counter({'talk': 2, 'great': 2, 'academy': 2, 'proto': 1, 'currently': 1, 'working': 1, 'based': 1, 'company': 1, 'interested': 1, 'conference': 1, 'happening': 1, '21': 1, 'july': 1, '2019': 1, 'titled': 1, 'applications': 1, 'helpline': 1, 'number': 1, 'available': 1, '+44': 1, '1234567891': 1, 'helping': 1, 'organize': 1, 'keeps': 1, 'organizing': 1, 'local': 1, 'meetups': 1, 'internal': 1, 'talks': 1, 'workplace': 1, 'presenting': 1, 'introduce': 1, 'reader': 1, 'use': 1, 'cases': 1, 'apart': 1, 'work': 1, 'passionate': 1, 'music': 1, 'play': 1, 'enrolled': 1, 'weekend': 1, 'batch': 1, 'situated': 1, 'mayfair': 1, 'city': 1, 'world': 1, 'class': 1, 'instructors': 1})
['talk', 'great', 'academy', 'proto', 'currently', 'working', 'based', 'company', 'interested', 'conference']
happening 21 july 2019 titled applications helpline number available +44 1234567891 helping organize keeps organizing local meetups internal talks workplace presenting introduce reader use cases apart work pass

# Stemming

In [17]:
import nltk
from nltk.stem import PorterStemmer
def stem_text(text):
    doc = nlp(text)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token.text) for token in doc]

    stemmed_text = " ".join(stemmed_tokens)

    return stemmed_text


In [18]:
complete_text = stem_text(complete_text)

# Print the stemmed text
print(complete_text)

happen 21 juli 2019 titl applic helplin number avail +44 1234567891 help organ keep organ local meetup intern talk workplac present introduc reader use case apart work passion music play enrol weekend batch situat mayfair citi world class instructor


# Lemmatization

In [19]:
def lemmatize_text(text):
    doc = nlp(text)

    lemmatized_tokens = [token.lemma_ for token in doc]
    lemmatized_text = " ".join(lemmatized_tokens)

    return lemmatized_text

In [20]:
lemmatized_text = lemmatize_text(complete_text)

# Print the lemmatized text
print(lemmatized_text)

happen 21 juli 2019 titl applic helplin number avail +44 1234567891 help organ keep organ local meetup intern talk workplac present introduc reader use case apart work passion music play enrol weekend batch situat mayfair citi world class instructor
