In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# spaCy supplies the tokenizer and the language pipeline used by the cells below.
import spacy
from spacy.lang.en import English  # NOTE(review): not referenced in the visible cells; confirm before removing
# Load the pipeline named "en_core_web_sm" once and keep it in `model`;
# presumably the package has to be installed beforehand (verify in the environment).
model = spacy.load("en_core_web_sm")

In [None]:
# Split a piece of text into tokens with spaCy
from spacy.tokenizer import Tokenizer

# The loaded pipeline exposes its tokenizer; keep a handle to it for the cells below.
tokenizer_spacy = model.tokenizer

text = "Last week, the University of Cambridge shared its own research that shows if everyone wears a mask outside home,dreaded ‘second wave’ of the pandemic can be avoided."
tokens = tokenizer_spacy(text)

# Show the tokens one per line.
print("\n".join(token.text for token in tokens))

Last
week
,
the
University
of
Cambridge
shared
its
own
research
that
shows
if
everyone
wears
a
mask
outside
home
,
dreaded
‘
second
wave
’
of
the
pandemic
can
be
avoided
.


In [None]:
# Break a text document into its sentences
text="""The outbreak of coronavirus disease 2019 (COVID-19) has created a global health crisis that has had a deep impact on the way we perceive our world and our everyday lives. Not only the rate of contagion and patterns of transmission threatens our sense of agency, but the safety measures put in place to contain the spread of the virus also require social distancing by refraining from doing what is inherently human, which is to find solace in the company of others. Within this context of physical threat, social and physical distancing, as well as public alarm, what has been (and can be) the role of the different mass media channels in our lives on individual, social and societal levels? Mass media have long been recognized as powerful forces shaping how we experience the world and ourselves. This recognition is accompanied by a growing volume of research, that closely follows the footsteps of technological transformations (e.g. radio, movies, television, the internet, mobiles) and the zeitgeist (e.g. cold war, 9/11, climate change) in an attempt to map mass media major impacts on how we perceive ourselves, both as individuals and citizens. Are media (broadcast and digital) still able to convey a sense of unity reaching large audiences, or are messages lost in the noisy crowd of mass self-communication? """

# Run the whole pipeline (not just the tokenizer) and read the sentence spans
# from `doc.sents`, printing one sentence per line.
doc = model(text)
print("\n".join(sent.text for sent in doc.sents))

The outbreak of coronavirus disease 2019 (COVID-19) has created a global health crisis that has had a deep impact on the way we perceive our world and our everyday lives.
Not only the rate of contagion and patterns of transmission threatens our sense of agency, but the safety measures put in place to contain the spread of the virus also require social distancing by refraining from doing what is inherently human, which is to find solace in the company of others.
Within this context of physical threat, social and physical distancing, as well as public alarm, what has been (and can be)
the role of the different mass media channels in our lives on individual, social and societal levels?
Mass media have long been recognized as powerful forces shaping how we experience the world and ourselves.
This recognition is accompanied by a growing volume of research, that closely follows the footsteps of technological transformations (e.g. radio, movies, television, the internet, mobiles) and the zeit

In [None]:
# Tokenize a text with a pretrained tokenizer from the `transformers` package

from transformers import AutoTokenizer

# Tokenizer belonging to the 'bert-base-uncased' base model
tokenizer_transformers = AutoTokenizer.from_pretrained('bert-base-uncased')
text="I love spring season. I go hiking with my friends"

# Step 1: text -> tokens; step 2: tokens -> vocabulary ids
tokens = tokenizer_transformers.tokenize(text)
ids = tokenizer_transformers.convert_tokens_to_ids(tokens)

# ids on the first output line, tokens on the second
print(ids, tokens, sep='\n')

[1045, 2293, 3500, 2161, 1012, 1045, 2175, 13039, 2007, 2026, 2814]
['i', 'love', 'spring', 'season', '.', 'i', 'go', 'hiking', 'with', 'my', 'friends']


In [None]:
# To tokenize text and drop stop words and punctuation from the result
import string
text = "Walter was feeling anxious. He was diagnosed today. He probably is the best person I know."

# English stop-word list from NLTK, stored as a set for fast membership tests
# (presumably the NLTK `stopwords` corpus must already be downloaded).
stop_words = set(stopwords.words('english'))
# Individual punctuation characters, also as a set.
punctuation = set(string.punctuation)

tokens = tokenizer_spacy(text)

# The stop-word list is lowercase, so the token is lowercased for the lookup;
# a case-sensitive comparison lets capitalised stop words such as "He" or "I"
# slip through. A token is treated as punctuation when every one of its
# characters is a punctuation mark (`tok in string.punctuation` would be a
# substring test and miss runs such as "...").
tokens = [
    token.text
    for token in tokens
    if token.text.lower() not in stop_words
    and not all(ch in punctuation for ch in token.text)
]
print(tokens)

['Walter', 'feeling', 'anxious', 'He', 'diagnosed', 'today', 'He', 'probably', 'best', 'person', 'I', 'know']


In [None]:
# To remove stop words in a text

text = """the outbreak of coronavirus disease 2019 (COVID-19) has created a global health crisis that has had a deep impact on the way we perceive our world and our everyday lives. Not only the rate of contagion and patterns of transmission threatens our sense of agency, but the safety measures put in place to contain the spread of the virus also require social distancing by refraining from doing what is inherently human, which is to find solace in the company of others. Within this context of physical threat, social and physical distancing, as well as public alarm, what has been (and can be) the role of the different mass media channels in our lives on individual, social and societal levels? Mass media have long been recognized as powerful forces shaping how we experience the world and ourselves. This recognition is accompanied by a growing volume of research, that closely follows the footsteps of technological transformations (e.g. radio, movies, television, the internet, mobiles) and the zeitgeist (e.g. cold war, 9/11, climate change) in an attempt to map mass media major impacts on how we perceive ourselves, both as individuals and citizens. Are media (broadcast and digital) still able to convey a sense of unity reaching large audiences, or are messages lost in the noisy crowd of mass self-communication? """

tokens = tokenizer_spacy(text)

# `stop_words` (the NLTK English list built in an earlier cell) contains
# lowercase entries, so each token is lowercased for the lookup. Comparing the
# raw token would keep sentence-initial stop words such as "Not", "This", "Are".
new_text = ' '.join(
    token.text for token in tokens if token.text.lower() not in stop_words
)
print(new_text)

outbreak coronavirus disease 2019 ( COVID-19 ) created global health crisis deep impact way perceive world everyday lives . Not rate contagion patterns transmission threatens sense agency , safety measures put place contain spread virus also require social distancing refraining inherently human , find solace company others . Within context physical threat , social physical distancing , well public alarm , ( ) role different mass media channels lives individual , social societal levels ? Mass media long recognized powerful forces shaping experience world . This recognition accompanied growing volume research , closely follows footsteps technological transformations ( e.g. radio , movies , television , internet , mobiles ) zeitgeist ( e.g. cold war , 9/11 , climate change ) attempt map mass media major impacts perceive , individuals citizens . Are media ( broadcast digital ) still able convey sense unity reaching large audiences , messages lost noisy crowd mass self - communication ?


In [None]:
# To filter with custom stop words in addition to the default list: "NIL" and "JUNK"

# Use a dedicated name here: assigning to `stopwords` would rebind the
# `nltk.corpus.stopwords` import and break every later `stopwords.words(...)` call
# (for example when the cell that builds `stop_words` is re-run).
# The ' ' entry drops stand-alone whitespace tokens (presumably the one produced
# by the leading space of the text).
custom_stop_words = {'NIL', 'JUNK', ' '}

text = " Jonas was a JUNK great guy NIL Adam was evil NIL Martha JUNK was more of a fool "
tokens = tokenizer_spacy(text)

# Custom markers are matched exactly as written; the default list is lowercase,
# so the token is lowercased before that lookup.
filtered_text = ' '.join(
    token.text
    for token in tokens
    if token.text not in custom_stop_words
    and token.text.lower() not in stop_words
)

print(filtered_text)

Jonas great guy Adam evil Martha fool


In [None]:
# To remove punctuation
import string

text = "The match has concluded !!! India has won the match . Will we fin the finals too ? !"

# Individual punctuation characters as a set for per-character membership tests.
punctuation = set(string.punctuation)

tokens = tokenizer_spacy(text)

# Drop a token when *every* character in it is punctuation. The former
# `tok not in string.punctuation` was a substring test against the punctuation
# string, so multi-character tokens such as "..." or "!?" were kept.
filtered = ' '.join(
    token.text
    for token in tokens
    if not all(ch in punctuation for ch in token.text)
)
print(filtered)


The match has concluded India has won the match Will we fin the finals too


In [None]:
# Stemming: reduce every token of the given text to its stem
# using the Porter stemmer shipped with NLTK

from nltk.stem import PorterStemmer
porter = PorterStemmer()

text = "Dancing is an art. Students should be taught dance as a subject in schools . I danced in many of my school function. Some people are always hesitating to dance."

tokens = tokenizer_spacy(text)

# Stem token by token, then glue the stems back together with single spaces.
stems = [porter.stem(token.text) for token in tokens]
filtered = ' '.join(stems)
print(filtered)

danc is an art . student should be taught danc as a subject in school . I danc in mani of my school function . some peopl are alway hesit to danc .


In [None]:
# To lemmatize a given text

text= "Dancing is an art. Students should be taught dance as a subject in schools . I danced in many of my school function. Some people are always hesitating to dance."

# Lemmas are assigned by pipeline components that run after tokenization, so the
# text is passed through the full pipeline (`model`) instead of the bare
# tokenizer; with spaCy v3 a tokenizer-only Doc has empty `lemma_` values.
tokens = model(text)

# token.lemma: The lemma (hash)
# token.lemma_: The lemma (text)

print(' '.join(token.lemma_ for token in tokens))

Dancing be a art . Students should be teach dance a a subject in school . I dance in many of my school function . Some people be always hesitate to dance .


In [None]:
# Pull the username (the part in front of '@') out of every e-mail address in a text

text= "The new registrations are potter709@gmail.com , elixir101@gmail.com. If you find any disruptions, kindly contact granger111@gamil.com or severus77@gamil.com "

tokens = tokenizer_spacy(text)

# Any token containing '@' is treated as an address; keep what precedes the first '@'.
usernames = [token.text.split('@')[0] for token in tokens if '@' in token.text]

print(usernames)

['potter709', 'elixir101', 'granger111', 'severus77']


In [None]:
# To find the most common words in the text excluding stopwords
from collections import Counter
import string

text="""Junkfood - Food that do no good to our body. And there's no need of them in our body but still we willingly eat them because they are great in taste and easy to cook or ready to eat. Junk foods have no or very less nutritional value and irrespective of the way they are marketed, they are not healthy to consume.The only reason of their gaining popularity and increased trend of consumption is 
that they are ready to eat or easy to cook foods. People, of all age groups are moving towards Junkfood as it is hassle free and often ready to grab and eat. Cold drinks, chips, noodles, pizza, burgers, French fries etc. are few examples from the great variety of junk food available in the market.
 Junkfood is the most dangerous food ever but it is pleasure in eating and it gives a great taste in mouth examples of Junkfood are kurkure and chips.. cold rings are also source of junk food... they shud nt be ate in high amounts as it results fatal to our body... it cn be eated in a limited extend ... in research its found tht ths junk foods r very dangerous fr our health
Junkfood is very harmful that is slowly eating away the health of the present generation. The term itself denotes how dangerous it is for our bodies. Most importantly, it tastes so good that people consume it on a daily basis. However, not much awareness is spread about the harmful effects of Junkfood .
The problem is more serious than you think. Various studies show that Junkfood impacts our health negatively. They contain higher levels of calories, fats, and sugar. On the contrary, they have very low amounts of healthy nutrients and lack dietary fibers. Parents must discourage their children from consuming junk food because of the ill effects it has on one’s health.
Junkfood is the easiest way to gain unhealthy weight. The amount of fats and sugar in the food makes you gain weight rapidly. However, this is not a healthy weight. It is more of fats and cholesterol which will have a harmful impact on your health. Junk food is also one of the main reasons for the increase in obesity nowadays.
This food only looks and tastes good, other than that, it has no positive points. The amount of calorie your body requires to stay fit is not fulfilled by this food. For instance, foods like French fries, burgers, candy, and cookies, all have high amounts of sugar and fats. Therefore, this can result in long-term illnesses like diabetes and high blood pressure. This may also result in kidney failure."""

punctuation = set(string.punctuation)

tokens = tokenizer_spacy(text)
# Keep real words only:
#   * skip whitespace-only tokens (the newlines inside the text become tokens),
#   * skip stop words, compared in lowercase because the list is lowercase
#     (otherwise "The", "And", ... are counted),
#   * skip tokens made up entirely of punctuation, including runs such as "...".
tokens = [
    token.text
    for token in tokens
    if token.text.strip()
    and token.text.lower() not in stop_words
    and not all(ch in punctuation for ch in token.text)
]

# Counter is a dict subclass, so `counts[word]` keeps working as before;
# most_common() returns the entries ordered from most to least frequent.
counts = Counter(tokens)
print(counts.most_common(10))

{'Junkfood': 8, 'Food': 1, 'good': 3, 'body': 4, 'And': 1, "'s": 1, 'need': 1, 'still': 1, 'willingly': 1, 'eat': 4, 'great': 3, 'taste': 2, 'easy': 2, 'cook': 2, 'ready': 3, 'Junk': 2, 'foods': 4, 'less': 1, 'nutritional': 1, 'value': 1, 'irrespective': 1, 'way': 2, 'marketed': 1, 'healthy': 3, 'consume': 2, 'The': 5, 'reason': 1, 'gaining': 1, 'popularity': 1, 'increased': 1, 'trend': 1, 'consumption': 1, '\n': 5, 'People': 1, 'age': 1, 'groups': 1, 'moving': 1, 'towards': 1, 'hassle': 1, 'free': 1, 'often': 1, 'grab': 1, 'Cold': 1, 'drinks': 1, 'chips': 2, 'noodles': 1, 'pizza': 1, 'burgers': 2, 'French': 2, 'fries': 2, 'etc': 1, 'examples': 2, 'variety': 1, 'junk': 4, 'food': 8, 'available': 1, 'market': 1, '\n ': 1, 'dangerous': 3, 'ever': 1, 'pleasure': 1, 'eating': 2, 'gives': 1, 'mouth': 1, 'kurkure': 1, '..': 1, 'cold': 1, 'rings': 1, 'also': 3, 'source': 1, '...': 3, 'shud': 1, 'nt': 1, 'ate': 1, 'high': 3, 'amounts': 3, 'results': 1, 'fatal': 1, 'cn': 1, 'eated': 1, 'limited

In [None]:
# Correct the spelling mistakes of a text with TextBlob

from textblob import TextBlob

text="He is a gret person. He beleives in bod"

# Wrap the raw string in a TextBlob and print its corrected version.
blb = TextBlob(text)
corrected = blb.correct()
print(corrected)

He is a great person. He believes in god


In [None]:
# To tokenize tweets
text = " Having lots of fun #goa #vaction #summervacation. Fancy dinner @Beachbay restro :) "

import re
import string

# Remove the '#' marker of each hashtag but keep the tag word (`#goa` -> `goa`).
# The previous `re.sub()` call had no arguments and raised a TypeError.
cleaned = re.sub(r'#(\w+)', r'\1', text)
tokens = tokenizer_spacy(cleaned)

# Keep every token except whitespace-only tokens and single punctuation
# characters; multi-character tokens such as the mention '@Beachbay' or the
# emoticon ':)' are kept, as the recorded output expects.
punctuation = set(string.punctuation)
tokenized = [
    token.text
    for token in tokens
    if token.text.strip() and token.text not in punctuation
]
print(tokenized)

['Having', 'lots', 'of', 'fun', 'goa', 'vaction', 'summervacation', 'Fancy', 'dinner', '@Beachbay', 'restro', ':)']
