In [1]:
import pandas as pd       
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import nltk

In [2]:
# Fetch every NLTK resource this notebook relies on so that it runs on a fresh
# environment (Restart Kernel -> Run All), not only on a machine that already
# has them cached. Resources that are already present are skipped by NLTK.
NLTK_RESOURCES = (
    'vader_lexicon',  # originally downloaded here; kept for the (commented-out) VADER import
    'punkt',          # sentence tokenizer models used by sent_tokenize
    'punkt_tab',      # Punkt data format expected by newer NLTK releases
    'wordnet',        # dictionary behind WordNetLemmatizer
    'omw-1.4',        # needed alongside wordnet by some NLTK releases
    'stopwords',      # stop-word lists used in the stop-word removal step
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/cka/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Sample input for the pre-processing steps below: a Merck press release
# announcing the completed acquisition of Harpoon Therapeutics.
text = 'Merck (NYSE: MRK), known as MSD outside of the United States and Canada, today announced the completion of the acquisition of Harpoon Therapeutics, Inc. (Nasdaq: HARP). Harpoon is now a wholly-owned subsidiary of Merck, and Harpoon’s common stock will no longer be publicly traded or listed on the Nasdaq Stock Market. Harpoon’s lead candidate, MK-6070 (formerly known as HPN328), is a T-cell engager targeting delta-like ligand 3 (DLL3), an inhibitory canonical Notch ligand that is expressed at high levels in small cell lung cancer (SCLC) and neuroendocrine tumors. The safety, tolerability and pharmacokinetics of MK-6070 is currently being evaluated as monotherapy in a Phase 1/2 clinical trial (NCT04471727) in certain patients with advanced cancers associated with expression of DLL3. The study is also evaluating MK-6070 in combination with atezolizumab in certain patients with SCLC. In March 2022, the U.S. Food and Drug Administration (FDA) granted Orphan Drug Designation to MK-6070 for the treatment of SCLC.'
print(text)

Merck (NYSE: MRK), known as MSD outside of the United States and Canada, today announced the completion of the acquisition of Harpoon Therapeutics, Inc. (Nasdaq: HARP). Harpoon is now a wholly-owned subsidiary of Merck, and Harpoon’s common stock will no longer be publicly traded or listed on the Nasdaq Stock Market. Harpoon’s lead candidate, MK-6070 (formerly known as HPN328), is a T-cell engager targeting delta-like ligand 3 (DLL3), an inhibitory canonical Notch ligand that is expressed at high levels in small cell lung cancer (SCLC) and neuroendocrine tumors. The safety, tolerability and pharmacokinetics of MK-6070 is currently being evaluated as monotherapy in a Phase 1/2 clinical trial (NCT04471727) in certain patients with advanced cancers associated with expression of DLL3. The study is also evaluating MK-6070 in combination with atezolizumab in certain patients with SCLC. In March 2022, the U.S. Food and Drug Administration (FDA) granted Orphan Drug Designation to MK-6070 for t

# Pre-Processing 

- Remove special characters
- Tokenizing
- Lemmatizing/Stemming
- Stop word removal

In [4]:
# Split the lower-cased text into sentences.
# NOTE: lower-casing discards capitalisation, which may hurt named-entity
# recognition (NER); run NER on the original `text` if it is needed.
headlines = sent_tokenize(text.lower())

# Show the sentences: a bare last expression is what produces the cell output.
headlines

['merck (nyse: mrk), known as msd outside of the united states and canada, today announced the completion of the acquisition of harpoon therapeutics, inc. (nasdaq: harp).',
 'harpoon is now a wholly-owned subsidiary of merck, and harpoon’s common stock will no longer be publicly traded or listed on the nasdaq stock market.',
 'harpoon’s lead candidate, mk-6070 (formerly known as hpn328), is a t-cell engager targeting delta-like ligand 3 (dll3), an inhibitory canonical notch ligand that is expressed at high levels in small cell lung cancer (sclc) and neuroendocrine tumors.',
 'the safety, tolerability and pharmacokinetics of mk-6070 is currently being evaluated as monotherapy in a phase 1/2 clinical trial (nct04471727) in certain patients with advanced cancers associated with expression of dll3.',
 'the study is also evaluating mk-6070 in combination with atezolizumab in certain patients with sclc.',
 'in march 2022, the u.s. food and drug administration (fda) granted orphan drug design

In [6]:
# Word tokenizer (disabled). `word_tokenize` is not imported in the imports
# cell; add `from nltk.tokenize import word_tokenize` before enabling this line.
# word_tokenize(text.lower())

## Lemmatizing & Stemming

**Lemmatizing** and **stemming** are two ways of reducing words to a common base form so that different forms of the same word can be treated as one.

When we "**lemmatize**" data, we take words and attempt to return their *lemma*, or the base/dictionary form of a word.

In [7]:
# Instantiate the RegExp tokenizer.
# The pattern is a raw string so the regex escapes reach the regex engine
# untouched; in a normal string literal they are invalid Python escape
# sequences and trigger a warning. Alternatives are tried left to right:
#   \w+        a run of word characters
#   \$[\d\.]+  a dollar sign followed by digits and dots
#   \S+        any other run of non-whitespace characters
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

# "Run" the tokenizer on the lower-cased text.
text_tokens = tokenizer.tokenize(text.lower())

In [8]:
# Instantiate the WordNet lemmatizer once; the same instance is applied to
# every token in the lemmatizing step.
lemmatizer = WordNetLemmatizer()

In [9]:
# Lemmatize every token, preserving order (one lemma per input token).
# No part-of-speech tag is passed, so the lemmatizer's default is used;
# NOTE(review): that is presumably why 'as' is reduced to 'a' - confirm.
tokens_lem = list(map(lemmatizer.lemmatize, text_tokens))

In [11]:
# Compare each token with its lemmatized form and show only the pairs that
# differ. The two lists are walked in parallel with zip instead of by index.
# (Use list(zip(text_tokens, tokens_lem)) to see every pair.)
[(token, lemma) for token, lemma in zip(text_tokens, tokens_lem) if token != lemma]

[('as', 'a'),
 ('states', 'state'),
 ('therapeutics', 'therapeutic'),
 ('as', 'a'),
 ('levels', 'level'),
 ('tumors', 'tumor'),
 ('as', 'a'),
 ('patients', 'patient'),
 ('cancers', 'cancer'),
 ('patients', 'patient')]

## Stop Word Removal

In [12]:
# Remove stopwords from "text_tokens."
# The stop-word list is fetched once and stored as a set: calling
# stopwords.words('english') inside the comprehension would fetch the whole
# list again for every token, and a set makes each membership test cheap.
english_stop_words = set(stopwords.words('english'))
no_stop_words = [token for token in text_tokens if token not in english_stop_words]

In [13]:
# Inspect the tokens that remain after stop-word removal.
print(no_stop_words)

['merck', '(nyse:', 'mrk', '),', 'known', 'msd', 'outside', 'united', 'states', 'canada', ',', 'today', 'announced', 'completion', 'acquisition', 'harpoon', 'therapeutics', ',', 'inc', '.', '(nasdaq:', 'harp', ').', 'harpoon', 'wholly', '-owned', 'subsidiary', 'merck', ',', 'harpoon', '’s', 'common', 'stock', 'longer', 'publicly', 'traded', 'listed', 'nasdaq', 'stock', 'market', '.', 'harpoon', '’s', 'lead', 'candidate', ',', 'mk', '-6070', '(formerly', 'known', 'hpn328', '),', '-cell', 'engager', 'targeting', 'delta', '-like', 'ligand', '3', '(dll3),', 'inhibitory', 'canonical', 'notch', 'ligand', 'expressed', 'high', 'levels', 'small', 'cell', 'lung', 'cancer', '(sclc)', 'neuroendocrine', 'tumors', '.', 'safety', ',', 'tolerability', 'pharmacokinetics', 'mk', '-6070', 'currently', 'evaluated', 'monotherapy', 'phase', '1', '/2', 'clinical', 'trial', '(nct04471727)', 'certain', 'patients', 'advanced', 'cancers', 'associated', 'expression', 'dll3', '.', 'study', 'also', 'evaluating', 'm