# NLP with Transformers
https://github.com/jamescalam/transformers/blob/main/installation.md

### Preprocessing

#### Stopwords

In [1]:
!pip install -U nltk



In [2]:
my_str = 'This is an absolutely amazing sentence that I, myself could not have come up with.'

In [3]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divanma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stop_words = stopwords.words('english')

stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [5]:
stop_words = set(stop_words)

In [6]:
my_str = my_str.lower().split()

In [7]:
my_str_no_stopwords = [ w for w in my_str if w not in stop_words ]

In [8]:
my_str_no_stopwords

print(' '.join(my_str_no_stopwords))

absolutely amazing sentence i, could come with.


#### Tokens

User-defined Tokens
- Single characters
- Parts of words
- Words
- Replacing things like @elonmust with [USER]
- Replacing things like https://123.456 with [URL]

Special Model Tokens
- [PAD] for padding to make the vector a fixed size. 512 for BERT.
- [UNK] Unknown words to the model.
- [CLS] Indicates the start of a sequence.
- [SEP] Indicated the end of a token.
- [MASK] Used when masking tokens. Typically the tokens a model would have to predict.

#### Stemming
Simplify text before usage to theior more basic form.

In [9]:
txt = 'I am amazed by how amazingly amazing you are.'

In [10]:
from nltk.stem import PorterStemmer, LancasterStemmer

In [11]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [12]:
stemmed = [(word, porter.stem(word), lancaster.stem(word)) for word in txt.split()]

In [13]:
stemmed

[('I', 'i', 'i'),
 ('am', 'am', 'am'),
 ('amazed', 'amaz', 'amaz'),
 ('by', 'by', 'by'),
 ('how', 'how', 'how'),
 ('amazingly', 'amazingli', 'amaz'),
 ('amazing', 'amaz', 'amaz'),
 ('you', 'you', 'you'),
 ('are.', 'are.', 'are.')]

#### Lemmatizations
Reduces words to real-world simpler routes.

In [14]:
import nltk

In [15]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\divanma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\divanma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [17]:
lemmatizer = WordNetLemmatizer()

In [18]:
[(word, lemmatizer.lemmatize(word, wordnet.VERB)) for word in txt.split()]

[('I', 'I'),
 ('am', 'be'),
 ('amazed', 'amaze'),
 ('by', 'by'),
 ('how', 'how'),
 ('amazingly', 'amazingly'),
 ('amazing', 'amaze'),
 ('you', 'you'),
 ('are.', 'are.')]

#### Unicode Normalization - Canonical & Compatibility Equivalence

- Composition composes canonical characters.
- Decomposition is the inverse.

In [1]:
# Canonical decomposition (NFD) - Normal Form Decomposition
import unicodedata

In [2]:
c_with_cedilla = '\u00C7'
c_with_cedilla

'Ç'

In [3]:
c_plus_cedilla = '\u0043\u0327'
c_plus_cedilla

'Ç'

In [4]:
c_with_cedilla == c_plus_cedilla

False

In [6]:
unicodedata.normalize('NFD', c_with_cedilla) == unicodedata.normalize('NFD', c_plus_cedilla)

True

In [7]:
# Canonical decomposition (NFD) followed by canonical composition (NFC) - Normal Form Composition
unicodedata.normalize('NFC', c_with_cedilla) == unicodedata.normalize('NFC', c_plus_cedilla)

True

In [10]:
# Compatibility equivelant (NFKD) - Normal Form Compatibility Decomposition.
unicodedata.normalize('NFKD', '\u210C') == 'H'

True

In [13]:
# Compatibility equivelant (NFKC) - Normal Form Compatibility Decomposition followed by Composition. This is the ultimate composition function which we will use for all normalization.
unicodedata.normalize('NFKC', '\u210B\u0327') == '\u1e28'

True