# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [23]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [24]:
# Defining documents (=sentenses)
d1 = 'The sun dipped below the horizon, casting a warm glow across the tranquil lake.'
d2 = 'With a burst of laughter, friends gathered around the table, enjoying a delightful evening together.'
d3 = 'The old bookstore exuded a nostalgic charm, its shelves filled with tales of adventure and mystery.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'The sun dipped below the horizon, casting a warm glow across the tranquil lake. With a burst of laughter, friends gathered around the table, enjoying a delightful evening together. The old bookstore exuded a nostalgic charm, its shelves filled with tales of adventure and mystery.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [25]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the sun dipped below the horizon, casting a warm glow across the tranquil lake. with a burst of laughter, friends gathered around the table, enjoying a delightful evening together. the old bookstore exuded a nostalgic charm, its shelves filled with tales of adventure and mystery.'

### Removing punctuation

In [26]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the sun dipped below the horizon casting a warm glow across the tranquil lake with a burst of laughter friends gathered around the table enjoying a delightful evening together the old bookstore exuded a nostalgic charm its shelves filled with tales of adventure and mystery'

### Tokenize text & removal of stopwords

In [27]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'my', 'y', "should've", 'no', 'off', 'should', 'what', 'mustn', 'has', 'nor', 'than', 've', 'to', 'not', 'those', "doesn't", 'did', 'but', 'between', 'a', 'while', 'on', 'was', 'its', 'now', 'don', 'until', 'd', 'out', "wasn't", 'few', 't', "don't", "didn't", "won't", 'before', 'so', 'he', 'just', 'ma', 'that', 'again', 're', 'will', "wouldn't", 'hers', "you'd", 'can', 'the', 'through', "weren't", 'an', 'she', 'only', 'once', "haven't", 'when', 'any', 'yourselves', "aren't", 'shan', 'isn', 'be', 'had', 'doesn', 'you', 'her', 'themselves', 'with', 'haven', 'me', 'it', 'o', 'shouldn', 'have', 'more', 'do', 'll', "hadn't", 'above', 'wasn', 'itself', 'some', "isn't", 'i', "shouldn't", 'below', 'from', "you're", 'this', 'him', 'who', 'as', 'each', 'such', 'them', 'am', 'for', 'hadn', "mustn't", 'doing', 'over', 'during', 'why', 'how', 'their', 'because', "you'll", 'couldn', "needn't", 'won', 'your', 'they', 'in', 'if', 'up', 'needn', 'were', "shan't", 'further', 

In [28]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['sun', 'dipped', 'horizon', 'casting', 'warm', 'glow', 'across', 'tranquil', 'lake', 'burst', 'laughter', 'friends', 'gathered', 'around', 'table', 'enjoying', 'delightful', 'evening', 'together', 'old', 'bookstore', 'exuded', 'nostalgic', 'charm', 'shelves', 'filled', 'tales', 'adventure', 'mystery']

### Lemmatization

In [29]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['sun', 'dipped', 'horizon', 'casting', 'warm', 'glow', 'across', 'tranquil', 'lake', 'burst', 'laughter', 'friends', 'gathered', 'around', 'table', 'enjoying', 'delightful', 'evening', 'together', 'old', 'bookstore', 'exuded', 'nostalgic', 'charm', 'shelves', 'filled', 'tales', 'adventure', 'mystery'] 

After lemmatization:
['sun', 'dip', 'horizon', 'cast', 'warm', 'glow', 'across', 'tranquil', 'lake', 'burst', 'laughter', 'friends', 'gather', 'around', 'table', 'enjoy', 'delightful', 'even', 'together', 'old', 'bookstore', 'exude', 'nostalgic', 'charm', 'shelve', 'fill', 'tales', 'adventure', 'mystery']

## Redefine the text corpus (pre-processed)

In [30]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['sun dip horizon cast warm glow across tranquil lake burst laughter', 
          'friends gather around table enjoy delightful even together old bookstore', 
          'exude nostalgic charm shelve fill tales adventure mystery']

## Document-term matrix with ngram_range=(1,1)

In [31]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   across  adventure  around  bookstore  burst  cast  charm  delightful  dip  \
0       1          0       0          0      1     1      0           0    1   
1       0          0       1          1      0     0      0           1    0   
2       0          1       0          0      0     0      1           0    0   

   enjoy  ...  mystery  nostalgic  old  shelve  sun  table  tales  together  \
0      0  ...        0          0    0       0    1      0      0         0   
1      1  ...        0          0    1       0    0      1      0         1   
2      0  ...        1          1    0       1    0      0      1         0   

   tranquil  warm  
0         1     1  
1         0     0  
2         0     0  

[3 rows x 29 columns]


## Document-term matrix with ngram_range=(2,2)

In [32]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   across tranquil  adventure mystery  around table  burst laughter  \
0                1                  0             0               1   
1                0                  0             1               0   
2                0                  1             0               0   

   cast warm  charm shelve  delightful even  dip horizon  enjoy delightful  \
0          1             0                0            1                 0   
1          0             0                1            0                 1   
2          0             1                0            0                 0   

   even together  ...  lake burst  nostalgic charm  old bookstore  \
0              0  ...           1                0              0   
1              1  ...           0                0              1   
2              0  ...           0                1              0   

   shelve fill  sun dip  table enjoy  tales adventure  together old  \
0            0        1          

## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [33]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 29 

The words in the corpus: 
 {'tales', 'glow', 'fill', 'charm', 'adventure', 'dip', 'horizon', 'burst', 'delightful', 'old', 'shelve', 'mystery', 'nostalgic', 'lake', 'gather', 'cast', 'sun', 'around', 'laughter', 'even', 'exude', 'warm', 'friends', 'table', 'bookstore', 'together', 'tranquil', 'enjoy', 'across'}

Term Frequency (TF):
   tales    glow   fill  charm  adventure     dip  horizon   burst  \
0  0.000  0.0909  0.000  0.000      0.000  0.0909   0.0909  0.0909   
1  0.000  0.0000  0.000  0.000      0.000  0.0000   0.0000  0.0000   
2  0.125  0.0000  0.125  0.125      0.125  0.0000   0.0000  0.0000   

   delightful  old  ...  even  exude    warm  friends  table  bookstore  \
0         0.0  0.0  ...   0.0  0.000  0.0909      0.0    0.0        0.0   
1         0.1  0.1  ...   0.1  0.000  0.0000      0.1    0.1        0.1   
2         0.0  0.0  ...   0.0  0.125  0.0000      0.0    0.0        0.0   

   together  tranquil  enjoy  across  
0       

### Inverse Document Frequency (IDF)

In [34]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
          tales:     0.4771
           glow:     0.4771
           fill:     0.4771
          charm:     0.4771
      adventure:     0.4771
            dip:     0.4771
        horizon:     0.4771
          burst:     0.4771
     delightful:     0.4771
            old:     0.4771
         shelve:     0.4771
        mystery:     0.4771
      nostalgic:     0.4771
           lake:     0.4771
         gather:     0.4771
           cast:     0.4771
            sun:     0.4771
         around:     0.4771
       laughter:     0.4771
           even:     0.4771
          exude:     0.4771
           warm:     0.4771
        friends:     0.4771
          table:     0.4771
      bookstore:     0.4771
       together:     0.4771
       tranquil:     0.4771
          enjoy:     0.4771
         across:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [35]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
    tales    glow    fill   charm  adventure     dip  horizon   burst  \
0  0.0000  0.0434  0.0000  0.0000     0.0000  0.0434   0.0434  0.0434   
1  0.0000  0.0000  0.0000  0.0000     0.0000  0.0000   0.0000  0.0000   
2  0.0596  0.0000  0.0596  0.0596     0.0596  0.0000   0.0000  0.0000   

   delightful     old  ...    even   exude    warm  friends   table  \
0      0.0000  0.0000  ...  0.0000  0.0000  0.0434   0.0000  0.0000   
1      0.0477  0.0477  ...  0.0477  0.0000  0.0000   0.0477  0.0477   
2      0.0000  0.0000  ...  0.0000  0.0596  0.0000   0.0000  0.0000   

   bookstore  together  tranquil   enjoy  across  
0     0.0000    0.0000    0.0434  0.0000  0.0434  
1     0.0477    0.0477    0.0000  0.0477  0.0000  
2     0.0000    0.0000    0.0000  0.0000  0.0000  

[3 rows x 29 columns]


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [38]:
text = '''The gentle rustle of leaves overhead provided a soothing backdrop to the couple's quiet conversation in the peaceful park.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('The', 'DT', 'B-NP'),
 ('gentle', 'JJ', 'I-NP'),
 ('rustle', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('leaves', 'NNS', 'O'),
 ('overhead', 'VBP', 'O'),
 ('provided', 'VBD', 'O'),
 ('a', 'DT', 'B-NP'),
 ('soothing', 'JJ', 'I-NP'),
 ('backdrop', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'B-NP'),
 ('couple', 'NN', 'I-NP'),
 ("'s", 'POS', 'O'),
 ('quiet', 'JJ', 'B-NP'),
 ('conversation', 'NN', 'I-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('peaceful', 'JJ', 'I-NP'),
 ('park', 'NN', 'I-NP'),
 ('.', '.', 'O')]


### breafly explanation of 5 POS tags

('The', 'DT', 'B-NP'): 'The' is a determiner (DT) and is the beginning (B) of a noun phrase (NP). <br>
('gentle', 'JJ', 'I-NP'): 'gentle' is an adjective (JJ) and is inside (I) a noun phrase (NP). <br>
('rustle', 'NN', 'I-NP'): 'rustle' is a noun (NN) and is inside (I) a noun phrase (NP). <br>
('of', 'IN', 'O'): 'of' is a preposition (IN) and is outside (O) any named entity. <br>
('leaves', 'NNS', 'O'): 'leaves' is a plural noun (NNS) and is outside (O) any named entity. <br>
('overhead', 'VBP', 'O'): 'overhead' is a verb (VBP) and is outside (O) any named entity.

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [37]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.2.0-1016-azure
Datetime: 2023-11-28 07:34:31
Python Version: 3.10.13
-----------------------------------
