# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [1]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...


Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Defining documents

In [16]:
# Defining documents (=sentenses)
d1 = 'Andres Ambuehl plays for HC Davos.'
d2 = 'Denis Malgin plays for ZSC Lions.'
d3 = 'Austin Czarnik plays for SC Bern.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'Andres Ambuehl plays for HC Davos. Denis Malgin plays for ZSC Lions. Austin Czarnik plays for SC Bern.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [17]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'andres ambuehl plays for hc davos. denis malgin plays for zsc lions. austin czarnik plays for sc bern.'

### Removing punctuation

In [18]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'andres ambuehl plays for hc davos denis malgin plays for zsc lions austin czarnik plays for sc bern'

### Tokenize text & removal of stopwords

In [19]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'below', 'mustn', "isn't", 'mightn', 'himself', 'other', 'just', 'you', 'that', 'off', 'further', 'yourselves', 'each', 'at', 'hasn', "wasn't", 'such', 'wasn', 'this', 'before', 'ma', "that'll", 'ain', 'hadn', "should've", 'again', 'isn', 'nor', 'its', 'be', 'can', "you'd", 'ours', 'i', 'our', 'they', 'not', 'won', 'yourself', "hadn't", "you're", 'ourselves', 'into', 'we', 'after', "couldn't", 'm', "doesn't", 'wouldn', 's', 'he', 'her', 'haven', 'these', "she's", 'doesn', 'should', 'too', 'are', 'll', 'had', 'o', 'if', 'me', 'will', 'it', 'itself', 'which', 'from', "shouldn't", 'herself', 'd', 'above', 'when', 'once', 'whom', "shan't", 'but', 'she', 'there', 'those', 'shan', 'against', 'what', 'needn', 't', "needn't", 'only', 're', 'does', 've', 'the', 'their', 'while', "you'll", 'few', 'been', "aren't", 'to', 'his', 'over', 'between', 'all', 'until', 'in', 'a', 'weren', 'or', "wouldn't", 'doing', 'of', 'both', "hasn't", 'y', 'because', 'why', 'up', 'them', 

In [20]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['andres', 'ambuehl', 'plays', 'hc', 'davos', 'denis', 'malgin', 'plays', 'zsc', 'lions', 'austin', 'czarnik', 'plays', 'sc', 'bern']

### Lemmatization

In [21]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['andres', 'ambuehl', 'plays', 'hc', 'davos', 'denis', 'malgin', 'plays', 'zsc', 'lions', 'austin', 'czarnik', 'plays', 'sc', 'bern'] 

After lemmatization:
['andres', 'ambuehl', 'play', 'hc', 'davos', 'denis', 'malgin', 'play', 'zsc', 'lions', 'austin', 'czarnik', 'play', 'sc', 'bern']

## Redefine the text corpus (pre-processed)

In [23]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['andres ambuehl play hc davos', 
          'denis malgin play zsc lion', 
          'austin czarnik play sc bern']

## Document-term matrix with ngram_range=(1,1)

In [24]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   ambuehl  andres  austin  bern  czarnik  davos  denis  hc  lion  malgin  \
0        1       1       0     0        0      1      0   1     0       0   
1        0       0       0     0        0      0      1   0     1       1   
2        0       0       1     1        1      0      0   0     0       0   

   play  sc  zsc  
0     1   0    0  
1     1   0    1  
2     1   1    0  


## Document-term matrix with ngram_range=(2,2)

In [25]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   ambuehl play  andres ambuehl  austin czarnik  czarnik play  denis malgin  \
0             1               1               0             0             0   
1             0               0               0             0             1   
2             0               0               1             1             0   

   hc davos  malgin play  play hc  play sc  play zsc  sc bern  zsc lion  
0         1            0        1        0         0        0         0  
1         0            1        0        0         1        0         1  
2         0            0        0        1         0        1         0  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [26]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 13 

The words in the corpus: 
 {'lion', 'bern', 'sc', 'czarnik', 'ambuehl', 'austin', 'malgin', 'davos', 'andres', 'denis', 'hc', 'play', 'zsc'}

Term Frequency (TF):
   lion  bern   sc  czarnik  ambuehl  austin  malgin  davos  andres  denis  \
0   0.0   0.0  0.0      0.0      0.2     0.0     0.0    0.2     0.2    0.0   
1   0.2   0.0  0.0      0.0      0.0     0.0     0.2    0.0     0.0    0.2   
2   0.0   0.2  0.2      0.2      0.0     0.2     0.0    0.0     0.0    0.0   

    hc  play  zsc  
0  0.2   0.2  0.0  
1  0.0   0.2  0.2  
2  0.0   0.2  0.0  


### Inverse Document Frequency (IDF)

In [27]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
           lion:     0.4771
           bern:     0.4771
             sc:     0.4771
        czarnik:     0.4771
        ambuehl:     0.4771
         austin:     0.4771
         malgin:     0.4771
          davos:     0.4771
         andres:     0.4771
          denis:     0.4771
             hc:     0.4771
           play:        0.0
            zsc:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [28]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
     lion    bern      sc  czarnik  ambuehl  austin  malgin   davos  andres  \
0  0.0000  0.0000  0.0000   0.0000   0.0954  0.0000  0.0000  0.0954  0.0954   
1  0.0954  0.0000  0.0000   0.0000   0.0000  0.0000  0.0954  0.0000  0.0000   
2  0.0000  0.0954  0.0954   0.0954   0.0000  0.0954  0.0000  0.0000  0.0000   

    denis      hc  play     zsc  
0  0.0000  0.0954   0.0  0.0000  
1  0.0954  0.0000   0.0  0.0954  
2  0.0000  0.0000   0.0  0.0000  


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [29]:
text = '''Andres Ambuehl plays for HC Davos and the Swiss national team. He is the record player for both teams. In both teams he has played more games than anybody else.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Andres', 'NNS', 'O'),
 ('Ambuehl', 'NNP', 'O'),
 ('plays', 'NNS', 'O'),
 ('for', 'IN', 'O'),
 ('HC', 'NNP', 'O'),
 ('Davos', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('the', 'DT', 'B-NP'),
 ('Swiss', 'JJ', 'I-NP'),
 ('national', 'JJ', 'I-NP'),
 ('team', 'NN', 'I-NP'),
 ('.', '.', 'O'),
 ('He', 'PRP', 'O'),
 ('is', 'VBZ', 'O'),
 ('the', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('player', 'NN', 'B-NP'),
 ('for', 'IN', 'O'),
 ('both', 'DT', 'O'),
 ('teams', 'NNS', 'O'),
 ('.', '.', 'O'),
 ('In', 'IN', 'O'),
 ('both', 'DT', 'O'),
 ('teams', 'NNS', 'O'),
 ('he', 'PRP', 'O'),
 ('has', 'VBZ', 'O'),
 ('played', 'VBN', 'O'),
 ('more', 'RBR', 'O'),
 ('games', 'NNS', 'O'),
 ('than', 'IN', 'O'),
 ('anybody', 'NN', 'B-NP'),
 ('else', 'RB', 'O'),
 ('.', '.', 'O')]


NNP is a Proper Noun in singular.

NNS is a Noun in singular.

DT is a Determiner.

IN is a Preposition/Subordinating Conjunction.

PRP is a Personal Pronoun.

RBR is an Adverb in the comparative form.

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [30]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-04 12:25:10
Python Version: 3.11.10
-----------------------------------
