# Data Preprocessing

In [1]:
# Download the NLTK resources used later in this notebook:
#   - 'stopwords'  -> stopwords.words("english")
#   - 'wordnet'    -> WordNetLemmatizer
#   - 'punkt_tab'  -> sent_tokenize
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
# NOTE(review): uuid, os and PyPDF2 are not referenced in any cell shown in
# this notebook, and the recorded output for this cell ends with
# "ModuleNotFoundError: No module named 'PyPDF2'". Install PyPDF2 or confirm
# these imports are still needed so the cell runs cleanly on a fresh kernel.
import uuid 
import os
import PyPDF2

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


ModuleNotFoundError: No module named 'PyPDF2'

In [2]:
# Import libraries
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the whole raw corpus into memory as one UTF-8 string.
with open("paragraphs.txt", encoding="utf-8") as src:
    text = src.read()

# Echo the raw text so the reader can see what is being processed.
print(text)

A significant subset of natural language data includes documents that span thousands of tokens.
The ability to process such long sequences is critical for many NLP tasks including document classification, summarization, multi-hop, and opendomain question answering, and document-level or
multi-document relationship extraction and coreference resolution. These tasks have important
practical applications in domains such as scientific
document understanding and the digital humanities (Ammar et al., 2018; Cohan et al., 2018; Kocisky et al. ´ , 2018; Lo et al., 2020; Wang et al.,
2020a). Yet, scaling state-of-the-art models to
long sequences is challenging as many models are
designed and tested for shorter sequences. One
notable example is transformer models (Vaswani
et al., 2017) that have O(N2
) computational cost in
the sequence length N, making them prohibitively
expensive to run for many long sequence tasks.
This is reflected in many widely-used models such
as RoBERTa and BERT where the

In [4]:
# Split the raw text into sentences with NLTK's sentence tokenizer.
sentences = sent_tokenize(text)

# One row per sentence, in a single "Sentences" column.
df = pd.DataFrame({"Sentences": sentences})
df.head()

Unnamed: 0,Sentences
0,A significant subset of natural language data ...
1,The ability to process such long sequences is ...
2,These tasks have important\npractical applicat...
3,"´ , 2018; Lo et al., 2020; Wang et al.,\n2020a)."
4,"Yet, scaling state-of-the-art models to\nlong ..."


### Text Preprocessing

##### Text Cleaning

In [5]:
# Text Cleaning Function
def clean_text(text):
    """Normalize a piece of text for downstream tokenization.

    Steps, in order:
      1. lowercase the text;
      2. replace every punctuation character (including underscores) with a
         space;
      3. delete all digit runs;
      4. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Convert into lowercase
    text = text.lower()

    # Remove punctuation. `\w` also matches "_", so `[^\w\s]` alone would
    # leave underscores in the text; match them explicitly.
    text = re.sub(r'[^\w\s]|_', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra blanks (also cleans up the spaces introduced above)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [6]:
# Clean every sentence in place; clean_text is applied element-wise.
df["Sentences"] = df["Sentences"].apply(clean_text)
df.head()

Unnamed: 0,Sentences
0,a significant subset of natural language data ...
1,the ability to process such long sequences is ...
2,these tasks have important practical applicati...
3,lo et al wang et al a
4,yet scaling state of the art models to long se...


##### Tokenization with Stopword Removal

In [7]:
# English stopword list, held as a set for fast membership tests
stop_words = {*stopwords.words("english")}

# Shared Treebank word tokenizer instance
tokenizer = TreebankWordTokenizer()

def tokenize_tokens(text):
    """Tokenize `text` and drop English stopwords.

    Uses the module-level `tokenizer` to split the text and the module-level
    `stop_words` set to filter; the stopword check is case-insensitive.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens that are not stopwords, in original order.
    """
    kept = []
    for token in tokenizer.tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Store the stopword-filtered token list for each cleaned sentence.
df["Sentences_Tokenize"] = df["Sentences"].map(tokenize_tokens)

df.head()


Unnamed: 0,Sentences,Sentences_Tokenize
0,a significant subset of natural language data ...,"[significant, subset, natural, language, data,..."
1,the ability to process such long sequences is ...,"[ability, process, long, sequences, critical, ..."
2,these tasks have important practical applicati...,"[tasks, important, practical, applications, do..."
3,lo et al wang et al a,"[lo, et, al, wang, et, al]"
4,yet scaling state of the art models to long se...,"[yet, scaling, state, art, models, long, seque..."


##### Stemming and Lemmatization

In [8]:
# Stemming
def stemming_tokens(tokens):
    """Return the Porter stem of every token.

    Args:
        tokens: An iterable of word strings.

    Returns:
        A list with one stemmed string per input token, in the same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

# Stem each sentence's token list into a new column.
df["Stemming_Sentences"] = df["Sentences_Tokenize"].map(stemming_tokens)
df.head()

Unnamed: 0,Sentences,Sentences_Tokenize,Stemming_Sentences
0,a significant subset of natural language data ...,"[significant, subset, natural, language, data,...","[signific, subset, natur, languag, data, inclu..."
1,the ability to process such long sequences is ...,"[ability, process, long, sequences, critical, ...","[abil, process, long, sequenc, critic, mani, n..."
2,these tasks have important practical applicati...,"[tasks, important, practical, applications, do...","[task, import, practic, applic, domain, scient..."
3,lo et al wang et al a,"[lo, et, al, wang, et, al]","[lo, et, al, wang, et, al]"
4,yet scaling state of the art models to long se...,"[yet, scaling, state, art, models, long, seque...","[yet, scale, state, art, model, long, sequenc,..."


In [9]:
# Lemmatization
def lemmatizing_tokens(tokens, pos='v'):
    """Lemmatize every token with NLTK's WordNet lemmatizer.

    Args:
        tokens: An iterable of word strings.
        pos: Part-of-speech tag passed to `WordNetLemmatizer.lemmatize`.
            Defaults to 'v' (verb), which matches the previous hard-coded
            behaviour. With 'v', words that are only noun plurals are left
            unchanged (e.g. "applications" stays as-is in this notebook's
            output); pass 'n' to lemmatize as nouns instead.

    Returns:
        A list with one lemmatized string per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos=pos) for word in tokens]

# Lemmatize each sentence's token list into a new column.
df["Lemmatized_Sentences"] = df["Sentences_Tokenize"].map(lemmatizing_tokens)
df.head()


Unnamed: 0,Sentences,Sentences_Tokenize,Stemming_Sentences,Lemmatized_Sentences
0,a significant subset of natural language data ...,"[significant, subset, natural, language, data,...","[signific, subset, natur, languag, data, inclu...","[significant, subset, natural, language, data,..."
1,the ability to process such long sequences is ...,"[ability, process, long, sequences, critical, ...","[abil, process, long, sequenc, critic, mani, n...","[ability, process, long, sequence, critical, m..."
2,these tasks have important practical applicati...,"[tasks, important, practical, applications, do...","[task, import, practic, applic, domain, scient...","[task, important, practical, applications, dom..."
3,lo et al wang et al a,"[lo, et, al, wang, et, al]","[lo, et, al, wang, et, al]","[lo, et, al, wang, et, al]"
4,yet scaling state of the art models to long se...,"[yet, scaling, state, art, models, long, seque...","[yet, scale, state, art, model, long, sequenc,...","[yet, scale, state, art, model, long, sequence..."


### Feature Engineering

##### Term Frequency-Inverse Document Frequency (TF-IDF)
Best suited for text clustering and document classification.

In [10]:
# TfidfVectorizer works on strings, so rebuild one space-joined string per
# sentence from its lemmatized tokens.
df["Lemmatized_Sentences_String"] = df["Lemmatized_Sentences"].map(" ".join)

# Fit TF-IDF on the lemmatized sentences (English stopwords removed,
# vocabulary capped at 5000 terms).
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df["Lemmatized_Sentences_String"])

# Vocabulary terms, in the same order as the matrix columns
feature_names = vectorizer.get_feature_names_out()

# Dense DataFrame view: one row per sentence, one column per term
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display a sample of the transformed data
tfidf_df.head()

Unnamed: 0,ability,al,ammar,answer,applications,art,bert,challenge,classification,cohan,...,task,test,thousands,tokens,transformer,understand,use,vaswani,wang,widely
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.335549,0.278535,0.0,0.0,0.0,0.0,0.0,0.0
1,0.197618,0.0,0.0,0.197618,0.0,0.0,0.0,0.0,0.197618,0.0,...,0.140216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.463739,0.217862,0.0,0.217862,0.0,0.0,0.0,0.0,0.217862,...,0.15458,0.0,0.0,0.0,0.0,0.217862,0.0,0.0,0.0,0.0
3,0.0,0.578008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407317,0.0
4,0.0,0.0,0.0,0.0,0.0,0.301032,0.0,0.301032,0.0,0.0,...,0.0,0.301032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Persist the TF-IDF matrix (without the row index) for later use.
tfidf_df.to_csv(path_or_buf="tfidf_words_lemmatized.csv", index=False)

print("TF-IDF words saved to 'tfidf_words_lemmatized.csv'")

TF-IDF words saved to 'tfidf_words_lemmatized.csv'


##### Word Embeddings
Best for finding similar words and performing NLP tasks

In [12]:
# Train Word2Vec on the lemmatized token lists (one list per sentence).
# Training is stochastic: a fixed seed plus a single worker thread keeps the
# vectors (and the similarity scores shown below) repeatable across
# Restart & Run All. Full determinism across interpreter launches may also
# require setting PYTHONHASHSEED; see the gensim Word2Vec documentation.
word2vec_model = Word2Vec(
    sentences=df["Lemmatized_Sentences"].tolist(),
    vector_size=100,
    window=5,
    min_count=2,  # words seen fewer than 2 times are dropped from the vocabulary
    workers=1,
    seed=42,
)

# Save the model
word2vec_model.save("word2vec_model.model")

In [13]:
# Query the trained model for the nearest neighbours of a few probe words.
example_text = ["art", "model"]

for word in example_text:
    if word not in word2vec_model.wv:
        # The word was absent from the trained vocabulary.
        print(f"'{word}' not found in vocabulary.")
    else:
        neighbours = word2vec_model.wv.most_similar(word, topn=5)
        print(f"Most similar words to '{word}':")
        for neighbour, score in neighbours:
            print(f"  {neighbour} ({score:.2f})")
    print("\n")

'art' not found in vocabulary.


Most similar words to 'model':
  sequence (0.17)
  include (0.15)
  many (0.14)
  task (0.03)
  et (0.00)




### Save to a CSV file for future use

In [14]:
# Write the full processed DataFrame (without the row index) to disk.
# NOTE(review): the token-list columns will be written as their string
# representation; confirm downstream readers parse them back as needed.
output_file = "processed_data.csv"
df.to_csv(path_or_buf=output_file, index=False)
print(f"Processed data saved to {output_file}")

Processed data saved to processed_data.csv
