# Data Transformation Script

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Fetch the NLTK resources used later in this notebook; packages that are already
# installed are reported as up to date.
nltk.download('punkt')  # Tokenizer models (nltk.word_tokenize is called below)
nltk.download('stopwords')  # Stopword lists read through nltk.corpus.stopwords
nltk.download('wordnet')  # WordNet data for WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\638658\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\638658\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\638658\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Path to the input HDF5 file, relative to the notebook's working directory.
data_path = '../data/SQuAD_Cleaned_DF.h5'

In [4]:
# Load the DataFrame stored under the 'df' key of the HDF5 file.
df = pd.read_hdf(data_path, 'df')

## Convert all text to lower case so that stopword removal, lemmatization, and stemming work consistently

In [5]:
# Lowercase every text column in place.
for column in ('question', 'context', 'text'):
    df[column] = df[column].str.lower()

## Lemmatize, Stem, and Remove Stopwords from the question, context, and text columns

In [6]:
def lemmatize_stem_remove_stopwords(text):
    """
    Tokenize text, drop English stopwords, then lemmatize and stem what is left.

    The stopword set, lemmatizer, and stemmer are built on the first call and
    reused afterwards, so applying this function to every row of a DataFrame
    column does not rebuild them for each row.

    Args:
        text: A string containing the text to process. Stopwords are compared
            against the tokens exactly as tokenized, so the text should be
            lowercased beforehand.

    Returns:
        A list of stemmed tokens in their original order. Punctuation tokens
        produced by the tokenizer are kept.
    """
    # Build the shared resources once and cache them on the function object.
    resources = getattr(lemmatize_stem_remove_stopwords, '_resources', None)
    if resources is None:
        resources = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            PorterStemmer(),
        )
        lemmatize_stem_remove_stopwords._resources = resources
    stop_words, lemmatizer, ps = resources

    tokens = nltk.word_tokenize(text)  # tokenize
    # Stopwords are filtered on the raw token, before lemmatizing and stemming.
    return [ps.stem(lemmatizer.lemmatize(token))
            for token in tokens
            if token not in stop_words]

In [7]:
# Replace each text column with its list of processed tokens.
# NOTE: the columns are overwritten in place, so run this cell once per fresh load of df.
for column in ('question', 'context', 'text'):
    df[column] = df[column].apply(lemmatize_stem_remove_stopwords)

## Check that the first context cell was lemmatized and stemmed and had its stopwords removed

In [8]:
# Positional lookup: show the first row's processed context tokens, whatever the index labels are.
df['context'].iloc[0]

['architectur',
 ',',
 'school',
 'cathol',
 'charact',
 '.',
 'atop',
 'main',
 'build',
 "'s",
 'gold',
 'dome',
 'golden',
 'statu',
 'virgin',
 'mari',
 '.',
 'immedi',
 'front',
 'main',
 'build',
 'face',
 ',',
 'copper',
 'statu',
 'christ',
 'arm',
 'uprais',
 'legend',
 '``',
 'venit',
 'ad',
 'omn',
 "''",
 '.',
 'next',
 'main',
 'build',
 'basilica',
 'sacr',
 'heart',
 '.',
 'immedi',
 'behind',
 'basilica',
 'grotto',
 ',',
 'marian',
 'place',
 'prayer',
 'reflect',
 '.',
 'replica',
 'grotto',
 'lourd',
 ',',
 'franc',
 'virgin',
 'mari',
 'reputedli',
 'appear',
 'saint',
 'bernadett',
 'soubir',
 '1858.',
 'end',
 'main',
 'drive',
 '(',
 'direct',
 'line',
 'connect',
 '3',
 'statu',
 'gold',
 'dome',
 ')',
 ',',
 'simpl',
 ',',
 'modern',
 'stone',
 'statu',
 'mari',
 '.']

## Save the lemmatized, stemmed, stopword-free dataset to a new HDF5 (.h5) file

In [9]:
# Write the processed frame to a new HDF5 file under the 'df' key.
# The columns are object dtype, so PyTables pickles them and pandas emits the
# performance warning shown in this cell's output.
df.to_hdf('../data/SQuAD_Processed_DF.h5', key = 'df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['index', 'question', 'context', 'text'], dtype='object')]

  df.to_hdf('../data/SQuAD_Processed_DF.h5', key = 'df')
