In [None]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
# Read the training CSV and keep only the columns the rest of the notebook uses.
data_df = (
    pd.read_csv("./data/input/train.csv")
    .drop(columns=['uuid', 'title', 'author', 'Keywords'])
)
print(data_df.shape)

In [None]:
def tokenize_dataframe(row):
    """Return a copy of ``row`` whose 'tokenized' entry holds the tokens of its abstract.

    Args:
        row: A record exposing an 'abstract' entry and a ``copy()`` method;
            a row Series when called through ``DataFrame.apply(..., axis=1)``.

    Returns:
        A copy of ``row`` with 'tokenized' set to the list returned by
        ``nltk.tokenize.word_tokenize``. When the abstract is not a string
        (for example a missing value), 'tokenized' is an empty list.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    row = row.copy()

    text = row['abstract']
    # A non-string abstract would make word_tokenize raise, so it gets no tokens.
    row['tokenized'] = nltk.tokenize.word_tokenize(text) if isinstance(text, str) else []
    return row

# Start every row with its own empty list under 'tokenized', then run
# tokenize_dataframe over the frame one row at a time.
data_df['tokenized'] = [[] for _ in range(len(data_df))]
data_df = data_df.apply(tokenize_dataframe, axis='columns')

In [None]:
# Single WordNetLemmatizer instance; custom_tokenize calls its lemmatize() on every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected:
    J -> adjective, V -> verb, R -> adverb. N and every other letter
    resolve to noun.
    """
    tagged = nltk.pos_tag([word])
    leading = tagged[0][1][0].upper()

    if leading == "J":
        return wordnet.ADJ
    if leading == "V":
        return wordnet.VERB
    if leading == "R":
        return wordnet.ADV
    # "N" and any unmapped tag both resolve to noun.
    return wordnet.NOUN

In [None]:
# English stop words from the NLTK corpus, held in a set; custom_tokenize
# tests each token for membership in it.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
import re

# Switches read by custom_tokenize; each one enables a single clean-up stage.
do_remove_punct = True  # delete characters that are neither word characters nor whitespace
do_lemmatize = True     # tokenize with NLTK and lemmatize every token
do_lowercase = True     # convert the text to lower case
do_remove_stop = True   # drop tokens that appear in `stopwords`

def custom_tokenize(row):
    """Return a copy of ``row`` with a cleaned token list under 'custom_tokenized'.

    The abstract goes through the stages below, each gated by a module-level
    flag:

    1. ``do_remove_punct``: delete every character that is neither a word
       character nor whitespace.
    2. Split into tokens: ``nltk.word_tokenize`` when ``do_lemmatize`` is
       set, plain whitespace splitting otherwise.
    3. ``do_lowercase``: lowercase every token.
    4. ``do_lemmatize``: lemmatize every token, using ``get_wordnet_pos``
       to pick the part of speech.
    5. ``do_remove_stop``: drop tokens found in ``stopwords``
       (compared case-insensitively).

    Args:
        row: A record exposing an 'abstract' entry and a ``copy()`` method;
            a row Series when called through ``DataFrame.apply(..., axis=1)``.

    Returns:
        A copy of ``row`` with 'custom_tokenized' set to the resulting list
        of tokens. When the abstract is not a string (for example a missing
        value), the list is empty.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    row = row.copy()

    text = row['abstract']
    if not isinstance(text, str):
        # Nothing to tokenize; re.sub / word_tokenize would raise on a non-string.
        row['custom_tokenized'] = []
        return row

    # 01 - Punctuation
    if do_remove_punct:
        # Characters are deleted, not replaced by a space, so "well-known"
        # becomes "wellknown".
        text = re.sub(r'([^\w\s])', '', text)

    # 02 - Tokenization
    tokens = nltk.word_tokenize(text) if do_lemmatize else text.split()

    # 03 - Lowercasing
    if do_lowercase:
        # Done before lemmatization: the WordNet lemmatizer matches lower-case
        # lemmas, so a capitalised token such as "Models" would otherwise come
        # back unchanged and only be lowercased afterwards.
        tokens = [t.lower() for t in tokens]

    # 04 - Lemmatization
    if do_lemmatize:
        tokens = [lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in tokens]

    # 05 - Removing stop words (i.e. grammar defining words, not adding value to main topic)
    if do_remove_stop:
        # NLTK's English stop-word list is lower-case; comparing on the
        # lowered token keeps the filter working when lowercasing is off.
        tokens = [t for t in tokens if t.lower() not in stopwords]

    row['custom_tokenized'] = tokens
    return row

# Start every row with its own empty list under 'custom_tokenized', then run
# custom_tokenize over the frame one row at a time.
data_df['custom_tokenized'] = [[] for _ in range(len(data_df))]
data_df = data_df.apply(custom_tokenize, axis='columns')

# Preview the first 20 rows.
data_df.head(20)