In [3]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
import pickle
import pandas as pd
from pathlib import Path

def clean_text(text):
    '''
    Normalizes raw text ahead of tokenization.

    The input is coerced with str(), so non-string values are cleaned as
    their string representation (None becomes "none"). Newlines, carriage
    returns, spaces, ASCII punctuation, and digits are each replaced by a
    single space; single quotes and backticks are removed outright, so a
    contraction stays one word ("don't" -> "dont"). Runs of spaces are
    not collapsed.

    Returns the cleaned, lower-cased string.
    '''
    # Raw string so the backslash is a literal character; in a plain string
    # a backslash followed by "]" is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    to_space = r'!"#$%&()*+, -./:;<=>?@[\]^_{|}~' + '0123456789' + '\n\r'
    to_delete = "'`"

    # One table, one pass: map every separator to a space, drop the quotes.
    table = str.maketrans(to_space, ' ' * len(to_space), to_delete)
    return str(text).translate(table).lower()

def tokenize_dataframe(df, source_col):
    '''
    Tokenizes the text in df[source_col] into lists of normalized tokens.

    Each value is passed through clean_text and split with NLTK's
    word_tokenize. Every token is lemmatized (WordNetLemmatizer) and then
    stemmed (PorterStemmer). English stop words are dropped.

    df: DataFrame holding the text column.
    source_col: name of the column to tokenize.

    Returns a list with one list of tokens per row, in row order.
    '''
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    tokenized = []
    # Only one column is needed, so iterate it directly instead of building
    # a full row Series per record with iterrows().
    for text in tqdm(df[source_col], total=len(df)):
        final = []
        for word in word_tokenize(clean_text(text)):
            # Check the raw token first: stemming can mangle a stop word
            # (e.g. "this" -> "thi") so that it no longer matches the list.
            if word in stop_words:
                continue
            normalized = porter.stem(lemmatizer.lemmatize(word))
            # Also drop tokens that only turn into a stop word once
            # normalized.
            if normalized not in stop_words:
                final.append(normalized)
        tokenized.append(final)
    return tokenized

print('Loading Dataframe...')
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source.
pickle_path = Path.cwd() / 'sp_df_tokenized.pkl'
with pickle_path.open('rb') as f:
    df = pickle.load(f)

print('Tokenizing DataFrame')
# One token list per row of the 'meta_text' column.
tokenized = tokenize_dataframe(df, 'meta_text')

# Store the token lists alongside the source text.
df['meta_tokenized'] = tokenized

  0%|          | 0/61 [00:00<?, ?it/s]

Loading Dataframe...
Tokenizing DataFrame


100%|██████████| 61/61 [00:04<00:00, 14.70it/s]


In [6]:
# Write the DataFrame back to the pickle in the current working directory,
# overwriting any existing file of that name.
with (Path.cwd() / 'sp_df_tokenized.pkl').open('wb') as f:
    pickle.dump(df, f)