In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import types
from joblib import Parallel, delayed
from concurrent.futures import ThreadPoolExecutor

nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
wn.ensure_loaded()
def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            try:
                yield val.__name__, val.__version__
                pass
            except:
                yield val.__name__
                pass
            pass
        pass
    pass

list(imports())

# Load the preprocessed reviews
file_location='../data/interim_data/04_text_mining/preprocessed/'
FPS_reviews = pd.read_csv(f'{file_location}reviews_preprocessed.csv.gz', compression="gzip",low_memory=False)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_review(review):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(str(review))
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
    lemmatized_output = ' '.join(lemmatized_tokens)
    return lemmatized_output

def parallel_lemmatize_reviews(reviews, num_workers):
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        lemmatized_reviews = list(tqdm(executor.map(lemmatize_review, reviews), total=len(reviews), desc="Lemmatizing reviews"))
    return lemmatized_reviews

# Assuming FPS_reviews is your DataFrame and it has a column 'review'
num_workers = 24
lemmatized_reviews = parallel_lemmatize_reviews(FPS_reviews['review'], num_workers)

# Updating the DataFrame with lemmatized reviews
FPS_reviews['review'] = lemmatized_reviews

Lemmatizing reviews: 100%|████████████████████████████████████████████████| 3621927/3621927 [3:55:09<00:00, 256.70it/s]


In [3]:
# saving files
# FPS_reviews["review"] = lemmatize(FPS_reviews["review"].tolist())
FPS_reviews.to_csv(f'{file_location}reviews_lemmatized.csv.gz', index=False, compression='gzip')