In [None]:
import pandas as pd

# utils
import re
from tqdm import tqdm
tqdm.pandas()

# text processing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# fetch the NLTK resources this notebook imports from
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource)
# NOTE: this rebinds `stopwords` from the NLTK corpus reader imported above
# to a plain set of English stopwords; later cells use the set.
stopwords = set(stopwords.words('english'))

# input parquet files (paths are relative to the working directory)
train_path = 'imdb/plain_text/train-00000-of-00001.parquet'
test_path = 'imdb/plain_text/test-00000-of-00001.parquet'

[nltk_data] Downloading package stopwords to /Users/geko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/geko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# read both splits, then stack them into a single frame with a fresh 0..n-1 index
df_train, df_test = (pd.read_parquet(path) for path in (train_path, test_path))

df = pd.concat(
  [df_train, df_test],
  ignore_index=True,
  sort=False,
)

In [3]:
def rm_link(text):
  """Delete URLs from `text`.

  A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
  running up to the next whitespace character. The match is removed outright,
  so whitespace surrounding the URL is left in place.
  """
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  return url_pattern.sub('', text)

def rm_html(text):
  """Delete every ``<...>`` span (an opening ``<`` up to the next ``>``) from `text`.

  Nothing is inserted in place of the removed span, so the text on either
  side of a tag ends up adjacent.
  """
  tag_pattern = re.compile(r'<[^>]+>')
  return tag_pattern.sub('', text)

def space_bt_punct(text):
  """Pad ``. , ! ? -`` with a space on each side, then squeeze whitespace runs.

  Any run of two or more whitespace characters (including the ones introduced
  by the padding) is collapsed into a single space. A single leading or
  trailing space is not stripped.
  """
  padded = re.sub(r'([.,!?-])', r' \1 ', text)
  return re.sub(r'\s{2,}', ' ', padded)

def clean_html(text):
  """Replace HTML line-break tags in `text` with a single space.

  Matches ``<br>``, ``<br/>`` and ``<br />`` in any letter case. Each tag is
  replaced by one space so the words on either side of a break are not glued
  together.

  The previous pattern ``<br|/><br|/>`` was an ungrouped alternation: it
  deleted the bare fragments ``<br``, ``/><br`` and ``/>`` wherever they
  occurred, which corrupted unrelated text (``<bracket`` -> ``acket``) and
  left a dangling ``>`` behind a plain ``<br>``.
  """
  # match only complete <br> tags; the optional whitespace and slash cover the
  # self-closing spellings
  return re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)

def rm_punct(text):
  """Replace each punctuation character in `text` with one space.

  The substitution is one-for-one, so adjacent punctuation marks yield
  adjacent spaces; no whitespace is collapsed here.
  """
  punct_pattern = re.compile(
    r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~\.\!\?\-\,]'
  )
  return punct_pattern.sub(' ', text)

# Clean the review text in place, one pass per step. rm_html runs before
# rm_punct because rm_punct blanks out the angle brackets that rm_html keys on.
df['text'] = df['text'].apply(rm_link)
df['text'] = df['text'].apply(rm_html)
df['text'] = df['text'].apply(space_bt_punct)
df['text'] = df['text'].apply(clean_html)
df['text'] = df['text'].apply(rm_punct)

# Lowercase BEFORE filtering stopwords. The stopword set is built from NLTK's
# English list, whose entries are lowercase, so filtering first let capitalised
# stopwords ("The", "This", "And") slip through and then be lowercased into the
# output.
df['text'] = df['text'].str.lower()

# split()/join() also normalises every whitespace run to a single space
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stopwords))

In [None]:
# preprocessing
def tokenize(text):
  """Split `text` into tokens with NLTK's `word_tokenize` and return the result."""
  tokens = word_tokenize(text)
  return tokens

def rm_stopwords(text):
  """Return a list of the items of `text` that are not in `stopwords`.

  `text` is iterated item by item (the pipeline passes a list of tokens);
  order is preserved. `stopwords` is the module-level collection.
  """
  kept = []
  for token in text:
    if token in stopwords:
      continue
    kept.append(token)
  return kept

def lemmatize(text):
  """Lemmatize each token in `text` and drop any resulting stopwords.

  Each token is passed to `WordNetLemmatizer.lemmatize` with its default
  arguments. The lemmas are filtered through `rm_stopwords` because a lemma
  can itself be a stopword even when the original token was not.
  """
  wnl = WordNetLemmatizer()
  lemmas = list(map(wnl.lemmatize, text))
  return rm_stopwords(lemmas)

def preprocess_pipeline(text):
  """Tokenize `text`, drop stopwords, lemmatize, and re-join with single spaces.

  Returns one string: the surviving lemmas separated by ' '.
  """
  lemmas = lemmatize(rm_stopwords(tokenize(text)))
  return ' '.join(lemmas)

In [7]:
# write only the cleaned text and its label, with a header row and no index column
df.to_csv(
  './imdb_processed.csv',
  columns=['text', 'label'],
  index=False,
  header=True,
)