In [2]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
stopwords = set(stopwords.words('english'))

np.random.seed(1337)

# read processed data
data = pd.read_csv('./imdb_processed_full.csv')
print(data['text'].str.len().describe())

[nltk_data] Downloading package stopwords to /Users/geko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/geko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/geko/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


count    50000.000000
mean       858.389040
std        658.428061
min         22.000000
25%        452.000000
50%        633.000000
75%       1044.000000
max       9434.000000
Name: text, dtype: float64


In [3]:
# preprocessing
def tokenize(text):
  return word_tokenize(text)

def rm_stopwords(text):
  return [i for i in text if i not in stopwords]

def lemmatize(text):
  lemmatizer = WordNetLemmatizer()    
  lemmas = [lemmatizer.lemmatize(t) for t in text]
  # make sure lemmas does not contains stopwords
  return rm_stopwords(lemmas)

def preprocess_pipeline(text):
  tokens = tokenize(text)
  no_stopwords = rm_stopwords(tokens)
  lemmas = lemmatize(no_stopwords)
  return ' '.join(lemmas)

In [4]:
data['text'] = data['text'].apply(preprocess_pipeline)
print(data.head)
# data[['text', 'label']].to_csv('./TMP.csv', index=False, header=True)
print(data['text'].str.len().describe())

<bound method NDFrame.head of                                                     text  label
0      rented curious yellow video store controversy ...      0
1      curious yellow risible pretentious steaming pi...      0
2      avoid making type film future film interesting...      0
3      film probably inspired godard masculin féminin...      0
4      oh brother hearing ridiculous film umpteen yea...      0
...                                                  ...    ...
49995  got around seeing monster man yesterday long w...      1
49996  got part competition prize watched really expe...      1
49997  got monster man box set three film mainly want...      1
49998  five minute started feel naff looking got comp...      1
49999  caught movie sci fi channel recently actually ...      1

[50000 rows x 2 columns]>
count    50000.000000
mean       805.422300
std        623.917502
min         17.000000
25%        420.000000
50%        592.000000
75%        978.000000
max       9133.000000