In [56]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
keras = tf.keras
from keras import Input, Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding
import csv
import math
import numpy as np
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from functools import lru_cache

In [57]:
# Fetch the NLTK data packages this notebook asks for: the stop-word lists,
# the "punkt" tokenizer data and the WordNet corpus.
# The last call's return value is what the cell displays.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/deep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/deep/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/deep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
# Read the train / test / validation splits from CSV files in the working
# directory, in that order, then preview the first training rows.
train_df, test_df, val_df = [
    pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv", "Valid.csv")
]
train_df.head(3)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0


In [59]:
# (rows, columns) of the training frame before any cleaning.
train_df.shape

(40000, 2)

In [60]:
# Column dtypes and non-null counts, to check for missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [61]:
# Number of training rows that are exact repeats of an earlier row.
train_df.duplicated().sum()

277

In [62]:
# Remove fully duplicated rows from every split.
# NOTE(review): the index is not reset here, so the frames keep their original
# (now gappy) row labels - confirm nothing downstream assumes a 0..n-1 index.
train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()
val_df = val_df.drop_duplicates()
train_df.shape

(39723, 2)

In [63]:
# Shared, module-level normalisers reused by tokenize_and_lemmatize for every token.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [64]:
@lru_cache(maxsize=None)
def preprocess(text: str) -> str:
    """Normalise a raw review into lower-case words separated by spaces.

    Steps, in order:
      1. lower-case the text;
      2. delete square-bracketed spans such as ``[note]``;
      3. replace URLs and HTML tags with a space;
      4. replace every remaining non-word character with a space;
      5. delete underscores (the only punctuation step 4 leaves behind);
      6. delete every token that contains a digit.

    URLs and tags must be handled before step 4: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces, the URL and tag patterns can
    no longer match and fragments such as ``https``, ``com`` or ``br`` would
    survive as ordinary words.

    Args:
        text: The raw document. Must be hashable (a ``str``) because results
            are memoised with ``lru_cache``.

    Returns:
        The cleaned text. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', "", text)
    # Substitute a space (not "") so the words on either side are not glued together.
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = re.sub(r'<.*?>+', " ", text)
    text = re.sub(r'\W', " ", text)
    # After the pass above only "_" can still match; newlines are already spaces.
    text = re.sub('[%s]' % re.escape(string.punctuation), "", text)
    text = re.sub(r'\w*\d\w*', "", text)

    return text

In [65]:
# English stop words as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))


@lru_cache(maxsize=None)
def _stem_tokens(text: str) -> tuple:
    """Tokenise ``text``, drop stop words, then lemmatise and stem each token.

    Returns an immutable tuple so the cached value cannot be altered by callers.
    """
    return tuple(
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in word_tokenize(text)
        if token not in stop_words
    )


def tokenize_and_lemmatize(text: str) -> list:
    """Split ``text`` into normalised word stems.

    Tokens are produced by ``word_tokenize``. Tokens found in ``stop_words``
    are discarded; the rest are passed through ``lemmatizer.lemmatize`` and
    then ``stemmer.stem``.

    Args:
        text: Text to tokenise. Must be hashable, since results are memoised.

    Returns:
        A new list of stemmed tokens on every call. The memoised result is kept
        as a tuple, so mutating the returned list never affects the cache or
        other rows built from the same text.
    """
    return list(_stem_tokens(text))


# Keep the cache helpers that the directly decorated version used to expose.
tokenize_and_lemmatize.cache_info = _stem_tokens.cache_info
tokenize_and_lemmatize.cache_clear = _stem_tokens.cache_clear

In [66]:
# Replace each raw training review with its list of stemmed tokens.
# Rows that already hold a list are passed through, so the cell can be re-run:
# preprocess is lru_cache-decorated and raises TypeError on an unhashable list.
train_df.iloc[:, 0] = train_df.iloc[:, 0].apply(
    lambda x: x if isinstance(x, list) else tokenize_and_lemmatize(preprocess(x))
)
train_df.head(3)

Unnamed: 0,text,label
0,"[grew, b, watch, love, thunderbird, mate, scho...",0
1,"[put, movi, dvd, player, sat, coke, chip, expe...",0
2,"[peopl, know, particular, time, past, like, fe...",0


In [67]:
# Replace each raw test review with its list of stemmed tokens.
# Rows that already hold a list are passed through, so the cell can be re-run:
# preprocess is lru_cache-decorated and raises TypeError on an unhashable list.
test_df.iloc[:, 0] = test_df.iloc[:, 0].apply(
    lambda x: x if isinstance(x, list) else tokenize_and_lemmatize(preprocess(x))
)
test_df.head(3)

Unnamed: 0,text,label
0,"[alway, wrote, seri, complet, stink, fest, jim...",0
1,"[watch, dir, steve, purcel, typic, mari, kate,...",0
2,"[movi, poorli, written, direct, fell, asleep, ...",0


In [68]:
# Replace each raw validation review with its list of stemmed tokens.
# Rows that already hold a list are passed through, so the cell can be re-run:
# preprocess is lru_cache-decorated and raises TypeError on an unhashable list.
val_df.iloc[:, 0] = val_df.iloc[:, 0].apply(
    lambda x: x if isinstance(x, list) else tokenize_and_lemmatize(preprocess(x))
)
val_df.head(3)

Unnamed: 0,text,label
0,"[year, sinc, sharon, stone, award, viewer, leg...",0
1,"[someon, need, make, car, payment, truli, aw, ...",0
2,"[guidelin, state, comment, must, contain, mini...",0
