In [1]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras
from tensorflow.keras.models import Sequential
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# Specify column names
column_names = ['Review', 'Label']

# Read in txt files and set column headers
amazon = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t', header=None, names=column_names)
print(f'Amazon shape: {amazon.shape}')
imdb = pd.read_csv('imdb_labelled.txt', delimiter='\t', header=None, names=column_names)
print(f'IMDB shape: {imdb.shape}')
yelp = pd.read_csv('yelp_labelled.txt', delimiter='\t', header=None, names=column_names)
print(f'Yelp shape: {yelp.shape}')

Amazon shape: (1000, 2)
IMDB shape: (748, 2)
Yelp shape: (1000, 2)


In [3]:
# Concatenate the three dataframes together
df = pd.concat([amazon, imdb, yelp], ignore_index=True)

In [4]:
# Check for null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  2748 non-null   object
 1   Label   2748 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 43.1+ KB


In [5]:
# Check Label to make sure values look appropriate
df['Label'].value_counts()

Label
1    1386
0    1362
Name: count, dtype: int64

In [6]:
# Set 'Review' to lowercase
print(f'Before applying lowercase: {df['Review'].iloc[0]}')
df['Review'] = df['Review'].str.lower()
print(f'After applying lowercase: {df['Review'].iloc[0]}')

Before applying lowercase: So there is no way for me to plug it in here in the US unless I go by a converter.
After applying lowercase: so there is no way for me to plug it in here in the us unless i go by a converter.


In [7]:
# Remove punctuation
pattern = re.compile(r'[^\w\s]')

print(f'Before removing punctuation: {df['Review'].iloc[0]}')
df['Review'] = df['Review'].apply(lambda x: pattern.sub('', x))
print(f'After removing punctuation: {df['Review'].iloc[0]}')

Before removing punctuation: so there is no way for me to plug it in here in the us unless i go by a converter.
After removing punctuation: so there is no way for me to plug it in here in the us unless i go by a converter


# Preprocess Text with NTLK

In [8]:
# Items needed for NLTK

# Needed for Tokenization
nltk.download('punkt')

# Needed for Stopwords
nltk.download('stopwords')

# Needed for Lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

# Create Tokenization function
def tokenize_text(text):
    if pd.notnull(text):
        tokens = word_tokenize(text)
        return tokens
    else:
        return []

# Apply Tokenization function
print(f'Before Tokenization: {df['Review'].iloc[0]}')
df['Review'] = df['Review'].apply(tokenize_text)
print(f'After Tokenization: {df['Review'].iloc[0]}')

Before Tokenization: so there is no way for me to plug it in here in the us unless i go by a converter
After Tokenization: ['so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'us', 'unless', 'i', 'go', 'by', 'a', 'converter']


In [10]:


stop_words = set(stopwords.words('english'))

# Create Stopwords function
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Apply Stopwords function
print(f'Before Stopwords: {df['Review'].iloc[0]}')
df['Review'] = df['Review'].apply(remove_stopwords)
print(f'After Stopwords: {df['Review'].iloc[0]}')

Before Stopwords: ['so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'us', 'unless', 'i', 'go', 'by', 'a', 'converter']
After Stopwords: ['way', 'plug', 'us', 'unless', 'go', 'converter']


In [11]:


# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Create Lemmatization function
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

# Apply Lemmatization function
print(f'Before Lemmatization: {df['Review'].iloc[0]}')
df['Review'] = df['Review'].apply(lemmatize_tokens)
print(f'After Lemmatization: {df['Review'].iloc[0]}')

Before Lemmatization: ['way', 'plug', 'us', 'unless', 'go', 'converter']
After Lemmatization: ['way', 'plug', 'u', 'unless', 'go', 'converter']
