In [21]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras
from tensorflow.keras.models import Sequential
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [22]:
# Specify column names
column_names = ['Review', 'Label']

# Read in txt files and set column headers
amazon = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t', header=None, names=column_names)
print(f'Amazon shape: {amazon.shape}')
imdb = pd.read_csv('imdb_labelled.txt', delimiter='\t', header=None, names=column_names)
print(f'IMDB shape: {imdb.shape}')
yelp = pd.read_csv('yelp_labelled.txt', delimiter='\t', header=None, names=column_names)
print(f'Yelp shape: {yelp.shape}')

Amazon shape: (1000, 2)
IMDB shape: (748, 2)
Yelp shape: (1000, 2)


In [23]:
# Concatenate the three dataframes together
df = pd.concat([amazon, imdb, yelp], ignore_index=True)

In [24]:
# Check for null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  2748 non-null   object
 1   Label   2748 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 43.1+ KB


In [25]:
# Check Label to make sure values look appropriate
df['Label'].value_counts()

Label
1    1386
0    1362
Name: count, dtype: int64

In [26]:
# Initial look at sentences
df.head(10)

Unnamed: 0,Review,Label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [27]:
# Set 'Review' to lowercase
df['Review'] = df['Review'].str.lower()

In [28]:
# Remove punctuation
pattern = re.compile(r'[^\w\s]')

df['Review'] = df['Review'].apply(lambda x: pattern.sub('', x))

In [29]:
# Re-inspect sentences to confirm they are now lower case and that there is no punctuation
df.head(10)

Unnamed: 0,Review,Label
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1
5,i have to jiggle the plug to get it to line up...,0
6,if you have several dozen or several hundred c...,0
7,if you are razr owneryou must have this,1
8,needless to say i wasted my money,0
9,what a waste of money and time,0


In [None]:
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/