In [2]:
import bz2
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Embedding, Flatten, Input, MaxPooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

%matplotlib inline
# Directory that holds the compressed train/test review files
data_dir = './amazon'
print(os.listdir(data_dir))  # files present in directory



['test.ft.txt.bz2', 'train.ft.txt.bz2', '.ipynb_checkpoints']


In [3]:
def get_labels_and_texts(file):
    """Read a bz2-compressed review file into a label array and a list of texts.

    Every line is decoded as UTF-8. The character at index 9 is parsed as the
    class digit and everything from index 10 onwards is kept as the review.
    (Assumes the fastText-style ``__label__N`` prefix, which is 9 characters
    long -- TODO confirm against the dataset description.)

    Args:
        file: Path or file object accepted by ``bz2.BZ2File``.

    Returns:
        Tuple ``(labels, texts)``: ``labels`` is a NumPy integer array holding
        ``digit - 1`` per line (so classes 1/2 become 0/1); ``texts`` is a
        list of strings with the separator after the digit and the trailing
        newline still attached.

    Raises:
        IndexError: If a line has fewer than 10 characters.
        ValueError: If the character at index 9 is not a digit.
    """
    labels = []
    texts = []
    # Context manager closes the compressed stream even if a line fails to parse
    with bz2.BZ2File(file) as stream:
        for line in stream:
            d = line.decode('utf-8')  # raw bytes -> str
            labels.append(int(d[9]) - 1)  # class digit, shifted from [1,2] to [0,1]
            texts.append(d[10:])  # review content
    return np.array(labels), texts

# Build both paths from data_dir so the dataset location is configured in one place
train_labels, train_texts = get_labels_and_texts(os.path.join(data_dir, 'train.ft.txt.bz2'))
test_labels, test_texts = get_labels_and_texts(os.path.join(data_dir, 'test.ft.txt.bz2'))

In [4]:
def normalize_texts(texts):
    """Return a cleaned copy of every string in ``texts``.

    Each string is lowercased, stripped of characters that are neither word
    characters nor whitespace, stripped of characters outside the 7-bit ASCII
    range, and finally trimmed of leading/trailing whitespace. Underscores
    count as word characters and therefore survive.

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one cleaned string per input string, in order.
    """
    punctuation = re.compile(r'[^\w\s]')
    non_ascii = re.compile(r'[^\x00-\x7F]')

    def _clean(raw):
        lowered = raw.lower()
        without_punctuation = punctuation.sub('', lowered)
        return non_ascii.sub('', without_punctuation).strip()

    return [_clean(raw) for raw in texts]
        
# Replace the raw reviews with their normalized form (the raw strings are not kept)
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [5]:
train_texts[:4]  # spot-check: first four normalized training reviews

['stuning even for the nongamer this sound track was beautiful it paints the senery in your mind so well i would recomend it even to people who hate vid game music i have played the game chrono cross but out of all of the games i have ever played it has the best music it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras it would impress anyone who cares to listen _',
 'the best soundtrack ever to anything im reading a lot of reviews saying that this is the best game soundtrack and i figured that id write a review to disagree a bit this in my opinino is yasunori mitsudas ultimate masterpiece the music is timeless and im been listening to it for years now and its beauty simply refuses to fadethe price tag on this is pretty staggering i must say but if you are going to buy any cd for this much money this is the only one that i feel would be worth every penny',
 'amazing this soundtrack is my favorite music of all time hands down the inten

In [6]:
# Fit the tokenizer on the training reviews only (the test reviews are not passed in)
num_words = 10000  # vocabulary cap handed to the Tokenizer
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [None]:
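# One-time tokenisation of both corpora; the results are cached as the .pkl files
# that later cells read back. Kept commented out because it is slow to re-run.
# train_sequences = tokenizer.texts_to_sequences(train_texts)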
# train_pickle = pickle.dumps(train_sequences)
# with open('train_pickle.pkl', 'wb') as file:
#     file.write(train_pickle)

# test_sequences = tokenizer.texts_to_sequences(test_texts)
# test_pickle = pickle.dumps(test_sequences)
# with open('test_pickle.pkl', 'wb') as file:
#     file.write(test_pickle)

In [7]:
# Load the cached tokenised training reviews. On a fresh checkout (no cache file
# yet) tokenise once and write the cache, so Restart & Run All works end to end.
if os.path.exists('train_pickle.pkl'):
    with open('train_pickle.pkl', 'rb') as file:
        train_pickle = file.read()
    # NOTE: only unpickle caches written by this notebook -- pickle can execute arbitrary code
    train_sequences = pickle.loads(train_pickle)
else:
    train_sequences = tokenizer.texts_to_sequences(train_texts)
    train_pickle = pickle.dumps(train_sequences)
    with open('train_pickle.pkl', 'wb') as file:
        file.write(train_pickle)

In [8]:
pd.DataFrame(train_sequences[:4])  # spot-check: first four token-id sequences, one per row (shorter rows show NaN)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,108,109,110,111,112,113,114,115,116,117
0,72,11,1,8,173,495,13,363,7,6002,...,,,,,,,,,,
1,1,87,1001,131,5,247,106,147,4,170,...,,,,,,,,,,
2,347,8,1001,9,21,297,119,6,27,55,...,5.0,82.0,2194.0,332.0,3.0,99.0,174.0,7.0,489.0,259.0
3,180,1001,3,470,32,8,1001,2,3,278,...,,,,,,,,,,


In [9]:
# Size of the full fitted vocabulary; printed for information and not used by the model below.
# The +1 presumably accounts for index 0 being reserved -- TODO confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Derive the embedding vocabulary size from the tokenizer cap instead of repeating
# the literal, so changing num_words cannot leave the Embedding layer too small.
max_features = num_words
embedding_dim = 100

# Longest training sequence; used as the fixed model input length
max_length = max(len(sequence) for sequence in train_sequences)
print(max_length)

2326947
254


In [10]:
def build_model():
    """Build and compile the 1-D convolutional binary classifier.

    Reads three notebook-level settings: ``max_length`` (input sequence
    length), ``max_features`` (number of embedding rows) and ``embedding_dim``
    (embedding width).

    Returns:
        A compiled Keras ``Model`` that maps a ``(max_length,)`` sequence of
        token ids to a single sigmoid output. It is compiled with the Adam
        optimizer, binary cross-entropy loss and the accuracy metric.
    """
    # Use the layer/model classes imported at the top of the notebook; the
    # `layers` / `models` module aliases are never imported in this file.
    sequences = Input(shape=(max_length,))
    embedding = Embedding(input_dim=max_features, output_dim=embedding_dim)(sequences)

    x = Conv1D(64, 5, activation='relu')(embedding)  # 64 filters spanning 5 consecutive tokens
    x = MaxPooling1D(5)(x)  # shrink the sequence axis by a factor of 5
    x = Conv1D(64, 3, activation='relu')(x)  # 64 filters spanning 3 pooled positions
    x = MaxPooling1D(3)(x)
    x = Flatten()(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(.5)(x)
    x = Dense(16, activation='relu')(x)
    x = Dropout(.5)(x)
    predictions = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=sequences, outputs=predictions)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [11]:
# Instantiate the network and print its layer-by-layer summary
model = build_model()
model.summary()

2024-01-02 20:47:16.328292: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 254)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 254, 100)          1000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 250, 64)           32064     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 50, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 64)            12352     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 16, 64)            0         
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0     

In [12]:
# Load the cached tokenised test reviews. On a fresh checkout (no cache file
# yet) tokenise once and write the cache, so Restart & Run All works end to end.
if os.path.exists('test_pickle.pkl'):
    with open('test_pickle.pkl', 'rb') as file:
        test_pickle = file.read()
    # NOTE: only unpickle caches written by this notebook -- pickle can execute arbitrary code
    test_sequences = pickle.loads(test_pickle)
else:
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    test_pickle = pickle.dumps(test_sequences)
    with open('test_pickle.pkl', 'wb') as file:
        file.write(test_pickle)

In [None]:
# Tried other ways of preparing the data for model fitting:
# convert_to_tensor fails here because it needs equal-length sequences and the data is not padded yet

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad/truncate both splits to max_length, the sequence length the model's Input layer was built with.
# The list-of-lists variables are replaced by the padded results.
train_sequences = pad_sequences(train_sequences, maxlen=max_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_length)

In [16]:
# Train for two epochs in batches of 128.
# NOTE(review): the test split is also passed as validation_data, so there is no
# separate validation set -- confirm this is intended.
model.fit(
    train_sequences, train_labels,
    batch_size=128, epochs=2, verbose=1,
    validation_data=(test_sequences, test_labels)
)

2024-01-03 13:07:49.035141: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fcd853401c0>

In [None]:
# Tried other ways of scoring model performance:
# classification_report expects discrete class labels, but test_labels_pred holds float probabilities

In [26]:
# Score the test reviews: threshold the sigmoid outputs at 0.5 for the
# label-based metrics and keep the raw outputs for ROC-AUC.
test_labels_pred = model.predict(test_sequences)
test_labels_bin = 1 * (test_labels_pred > 0.5)

accuracy = np.round(accuracy_score(test_labels, test_labels_bin), 4)
f1 = np.round(f1_score(test_labels, test_labels_bin), 4)
roc_auc = np.round(roc_auc_score(test_labels, test_labels_pred), 4)

print(f'Accuracy score: {accuracy}')
print(f'F1 score: {f1}')
print(f'ROC-AUC score: {roc_auc}')

Accuracy score: 0.9488
F1 score: 0.9486
ROC-AUC score: 0.9877
