In [42]:
# Importing necessary libraries

import re
import pandas as pd
import numpy as np
from numpy import loadtxt
import matplotlib.pyplot as plt
import nltk
import nltk.data
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout,Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split
import keras

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BERAT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BERAT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BERAT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\BERAT\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [25]:
# This function extracts individual reviews from the raw lines of a review file.
def extract_reviews_from_lines(lines):
    reviews = []
    review = []
    inside_review = False
    for line in lines:
        if '<review>' in line:
            inside_review = True
        elif '</review>' in line:
            inside_review = False
            reviews.append(''.join(review).strip())  # Join all lines of a review into a single string
            review = []  # Reset for the next review
        elif inside_review:
            review.append(line.strip())
    return reviews

# This function loads reviews for specified product types and labels.
def load_reviews(product_types, labels):
    data = {}

    for product in product_types:
        data[product] = {}

        for label in labels:
            file_path = f"sorted_data_acl/{product}/{label}.review"
            try:
                with open(file_path, 'r') as file:
                    lines = file.readlines()
                    data[product][label] = extract_reviews_from_lines(lines)
            except FileNotFoundError:
                print(f"File {file_path} not found. Skipping...")
                data[product][label] = []

    return data

product_types = ['books', 'dvd', 'electronics', 'kitchen_&_housewares']
labels = ['negative', 'positive']

# Load reviews for all product and label combinations.
reviews_data = load_reviews(product_types, labels)

# Flatten out negative and positive reviews from all product categories into two separate lists.
all_negative_reviews = [review for product in product_types for review in reviews_data[product]['negative']]
all_positive_reviews = [review for product in product_types for review in reviews_data[product]['positive']]

print(all_positive_reviews[:1]) 


['<unique_id>0785758968:one_of_the_best_crichton_novels:joseph_m</unique_id><asin>0785758968</asin><product_name>Sphere: Books: Michael Crichton</product_name><product_type>books</product_type><helpful>0 of 1</helpful><rating>5.0</rating><title>One of the best Crichton novels</title><date>July 1, 2006</date><reviewer>Joseph M</reviewer><reviewer_location>Colorado, USA</reviewer_location><review_text>Sphere by Michael Crichton is an excellant novel. This was certainly the hardest to put down of all of the Crichton novels that I have read.The story revolves around a man named Norman Johnson. Johnson is a phycologist. He travels with 4 other civilans to a remote location in the Pacific Ocean to help the Navy in a top secret misssion. They quickly learn that under the ocean is a half mile long spaceship. The civilans travel to a center 1000 feet under the ocean to live while researching the spacecraft. They are joined by 5 Navy personel to help them run operations. However on the surface a

In [26]:
# This function extracts the actual review text from the full review content.

def extract_review_text(review):
    pattern = r"<review_text>(.*?)</review_text>"
    match = re.search(pattern, review, re.DOTALL) 
    return [match.group(1).strip()] if match else []

# For each review in the all_negative(or positive)_reviews list, the actual review text is extracted 
# using the function above. If there's a valid extracted review, it's added to the list.
all_negative_reviews = [extract_review_text(review) for review in all_negative_reviews if extract_review_text(review)]
all_positive_reviews = [extract_review_text(review) for review in all_positive_reviews if extract_review_text(review)]

print(all_negative_reviews[:2])


[["THis book was horrible.  If it was possible to rate it lower than one star i would have.  I am an avid reader and picked this book up after my mom had gotten it from a friend.  I read half of it, suffering from a headache the entire time, and then got to the part about the relationship the 13 year old boy had with a 33 year old man and i lit this book on fire.  One less copy in the world...don't waste your money.I wish i had the time spent reading this book back so i could use it for better purposes.  THis book wasted my life"], ['I like to use the Amazon reviews when purchasing books, especially alert for dissenting perceptions about higly rated items, which usually disuades me from a selection.  So I offer this review that seriously questions the popularity of this work - I found it smug, self-serving and self-indulgent, written by a person with little or no empathy, especially for the people he castigates. For example, his portrayal of the family therapist seems implausible and r

In [27]:
# Converting all reviews' letters to lowercase
all_negative_reviews = [[text[0].lower()] for text in all_negative_reviews]
all_positive_reviews = [[text[0].lower()] for text in all_positive_reviews]

print(all_negative_reviews[:2]) 


[["this book was horrible.  if it was possible to rate it lower than one star i would have.  i am an avid reader and picked this book up after my mom had gotten it from a friend.  i read half of it, suffering from a headache the entire time, and then got to the part about the relationship the 13 year old boy had with a 33 year old man and i lit this book on fire.  one less copy in the world...don't waste your money.i wish i had the time spent reading this book back so i could use it for better purposes.  this book wasted my life"], ['i like to use the amazon reviews when purchasing books, especially alert for dissenting perceptions about higly rated items, which usually disuades me from a selection.  so i offer this review that seriously questions the popularity of this work - i found it smug, self-serving and self-indulgent, written by a person with little or no empathy, especially for the people he castigates. for example, his portrayal of the family therapist seems implausible and r

# Cleaning and tokenizing the data

In [28]:
def clean_review_text(text):

    # Extract the review string from the list
    review = text[0]

    # Remove underscores and slashes
    review = review.replace('_',' ').replace('/','')

    # Remove numbers
    review = re.sub(r'\b[0-9]+\b\s*', '', review)

    # Remove hyperlinks
    review = re.sub(r'https?://\S+', '', review)

    # Remove the <a> tags but keep their contents
    review = re.sub(r'<a[^>]*>(.*?)</a>', r'\1', review)

    # Remove other HTML tags but keep their contents
    review = re.sub(r'<.*?>', '', review)

    # Remove alphanumerics that don't mean anything
    review = re.sub(r'\w*\d\w*', '', review)

    # Tokenize without undesired punctuation and return as a list of tokens
    tokens = RegexpTokenizer(r"\w+(?:'\w+)?|!|\?").tokenize(review)

    return tokens  # We return a list of lists (with the tokens as a sub-list)

# Apply the function to each review in the lists of lists
cleaned_negative_reviews = [clean_review_text(review) for review in all_negative_reviews]
cleaned_positive_reviews = [clean_review_text(review) for review in all_positive_reviews]


# Removing stop words

In [29]:
stop_words = set(stopwords.words('english'))

retain_words = {'not', 'no', "isn't", "hasn't", "wasn't", "don't", "doesn't", "didn't",
                "can't", "couldn't", "won't", "wouldn't", "shouldn't", "hardly", "just",
                "only", "always", "never", "could"}

# Remove retained words from the set of stopwords
stop_words = stop_words - retain_words

# This function takes in a review and removes any word in it that's part of the stopwords. 
def remove_stopwords(review):
    return [[word for word in tokens if word not in stop_words] for tokens in review]

# Remove stopwords from the tokenized reviews
cleaned_negative_reviews = remove_stopwords(cleaned_negative_reviews)
cleaned_positive_reviews = remove_stopwords(cleaned_positive_reviews)


# Chunking and POS tagging

In [30]:
# Noun Phrase (NP)
# Verb Phrase (VP)
# Adjective Phrase (ADJP) (there are 2 patterns we can use {<RB>?<JJ> and <JJ><NN|JJ>?}.
# I am using both.
# Adverb Phrase (ADVP)
# Prepositional Phrase (PP)
grammar = r"""
    NP: {<DT>?<JJ.*>*<NN.*>+} 
    VP: {<VB.*><NP|JJ|PP|ADVP|VB>*}
    ADJP: {<RB>?<JJ>|<JJ><NN|JJ>?}
    ADVP: {<RB.*><RB>?}
    PP: {<IN><NP>} 
"""

chunk_parser = nltk.RegexpParser(grammar)

# Function to POS tag each review
def pos_tag_reviews(reviews):
    return [nltk.pos_tag(review) for review in reviews]

# Function to chunk each tagged review
def chunk_reviews(tagged_reviews):
    return [chunk_parser.parse(tagged_review) for tagged_review in tagged_reviews]

# POS tag the reviews
tagged_negative_reviews = pos_tag_reviews(cleaned_negative_reviews)
tagged_positive_reviews = pos_tag_reviews(cleaned_positive_reviews)

# Chunk the tagged reviews
chunked_negative_reviews = chunk_reviews(tagged_negative_reviews)
chunked_positive_reviews = chunk_reviews(tagged_positive_reviews)

# Lemmatization

In [31]:
def get_wordnet_pos(treebank_tag):
    """Map treebank POS tag to first character used by WordNetLemmatizer."""
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(treebank_tag[0], wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

# Lemmatize words in a tagged review using their respective POS tag.
def lemmatize_review(tagged_review):
    return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged_review]

# Function to lemmatize each review in the list of reviews
def lemmatize_reviews(tagged_reviews):
    return [lemmatize_review(review) for review in tagged_reviews]

# Apply the lemmatization function to the tagged reviews
lemmatized_negative_reviews = lemmatize_reviews(tagged_negative_reviews)
lemmatized_positive_reviews = lemmatize_reviews(tagged_positive_reviews)


# Frequency distribution

In [32]:
# Flatten the reviews to create all_words
all_words = [word for review in (lemmatized_negative_reviews + lemmatized_positive_reviews) for word in review]

# Calculate the frequency distribution
fdist = FreqDist(all_words)

# Create word-to-int mapping
sorted_vocab = sorted(fdist, key=fdist.get, reverse=True)
word2int = {word: i+1 for i, word in enumerate(sorted_vocab)}  # start from 1 to reserve 0 for padding later

# Function to encode a single review
def encode_review(review):
    return [word2int[word] for word in review if word in word2int]

# Encode the reviews while maintaining the list of lists structure
encoded_negative_reviews = [encode_review(review) for review in lemmatized_negative_reviews]
encoded_positive_reviews = [encode_review(review) for review in lemmatized_positive_reviews]


In [33]:
# print the most common words
print(fdist.most_common(10))

[('not', 6972), ('book', 5242), ('one', 5145), ('!', 4365), ('get', 4030), ('use', 3625), ('like', 3402), ('make', 3376), ('just', 3211), ('good', 3110)]


In [34]:
word_freq = dict(fdist)

# for top 10 words
n = 10  
top_words = {k: word_freq[k] for k in list(word_freq)[:n]}
print(top_words)


{'book': 5242, 'horrible': 155, 'possible': 140, 'rate': 148, 'lower': 21, 'one': 5145, 'star': 744, 'would': 2759, 'avid': 22, 'reader': 435}


# Encode the labels for ‘positive’ and ‘negative’

In [35]:
# Creating labels for positive and negative reviews. 
# For our sentiment analysis, we're using binary classification: 
# 1 represents positive sentiments, and 0 represents negative sentiments.

# Generate a list of ones with the same length as `lemmatized_positive_reviews`
labels_positive = [1] * len(lemmatized_positive_reviews)

# Generate a list of zeros with the same length as `lemmatized_negative_reviews`
labels_negative = [0] * len(lemmatized_negative_reviews)

# Concatenate the two lists to get a complete list of labels for both positive and negative reviews
all_labels = labels_negative + labels_positive


# outlier removal

In [36]:
# Calculate the length of each review in the combined list of both lemmatized positive and negative reviews
review_lengths = [len(review) for review in lemmatized_negative_reviews + lemmatized_positive_reviews]

# Compute the average (mean) length of all reviews
avg_length = np.mean(review_lengths)

# Compute the standard deviation of the lengths of all reviews. 
std_length = np.std(review_lengths)


# Define a minimum threshold for review length as one standard deviation below the average length. 
min_threshold = avg_length - std_length  # 1 standard deviation below the mean

# Define a maximum threshold for review length as two standard deviations above the average length. 
max_threshold = avg_length + 2 * std_length  # 2 standard deviations above the mean


In [37]:
# Create a list of reviews that only includes those whose lengths are between the defined min_threshold and max_threshold.
# It filters out reviews that are too short (less than min_threshold) or too long (greater than max_threshold).
filtered_reviews = [review for review in lemmatized_negative_reviews + lemmatized_positive_reviews if min_threshold <= len(review) <= max_threshold]

# Similarly, for labels corresponding to the reviews, we only want to keep those that are associated with reviews 
# whose lengths are between the defined min_threshold and max_threshold.
filtered_labels = [label for review, label in zip(lemmatized_negative_reviews + lemmatized_positive_reviews, all_labels) if min_threshold <= len(review) <= max_threshold]


# pad/truncate data

In [38]:
# First, we're filtering encoded reviews based on their length to remove those that are too short or too long.
# The result is a list of reviews whose lengths are between min_threshold and max_threshold.
filtered_encoded_reviews = [review for review in encoded_negative_reviews + encoded_positive_reviews if min_threshold <= len(review) <= max_threshold]

# Similarly, we filter the labels of the reviews to ensure they correspond to the lengths of the reviews 
# that we're keeping after the above filtering process.
filtered_labels = [label for review, label in zip(encoded_negative_reviews + encoded_positive_reviews, all_labels) if min_threshold <= len(review) <= max_threshold]

# max_sequence_length is set based on max_threshold, which defines the maximum length for a review.
# This value is used for padding and truncating reviews to ensure they all have the same length.
max_sequence_length = int(max_threshold)  

# Here, we're padding (or truncating) each review in our filtered list to make sure every review has the same length.
# If a review is shorter than max_sequence_length, it gets padded with zeros at the end. 
# If a review is longer than max_sequence_length, it's truncated to fit the specified length.
padded_reviews = pad_sequences(filtered_encoded_reviews, maxlen=max_sequence_length, padding='post', truncating='post')


# split the dataset

In [40]:
# First, split data into training and temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(padded_reviews, filtered_labels, test_size=0.2, stratify=filtered_labels, random_state=42)

# Then, split temp data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


# building the ann model

In [54]:
vocab_size = len(word2int) + 1  
embedding_dim = 64  
max_length = int(max_threshold)  

model = Sequential()

# Embedding layer to convert integer sequences to dense vectors of fixed size.
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))

# Flatten the 2D embedding output to 1D
model.add(Flatten())

# Hidden layer with 64 units and ReLU activation
model.add(Dense(64, activation='relu'))

# Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Output layer with 1 unit (binary classification: positive or negative)
model.add(Dense(1, activation='sigmoid'))

# Compile the model using binary cross-entropy loss, since it's a binary classification problem
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 236, 64)           2217792   
                                                                 
 flatten_6 (Flatten)         (None, 15104)             0         
                                                                 
 dense_21 (Dense)            (None, 64)                966720    
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_22 (Dense)            (None, 1)                 65        
                                                                 
Total params: 3184577 (12.15 MB)
Trainable params: 3184577 (12.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [55]:
# Convert y_train and y_val to numpy arrays.
# Neural networks in TensorFlow/Keras expect the input data and labels to be numpy arrays.
y_train = np.array(y_train)
y_val = np.array(y_val)

# Fit the model using the training data.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
# Convert y_test to a numpy array.
# Neural networks in TensorFlow/Keras expect the labels to be numpy arrays.
y_test = np.array(y_test)

# Evaluate the performance of the trained model on the test set.
loss, accuracy = model.evaluate(X_test, y_test)

# Print the accuracy of the model on the test set.
print(f"Test Accuracy: {accuracy*100:.2f}%")


Test Accuracy: 79.11%


In [20]:
# Save the model for the website
model.save('model.h5')

  saving_api.save_model(


In [21]:
import pickle

# Variables to save for the website
variables = {
    "max_length": max_length,
    "word2int": word2int
}

# Save the variables to a pickle file
with open('variables.pkl', 'wb') as f:
    pickle.dump(variables, f)
