In [43]:
# # prerequisites or dependencies required for project 

# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install nltk
# %pip install spacy
# %pip install scikit-learn
# %pip install keras
# %pip install tensorflow
# %pip install gensim

In [44]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing import sequence
from tensorflow.keras.preprocessing import sequence 
import spacy
from gensim.models import Word2Vec

In [45]:
# # nltk downloads
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

# # Spacy download
# spacy.cli.download("en_core_web_sm")
# spacy.cli.download("en_core_web_md")

In [46]:
df = pd.read_csv("AmazonReview/Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [56]:
# Step 1: Exploratory Data Analysis (EDA)
def perform_eda(data):
    # Explore the dataset and analyze the relevant statistics
    # Identify any missing values, outliers, or patterns in the data
    
    # Map the scores to sentiment labels
    data['Sentiment'] = data['Score'].apply(lambda score: 'Positive' if score > 3 else 'Negative' if score < 3 else 'Neutral')

# Step 2: Preprocessing the DataFrame
def preprocess_dataframe(data):
    # Perform data cleaning, remove unwanted characters, convert to lowercase
    # Tokenize the text, remove stopwords
    # Apply stemming or lemmatization to reduce words to their base form
    # Perform Part-of-Speech (POS) tagging for feature extraction
    
    # Example code:
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    # Data cleaning
    data['clean_text'] = data['Text'].str.replace(r'[^a-zA-Z\s]', '').str.lower()
    
    # Tokenization and removing stopwords
    data['tokens'] = data['clean_text'].apply(lambda x: [word for word in word_tokenize(x) if word not in stop_words])
    
    # Stemming
    data['stemmed'] = data['tokens'].apply(lambda x: [porter.stem(word) for word in x])
    
    # Lemmatization
    data['lemmatized'] = data['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    
    # POS tagging
    data['pos_tags'] = data['tokens'].apply(lambda x: pos_tag(x))
    
    return data

# Step 3: Word Embedding using libraries
def word_embedding(data):
    # # Use spaCy for word embedding
    
    # # Load the spaCy model
    # nlp = spacy.load('en_core_web_md')
    
    # # Create a list to store the word vectors
    # embeddings = []
    
    # # Iterate over each preprocessed text
    # for text in data['clean_text']:
    #     # Process the text with spaCy
    #     doc = nlp(text)
        
    #     # Get the average vector for the text
    #     avg_vector = doc.vector
        
    #     # Append the average vector to the embeddings list
    #     embeddings.append(avg_vector)
    
    # # Convert the embeddings list to a NumPy array
    # embedding_matrix = np.array(embeddings)
    
    # return embedding_matrix

    # Use Word2Vec for word embedding
    
    # Convert the preprocessed text data to a list of sentences
    sentences = [sentence.split() for sentence in data['clean_text']]
    
    # Train Word2Vec model
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    
    # Get the embedding matrix
    embedding_matrix = model.wv.vectors
    
    return embedding_matrix

# Step 4: Modeling using RNN
def create_rnn_model(embedding_matrix, max_len):
    # Create an RNN model (e.g., LSTM) for sentiment analysis
    # Use the embedding_matrix for word embedding
    
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], 
                        weights=[embedding_matrix], input_length=max_len, trainable=False))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(64))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    return model
    

# Step 5: Test-Train Splitting
def perform_train_test_split(data, splitsize):
    # Split the preprocessed data into training and testing sets
    
    # Split the dataset into text samples and sentiment labels
    X = data['Text']
    y = data['Score']

    # Split the dataset into training and testing sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=splitsize, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=splitsize, random_state=42)
        
    return X_train, X_test, y_train, y_test

# Step 6: Model Training and Testing
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, max_len):
    # Compile and train the model on the training data
    # Evaluate the model on the testing data
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    # vocab_size = len(tokenizer.word_index) + 1
    # max_len = 300

    X_train = sequence.pad_sequences(X_train, padding='post', maxlen=max_len)
    X_test = sequence.pad_sequences(X_test, padding='post', maxlen=max_len)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate the model on the testing data
    loss, accuracy = model.evaluate(X_test, y_test)
    print("Test Loss:", loss)
    print("Test Accuracy:", accuracy)


    # model testing 

    # Obtain predicted labels from the sentiment analysis model for the test data
    y_pred = model.predict(X_test)

    # Assign the true labels of the test data to y_true
    y_true = y_test

    # Convert probabilities to predicted class labels
    y_pred_classes = np.argmax(y_pred, axis=1)  # Assuming one-hot encoded labels

    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred_classes, average='macro')
    recall = recall_score(y_true, y_pred_classes, average='macro')
    f1 = f1_score(y_true, y_pred_classes, average='macro')

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)


In [48]:
# Load the dataset
data = df[:2000].copy()
data.shape

(2000, 10)

In [49]:
# Step 1: Exploratory Data Analysis
perform_eda(data)

In [50]:

# Step 2: Preprocessing the DataFrame
preprocessed_data = preprocess_dataframe(data)


In [51]:

# Step 3: Word Embedding using libraries
embedding_matrix = word_embedding(preprocessed_data)


In [52]:
max_len = 200

In [53]:

# Step 4: Modeling using RNN
model = create_rnn_model(embedding_matrix, max_len)


In [54]:

# Step 5: Test-Train Splitting
X_train, X_test, y_train, y_test = perform_train_test_split(preprocessed_data, 0.2)


In [57]:

# Step 6: Model Training and Testing
train_and_evaluate_model(model, X_train, X_test, y_train, y_test, max_len)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: -238.21351623535156
Test Accuracy: 0.10000000149011612
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# import pickle
# import numpy as np
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# from keras.models import load_model

# def test_model(model, X_test, y_test):
#     # Preprocess the test data (similar to preprocessing done during training)
#     # Pass the preprocessed test data through the trained model
#     # Calculate evaluation metrics (e.g., accuracy, precision, recall, F1-score)

#     # Preprocess the test data (similar to preprocessing done during training)
#     tokenizer = Tokenizer()
#     tokenizer.fit_on_texts(X_test)
#     X_test = tokenizer.texts_to_sequences(X_test)
#     max_len = 100
#     X_test = sequence.pad_sequences(X_test, padding='post', maxlen=max_len)

#     # Pass the preprocessed test data through the trained model
#     y_pred = model.predict(X_test)
#     y_pred = np.round(y_pred).flatten()

#     # Calculate evaluation metrics
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)

#     # Print evaluation metrics
#     print("Test Accuracy:", accuracy)
#     print("Precision:", precision)
#     print("Recall:", recall)
#     print("F1-score:", f1)

#     # Additional steps for saving the model
#     model.save("sentiment_analysis_model.h5")
#     with open("tokenizer.pickle", "wb") as handle:
#         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# # Load the trained model and tokenizer
# model = load_model("sentiment_analysis_model.h5")
# with open("tokenizer.pickle", "rb") as handle:
#     tokenizer = pickle.load(handle)

# # Perform testing on new, unseen data
# new_data = ["This restaurant exceeded my expectations! The food was delicious and the service was impeccable."]
# new_data = tokenizer.texts_to_sequences(new_data)
# max_len = 100
# new_data = sequence.pad_sequences(new_data, padding='post', maxlen=max_len)
# predictions = model.predict(new_data)
# print("Predictions:", predictions)

# # Further improvements, hyperparameter tuning, and model evaluation can be performed based on the task requirements.
