In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# download loop splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the embedded URL carries X-Goog-Date 20240926T225432Z and
# X-Goog-Expires 259200, so it is presumably a time-limited signed link that
# has to be regenerated once it lapses -- confirm before relying on this cell.
DATA_SOURCE_MAPPING = 'reviews-shl:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5753003%2F9462344%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240926%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240926T225432Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dcafa1aacaac12bdb20c918acaaded3d261adf93760db0ba888e986aa2b35aa22cc930e12b946a11745d85a240e24ff0c1c57ec4333daf25c39738f3260c83f849d7fab15f39bef76034e1a6a2c6f70d2b5fac8ae4e6b968b2c89884b500197140ba17c0b8e23da51b2e72b79c6705e9d78004cd2dfdf104ff0cc51cd613fa8ec191101a9b913b318ce2891e4ec0f9817c4fbbfd5c6321ff12065f67afe2f79cdb93a2356c08d6b9980d9f0d89f22a42b243f9ff0561ccba4712c9c403f8d95ee47b1cec94a9fd403086902e9ec40debb130f143c65711090ba83534b8954b45aea60345d95c4098bb841dfa1d1d571dc4fa580af3d36b41f727b7fbc3e010499'

# Directories this cell creates for the downloaded datasets and for notebook outputs.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any cell visible in this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. Failures are reported and the loop moves on to the next source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<URL-encoded download URL>". Split on the first
    # ':' only, so an unexpected literal colon later in the entry cannot break
    # the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be missing (the lookup then yields None); in that
            # case skip the proportional bar instead of crashing on int(None).
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to a name other than `tarfile` so the module
                # is not shadowed for later iterations or later cells.
                # NOTE(review): extractall on a downloaded archive trusts its
                # member paths; consider an extraction filter if the source is
                # not trusted.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import keras.utils as ku

import pandas as pd
import numpy as np
import string, os

import warnings
# Silence warnings of every category for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE(review): the blanket "ignore" above is not restricted to a category, so
# this FutureWarning-specific filter appears redundant.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources
# NOTE(review): the cells visible in this notebook clean text with spaCy and
# never call stopwords or WordNetLemmatizer, so these downloads look unused --
# confirm before keeping them.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import nltk

# Attempt to download the WordNet corpus
# (the same 'wordnet' package requested in the previous cell, here with an
# explicit download_dir).
nltk.download('wordnet', download_dir='/usr/share/nltk_data')


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
import string
import spacy

# spaCy English pipeline; clean_text below uses it as a notebook-level global.
nlp = spacy.load("en_core_web_sm")

# Read the supplement reviews straight from the Kaggle input directory.
article_df = pd.read_csv('/kaggle/input/reviews-shl/reviews_supplements.csv')

# Keep the non-missing entries of the 'text' column, then discard the
# placeholder value "Unknown". The result is a plain Python list.
review_texts = article_df['text'].dropna()
all_texts = [entry for entry in review_texts.values if entry != "Unknown"]

# Function to clean text using spaCy
def clean_text(txt):
    """Normalise one review into a space-separated string of lowercase lemmas.

    The text is run through the notebook-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are dropped; every remaining
    token contributes its lemma, lowercased.

    Args:
        txt: Raw review text.

    Returns:
        The kept lemmas joined by single spaces ('' when nothing is kept).
    """
    doc = nlp(txt)
    # Whitespace tokens (newlines, runs of spaces) are neither stop words nor
    # punctuation, so they must be filtered explicitly or they leak into the
    # cleaned text as stray whitespace.
    kept_lemmas = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(kept_lemmas)

# Clean every review, in order; the resulting list is the training corpus.
corpus = list(map(clean_text, all_texts))

# Peek at the first ten cleaned reviews.
print(corpus[:10])


NameError: name '_C' is not defined

In [None]:
from tensorflow.keras.utils import to_categorical


# Initialize the Tokenizer with a limited vocabulary size
# max_words is reused further down as the one-hot label width
# (to_categorical's num_classes).
max_words = 8000  # Adjust this value based on your needs
tokenizer = Tokenizer(num_words=max_words)
# Build the vocabulary from the cleaned reviews.
tokenizer.fit_on_texts(corpus)

# Convert data to sequence of tokens
# Read as a global by generate_sequences (truncation length) and passed to
# pad_sequences as maxlen.
max_sequence_len = 150  # Adjust this value based on your needs

def generate_sequences(texts):
    """Expand each text into its growing token prefixes (n-gram sequences).

    Every text is encoded with the notebook-level ``tokenizer``. For a text
    of k tokens the prefixes of length 2, 3, ..., k are emitted in that order.
    A prefix longer than the notebook-level ``max_sequence_len`` keeps only its
    last ``max_sequence_len`` tokens.

    Args:
        texts: Iterable of strings.

    Returns:
        A flat list of token-id lists, in input order.
    """
    prefixes = []
    encoded_texts = (tokenizer.texts_to_sequences([doc])[0] for doc in texts)
    for encoded in encoded_texts:
        for end in range(2, len(encoded) + 1):
            # Slicing from the tail is a no-op for prefixes that already fit.
            prefixes.append(encoded[:end][-max_sequence_len:])
    return prefixes

# Expand the corpus into n-gram prefixes and left-pad them to a common length.
input_sequences = generate_sequences(corpus)
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Every column but the last is model input; the last column is the next-word
# target, one-hot encoded over max_words classes.
predictors = padded_sequences[:, :-1]
label = to_categorical(padded_sequences[:, -1], num_classes=max_words)

In [None]:
# After generating sequences and padding
print("Predictors shape:", predictors.shape)
print("Label shape:", label.shape)

# Update these variables based on the actual shapes
# (this overwrites the notebook-level max_sequence_len and max_words with
# values derived from the arrays).
max_sequence_len = predictors.shape[1] + 1  # +1 because predictors don't include the last word
max_words = label.shape[1]

# Create the model
def create_model(max_words, max_sequence_len):
    """Build and compile the single-LSTM next-word model.

    Layers, in order: Embedding (10-dimensional, input length
    ``max_sequence_len - 1``), LSTM with 50 units returning only its final
    output, Dropout at 0.1, and a softmax Dense layer with ``max_words`` units.

    Args:
        max_words: Embedding input size and number of output classes.
        max_sequence_len: Full sequence length including the target word.

    Returns:
        The model, compiled with categorical cross-entropy and Adam.
    """
    model = Sequential()
    model.add(Embedding(max_words, 10, input_length=max_sequence_len-1))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.1))
    model.add(Dense(max_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Create and compile the model
model = create_model(max_words, max_sequence_len)

# Build the model with the correct input shape
# (batch dimension left as None; the second dimension is max_sequence_len-1,
# the same input length create_model gives the Embedding layer).
model.build(input_shape=(None, max_sequence_len-1))

# Print the layer-by-layer summary.
model.summary()

In [None]:
def create_model(total_words, max_sequence_len=300):
    """Build and compile the single-LSTM next-word model.

    Args:
        total_words: Embedding input size and number of softmax output classes.
        max_sequence_len: Full sequence length including the target word; the
            Embedding input length is one less. Defaults to 300.

    Returns:
        The model, compiled with categorical cross-entropy and Adam.
    """
    layers = [
        Embedding(input_dim=total_words, output_dim=10, input_length=max_sequence_len - 1),
        LSTM(50, return_sequences=False),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Create the model with explicit input shape.
# `total_words` was never assigned anywhere in this notebook, so the original
# call raised NameError on a fresh kernel. The softmax layer has to be as wide
# as the one-hot labels, so derive the vocabulary size from `label`.
input_len = predictors.shape[1]
total_words = label.shape[1]
model = create_model(total_words, max_sequence_len=input_len+1)

# Build the model with sample input
model.build(input_shape=(None, input_len))

# Print model summary
model.summary()

NameError: name 'predictors' is not defined

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional

def create_model(max_words, max_sequence_len):
    """Build and compile the stacked bidirectional-LSTM next-word model.

    Layers, in order: a 100-dimensional Embedding (input length
    ``max_sequence_len - 1``), a bidirectional LSTM(128) that returns the full
    sequence, Dropout at 0.3, a second bidirectional LSTM(128), Dropout at 0.3,
    and a softmax Dense layer with ``max_words`` units.

    Args:
        max_words: Embedding input size and number of output classes.
        max_sequence_len: Full sequence length including the target word.

    Returns:
        The model, compiled with categorical cross-entropy and Adam.
    """
    model = Sequential()
    model.add(Embedding(max_words, 100, input_length=max_sequence_len-1))
    # First recurrent layer keeps every timestep so the second one can consume them.
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.3))
    model.add(Dense(max_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


In [None]:
# Train the model
# NOTE(review): `model` is whichever create_model(...) result was assigned last
# in the kernel; several cells above define create_model, so confirm which
# architecture is actually being trained.
# NOTE(review): EarlyStopping is imported at the top of the notebook, but no
# callbacks or validation data are passed here, so all `epochs` run
# unconditionally -- confirm this is intended.
batch_size = 26  # Adjust this value based on your memory constraints
epochs = 50  # You may want to increase this for better results

# `history` holds the object returned by fit() for later inspection.
history = model.fit(predictors, label, epochs=epochs, batch_size=batch_size, verbose=1)

print("Model training completed.")


Epoch 1/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 7ms/step - loss: 5.8652
Epoch 2/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - loss: 5.8255
Epoch 3/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - loss: 5.7979
Epoch 4/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 7ms/step - loss: 5.7761
Epoch 5/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - loss: 5.7465
Epoch 6/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 7ms/step - loss: 5.7334
Epoch 7/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - loss: 5.7022
Epoch 8/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - loss: 5.6885
Epoch 9/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 8ms/step - loss: 5.6650
Epoch 10/50
[1m8242/8242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [None]:
def generate_text(model, tokenizer, seed_text, next_words, max_sequence_len, temperature=1.0):
    """Extend ``seed_text`` by sampling words from ``model`` one at a time.

    On each step the text generated so far is tokenized, left-padded to
    ``max_sequence_len - 1`` and fed to ``model.predict``. The returned
    distribution is temperature-scaled and one index is sampled from it.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one
            probability vector per input row.
        tokenizer: Object providing ``texts_to_sequences`` and ``word_index``
            (``index_word`` is used when present).
        seed_text: Starting text; returned unchanged when ``next_words`` is 0.
        next_words: Number of sampling steps.
        max_sequence_len: Sequence length used in training, target included.
        temperature: Positive scaling factor; below 1 sharpens the
            distribution, above 1 flattens it.

    Returns:
        ``seed_text`` followed by the sampled words, separated by single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = getattr(tokenizer, "index_word", None)
    if not index_to_word:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated_text = seed_text

    for _ in range(next_words):
        # Tokenize everything generated so far and left-pad to the model's input length.
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # float64 keeps the renormalised vector within np.random.choice's
        # sum-to-one tolerance (float32 output can miss it).
        predicted = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        # Temperature scaling in log space.
        scaled = np.log(predicted + 1e-10) / temperature
        if scaled.size > 1:
            # Index 0 is the padding slot and maps to no word; sampling it
            # used to append an empty string (a stray space) to the text.
            scaled[0] = -np.inf
        scaled -= np.max(scaled)  # stabilise the exponentials
        probabilities = np.exp(scaled)
        probabilities /= probabilities.sum()

        predicted_word_index = int(np.random.choice(len(probabilities), p=probabilities))

        # Indices outside the vocabulary contribute nothing rather than a blank.
        output_word = index_to_word.get(predicted_word_index, "")
        if output_word:
            generated_text += " " + output_word

    return generated_text

# Seed phrase for the demo generations below. It was not assigned in any cell
# of this notebook (the calls only worked through leftover kernel state); the
# saved outputs all begin with this phrase.
seed_text = "This product is"

# Example usage with temperature
generated_text = generate_text(model, tokenizer, seed_text, next_words=20, max_sequence_len=max_sequence_len, temperature=0.8)
print(generated_text)

# Example usage with different temperatures. The original cell repeated this
# pair a second time, with temperature=1 labelled as "High"; the duplicate is
# dropped so each label matches its setting.
generated_text_low_temp = generate_text(model, tokenizer, seed_text, next_words=20, max_sequence_len=max_sequence_len, temperature=0.5)  # Less random
print("Low Temperature Output:", generated_text_low_temp)

generated_text_high_temp = generate_text(model, tokenizer, seed_text, next_words=20, max_sequence_len=max_sequence_len, temperature=1.5)  # More random
print("High Temperature Output:", generated_text_high_temp)




This product is fast read low check new thermometer br time find guy honest accuracy flexible tip little large light big use 100
Low Temperature Output: This product is work work great price good product good price great price product reorder product good price price great value try taste
High Temperature Output: This product is tough rate size fragile notice outside temp loss hormone drop state important 5 mind grow need take necessary exposure try
Low Temperature Output: This product is work work work month stop work week ago get week get 3 week use disappointed seller receive refund new product
High Temperature Output: This product is get dr month disappointed work describe friend start lemon time feel day need daily daily result kick need thank process


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

# Define the function to generate text with temperature scaling
def generate_text(model, tokenizer, seed_text, next_words, max_sequence_len, temperature=1.0):
    """Extend ``seed_text`` by sampling words from ``model`` one at a time.

    On each step the text generated so far is tokenized, left-padded to
    ``max_sequence_len - 1`` and fed to ``model.predict``. The returned
    distribution is temperature-scaled and one index is sampled from it.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one
            probability vector per input row.
        tokenizer: Object providing ``texts_to_sequences`` and ``word_index``
            (``index_word`` is used when present).
        seed_text: Starting text; returned unchanged when ``next_words`` is 0.
        next_words: Number of sampling steps.
        max_sequence_len: Sequence length used in training, target included.
        temperature: Positive scaling factor; below 1 sharpens the
            distribution, above 1 flattens it.

    Returns:
        ``seed_text`` followed by the sampled words, separated by single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = getattr(tokenizer, "index_word", None)
    if not index_to_word:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated_text = seed_text

    for _ in range(next_words):
        # Tokenize everything generated so far and left-pad to the model's input length.
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # float64 keeps the renormalised vector within np.random.choice's
        # sum-to-one tolerance (float32 output can miss it).
        predicted = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        # Temperature scaling in log space.
        scaled = np.log(predicted + 1e-10) / temperature
        if scaled.size > 1:
            # Index 0 is the padding slot and maps to no word; sampling it
            # used to append an empty string (a stray space) to the text.
            scaled[0] = -np.inf
        scaled -= np.max(scaled)  # stabilise the exponentials
        probabilities = np.exp(scaled)
        probabilities /= probabilities.sum()

        predicted_word_index = int(np.random.choice(len(probabilities), p=probabilities))

        # Indices outside the vocabulary contribute nothing rather than a blank.
        output_word = index_to_word.get(predicted_word_index, "")
        if output_word:
            generated_text += " " + output_word

    return generated_text

# Function to generate multiple synthetic reviews
def generate_multiple_reviews(model, tokenizer, num_reviews=500, sequence_len=None):
    """Generate ``num_reviews`` synthetic reviews from randomly chosen seed phrases.

    Each review starts from one of fifteen fixed seed phrases (five positive,
    five negative, five neutral), uses a temperature drawn uniformly from
    [0.5, 1.5) and asks ``generate_text`` for between 5 and 10 extra words.

    Args:
        model: Passed through to ``generate_text``.
        tokenizer: Passed through to ``generate_text``.
        num_reviews: Number of reviews to produce.
        sequence_len: Training sequence length handed to ``generate_text`` as
            ``max_sequence_len``. When omitted, the notebook-level
            ``max_sequence_len`` is used, which is what the function
            previously read implicitly.

    Returns:
        List of generated review strings, in generation order.
    """
    # Keep existing two-argument calls working while letting callers be explicit.
    if sequence_len is None:
        sequence_len = max_sequence_len

    # Define seed phrases for different sentiments
    seed_phrases = {
        "good": ["Great product", "I love this supplement", "Highly effective", "Fantastic results", "Worth every penny"],
        "bad": ["Did not work for me", "Waste of money", "Not worth it", "Terrible experience", "Would not recommend"],
        "neutral": ["It's okay", "Average supplement", "Nothing special", "Mediocre product", "Satisfactory"],
    }

    # Flatten the seed phrases into a list for random selection
    all_seeds = [phrase for sentiments in seed_phrases.values() for phrase in sentiments]

    reviews = []
    for i in range(num_reviews):
        # np.random.choice returns a NumPy string scalar; hand a plain str downstream.
        seed_text = str(np.random.choice(all_seeds))

        # Random temperature between 0.5 and 1.5
        temperature = np.random.uniform(0.5, 1.5)

        # Random number of generated words between 5 and 10 (upper bound exclusive)
        next_words = int(np.random.randint(5, 11))

        reviews.append(
            generate_text(
                model,
                tokenizer,
                seed_text,
                next_words=next_words,
                max_sequence_len=sequence_len,
                temperature=temperature,
            )
        )

        if (i + 1) % 50 == 0:  # Print status every 50 reviews
            print(f"Generated {i + 1} reviews so far...")

    return reviews

# Generate reviews (count comes from generate_multiple_reviews' default).
generated_reviews = generate_multiple_reviews(model, tokenizer)

# Save generated reviews to a single-column DataFrame
reviews_df = pd.DataFrame(generated_reviews, columns=["Review"])

# Save to CSV in the current working directory
reviews_df.to_csv("synthetic_amazon_reviews.csv", index=False)

# Report the real row count rather than a hard-coded number, so the message
# stays correct if the number of generated reviews changes.
print(f"{len(reviews_df)} synthetic reviews generated and saved to 'synthetic_amazon_reviews.csv'.")


Generated 50 reviews so far...
Generated 100 reviews so far...
Generated 150 reviews so far...
Generated 200 reviews so far...
Generated 250 reviews so far...
Generated 300 reviews so far...
Generated 350 reviews so far...
Generated 400 reviews so far...
Generated 450 reviews so far...
Generated 500 reviews so far...
500 synthetic reviews generated and saved to 'synthetic_amazon_reviews.csv'.


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

# Define the function to generate text with temperature scaling
def generate_text(model, tokenizer, seed_text, next_words, max_sequence_len, temperature=1.0):
    """Extend ``seed_text`` by sampling words from ``model`` one at a time.

    On each step the text generated so far is tokenized, left-padded to
    ``max_sequence_len - 1`` and fed to ``model.predict``. The returned
    distribution is temperature-scaled and one index is sampled from it.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one
            probability vector per input row.
        tokenizer: Object providing ``texts_to_sequences`` and ``word_index``
            (``index_word`` is used when present).
        seed_text: Starting text; returned unchanged when ``next_words`` is 0.
        next_words: Number of sampling steps.
        max_sequence_len: Sequence length used in training, target included.
        temperature: Positive scaling factor; below 1 sharpens the
            distribution, above 1 flattens it.

    Returns:
        ``seed_text`` followed by the sampled words, separated by single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = getattr(tokenizer, "index_word", None)
    if not index_to_word:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated_text = seed_text

    for _ in range(next_words):
        # Tokenize everything generated so far and left-pad to the model's input length.
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # float64 keeps the renormalised vector within np.random.choice's
        # sum-to-one tolerance (float32 output can miss it).
        predicted = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        # Temperature scaling in log space.
        scaled = np.log(predicted + 1e-10) / temperature
        if scaled.size > 1:
            # Index 0 is the padding slot and maps to no word; sampling it
            # used to append an empty string (a stray space) to the text.
            scaled[0] = -np.inf
        scaled -= np.max(scaled)  # stabilise the exponentials
        probabilities = np.exp(scaled)
        probabilities /= probabilities.sum()

        predicted_word_index = int(np.random.choice(len(probabilities), p=probabilities))

        # Indices outside the vocabulary contribute nothing rather than a blank.
        output_word = index_to_word.get(predicted_word_index, "")
        if output_word:
            generated_text += " " + output_word

    return generated_text

# Function to generate multiple synthetic reviews
def generate_multiple_reviews(model, tokenizer, num_reviews=200, sequence_len=None):  # Set to 200 reviews
    """Generate ``num_reviews`` synthetic reviews from randomly chosen seed phrases.

    Each review starts from one of fifteen fixed seed phrases (five positive,
    five negative, five neutral), uses a temperature drawn uniformly from
    [0.5, 1.5) and asks ``generate_text`` for between 5 and 10 extra words.

    Args:
        model: Passed through to ``generate_text``.
        tokenizer: Passed through to ``generate_text``.
        num_reviews: Number of reviews to produce.
        sequence_len: Training sequence length handed to ``generate_text`` as
            ``max_sequence_len``. When omitted, the notebook-level
            ``max_sequence_len`` is used, which is what the function
            previously read implicitly.

    Returns:
        List of generated review strings, in generation order.
    """
    # Keep existing two-argument calls working while letting callers be explicit.
    if sequence_len is None:
        sequence_len = max_sequence_len

    # Define seed phrases for different sentiments
    seed_phrases = {
        "good": ["Great product", "I love this supplement", "Highly effective", "Fantastic results", "Worth every penny"],
        "bad": ["Did not work for me", "Waste of money", "Not worth it", "Terrible experience", "Would not recommend"],
        "neutral": ["It's okay", "Average supplement", "Nothing special", "Mediocre product", "Satisfactory"],
    }

    # Flatten the seed phrases into a list for random selection
    all_seeds = [phrase for sentiments in seed_phrases.values() for phrase in sentiments]

    reviews = []
    for i in range(num_reviews):
        # np.random.choice returns a NumPy string scalar; hand a plain str downstream.
        seed_text = str(np.random.choice(all_seeds))

        # Random temperature between 0.5 and 1.5
        temperature = np.random.uniform(0.5, 1.5)

        # Random number of generated words between 5 and 10 (upper bound exclusive)
        next_words = int(np.random.randint(5, 11))

        reviews.append(
            generate_text(
                model,
                tokenizer,
                seed_text,
                next_words=next_words,
                max_sequence_len=sequence_len,
                temperature=temperature,
            )
        )

        if (i + 1) % 50 == 0:  # Print status every 50 reviews
            print(f"Generated {i + 1} reviews so far...")

    return reviews

# Generate reviews (count comes from generate_multiple_reviews' default).
generated_reviews = generate_multiple_reviews(model, tokenizer)

# Save generated reviews to a single-column DataFrame
reviews_df = pd.DataFrame(generated_reviews, columns=["Review"])

# Save to CSV file in Kaggle environment
output_file_path = "/kaggle/working/synthetic_amazon_reviews.csv"
reviews_df.to_csv(output_file_path, index=False)

# Report the real row count rather than a hard-coded number, so the message
# stays correct if the number of generated reviews changes.
print(f"{len(reviews_df)} synthetic reviews generated and saved to '{output_file_path}'.")


Generated 50 reviews so far...
Generated 100 reviews so far...
Generated 150 reviews so far...
Generated 200 reviews so far...
200 synthetic reviews generated and saved to '/kaggle/working/synthetic_amazon_reviews.csv'.


In [None]:
import os
# Sanity check: list the working directory to confirm the CSV was written.
print("Files in working directory:", os.listdir("/kaggle/working/"))

Files in working directory: ['synthetic_amazon_reviews.csv', '.virtual_documents']


In [None]:
import pandas as pd

# Load the generated CSV file
reviews_df = pd.read_csv('/kaggle/working/synthetic_amazon_reviews.csv')

# Print the single row at position 132 (not the first few rows).
print(reviews_df.iloc[132])


Review    Terrible experience take vitamin try good tast...
Name: 132, dtype: object


In [None]:
import pandas as pd

# Load the generated CSV file
# NOTE(review): the saved output of this cell is a FileNotFoundError for this
# path, so the file was presumably missing in that session -- rerun the
# generation cell that writes it first.
reviews_df = pd.read_csv('/kaggle/working/synthetic_amazon_reviews.csv')

# Print the single row at position 133 (not the first few rows).
print(reviews_df.iloc[133])

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/synthetic_amazon_reviews.csv'