In [1]:
# Prints a fixed greeting and nothing else.
# NOTE(review): looks like a leftover kernel smoke-test cell — nothing later in
# the visible notebook depends on it; consider removing before sharing.
print("hello")

hello


In [9]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.utils import set_random_seed

# --- 1. Example Data (Movie Reviews) ---
# Toy corpus for demonstration only; a real project would load thousands of
# labelled reviews. Each entry keeps the text next to its sentiment label
# (0 = negative, 1 = positive) so the two cannot drift out of alignment.
labelled_reviews = [
    ("this movie was amazing and fantastic", 1),
    ("I hated this movie it was boring", 0),
    ("loved it so good", 1),
    ("what a waste of time", 0),
]

# Split the pairs into the parallel containers the rest of the notebook uses.
reviews = [text for text, _ in labelled_reviews]
labels = np.array([sentiment for _, sentiment in labelled_reviews])

# --- 2. Train Word2Vec (The "Dictionary") ---

# Tokenize with the same rules the Keras Tokenizer applies by default
# (lower-casing, punctuation filtering, whitespace split). A bare str.split()
# keeps "I" capitalised while the Keras word index stores "i", so that word
# would never be found in the Word2Vec vocabulary and its embedding row would
# silently stay all-zero.
tokenized_reviews = [text_to_word_sequence(review) for review in reviews]

# Word2Vec hyper-parameters
embedding_dim = 50  # Each word will be a 50-dimensional vector
min_word_count = 1  # Keep every word, even those seen only once
window_size = 2  # Context: 2 words to the left and 2 to the right
word2vec_seed = 42  # Fixed seed so re-running the notebook gives the same vectors

# Train the Word2Vec model on our reviews.
# workers=1 removes thread-scheduling jitter; together with the seed this makes
# training repeatable within one interpreter session (gensim additionally
# documents PYTHONHASHSEED for repeatability across interpreter launches).
print("Training Word2Vec model...")
word2vec_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=embedding_dim,
    window=window_size,
    min_count=min_word_count,
    seed=word2vec_seed,
    workers=1,
)
print("Word2Vec model trained.")

# --- 3. Prepare Data for LSTM (Tokenizing & Padding) ---

# Every review is brought to exactly this many token ids.
max_length = 10

# The network consumes integer ids rather than raw strings, so fit a Keras
# Tokenizer on the corpus to obtain the word -> integer index.
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(reviews)
word_to_id = keras_tokenizer.word_index

# Word ids start at 1; id 0 is reserved for padding, hence the extra slot.
vocab_size = 1 + len(word_to_id)

# Encode each review as a list of ids, then right-pad with zeros.
sequences = keras_tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

print("\nKeras Tokenizer Word Index:", word_to_id)
print("\nPadded Integer Sequences:\n", padded_sequences)

# --- 4. Create the Embedding Matrix (Connecting Word2Vec to Keras) ---

# Row r of the matrix holds the Word2Vec vector of the word whose Keras
# tokenizer id is r. Rows start out as zeros, so row 0 (padding) and any word
# Word2Vec does not know (e.g. dropped by a min_count > 1) remain zero vectors.
word_vectors = word2vec_model.wv
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in keras_tokenizer.word_index.items():
    if token not in word_vectors:
        continue  # leave the zero row in place for unknown words
    embedding_matrix[row] = word_vectors[token]

print(f"\nEmbedding Matrix shape: {embedding_matrix.shape}")

# --- 5. Build the LSTM Model ---

# Seed Python, NumPy and the Keras backend so weight initialisation (and
# therefore the training curve below) is repeatable across runs.
set_random_seed(42)

print("Building Keras LSTM model...")
model = Sequential()

# Declare the input shape explicitly. This replaces the Embedding layer's
# `input_length` argument, which Keras 3 deprecates, and lets model.summary()
# report concrete output shapes before the model has seen any data.
model.add(Input(shape=(max_length,), dtype="int32"))

# Embedding layer: the frozen Word2Vec "lookup table".
model.add(Embedding(
    input_dim=vocab_size,  # Size of our vocabulary (including padding id 0)
    output_dim=embedding_dim,  # Dimension of our vectors (from Word2Vec)
    weights=[embedding_matrix],  # Pre-load the Word2Vec weights
    mask_zero=True,  # Treat id 0 as padding so the LSTM skips those timesteps
    trainable=False  # **Crucial: Freeze the embeddings!**
    # We don't want to re-train them.
))

# LSTM layer (the "Thinker"): reads the sequence of word vectors and
# summarises it in a 100-unit state. Thanks to the mask it stops at the last
# real word instead of also consuming the trailing padding vectors.
model.add(LSTM(units=100))

# Output layer: a single sigmoid unit for binary (0 or 1) classification.
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# --- 6. Train the LSTM Model ---

print("\nTraining LSTM model...")

# Fit on the whole toy corpus. On a real dataset you would also pass a
# validation_split so that over-fitting becomes visible.
training_options = dict(epochs=20, verbose=1)
model.fit(padded_sequences, labels, **training_options)

# --- 7. Make a Prediction ---

print("\nMaking a prediction...")
test_review = "this movie was great and good"

# Encode the unseen review exactly like the training data: ids first,
# then right-padding to max_length.
test_seq = keras_tokenizer.texts_to_sequences([test_review])
test_pad = pad_sequences(test_seq, padding='post', maxlen=max_length)

# The model returns one sigmoid score per input row; we passed a single row.
prediction = model.predict(test_pad)
positive_score = prediction[0][0]

# Scores above 0.5 are read as the positive class.
if positive_score > 0.5:
    predicted_label = 'Positive'
else:
    predicted_label = 'Negative'

print(f"Review: '{test_review}'")
print(f"Prediction (Raw): {positive_score}")
print(f"Predicted Label: {predicted_label}")


Training Word2Vec model...
Word2Vec model trained.

Keras Tokenizer Word Index: {'this': 1, 'movie': 2, 'was': 3, 'it': 4, 'amazing': 5, 'and': 6, 'fantastic': 7, 'i': 8, 'hated': 9, 'boring': 10, 'loved': 11, 'so': 12, 'good': 13, 'what': 14, 'a': 15, 'waste': 16, 'of': 17, 'time': 18}

Padded Integer Sequences:
 [[ 1  2  3  5  6  7  0  0  0  0]
 [ 8  9  1  2  4  3 10  0  0  0]
 [11  4 12 13  0  0  0  0  0  0]
 [14 15 16 17 18  0  0  0  0  0]]

Embedding Matrix shape: (19, 50)
Building Keras LSTM model...



Training LSTM model...
Epoch 1/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 970ms/step - accuracy: 0.5000 - loss: 0.6930
Epoch 2/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step - accuracy: 0.5000 - loss: 0.6926
Epoch 3/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step - accuracy: 0.7500 - loss: 0.6922
Epoch 4/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step - accuracy: 0.5000 - loss: 0.6918
Epoch 5/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step - accuracy: 0.7500 - loss: 0.6914
Epoch 6/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 39ms/step - accuracy: 1.0000 - loss: 0.6909
Epoch 7/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 38ms/step - accuracy: 1.0000 - loss: 0.6904
Epoch 8/20
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step - accuracy: 1.0000 - loss: 0.6898
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━