In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus: four short sentences used to demonstrate embedding training
texts = [
    "I love machine learning",
    "Deep learning is fascinating",
    "Natural language processing is a part of AI",
    "Embeddings capture semantic information",
]

# Fit a word-level vocabulary on the corpus, then map every sentence to a
# list of integer word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Post-pad every id list up to the length of the longest sentence so the
# batch has a uniform width
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    maxlen=max_length,
)


In [None]:
# Vocabulary size seen by the embedding table. word_index ids come from the
# fitted tokenizer; one extra row is added for the padding id.
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# Embedding -> Flatten -> single sigmoid unit (binary classifier).
# The input shape is declared with an explicit Input instead of the deprecated
# `input_length=` argument of Embedding, so the model is built up front and
# model.summary() can report concrete output shapes and parameter counts.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=50),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
# Dummy binary labels for demonstration, one per sentence in `texts`.
# Stored as a NumPy array rather than a plain Python list: Keras expects
# array-like targets, and a list of ints alongside a NumPy input is rejected
# by the tf.keras data adapters.
labels = np.array([1, 1, 0, 0], dtype="float32")

# Train the model
model.fit(padded_sequences, labels, epochs=10)


In [None]:
# The embedding layer is the first layer of the model; its first weight
# array is the embedding matrix
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]

# Persist the matrix so it can be reused outside this notebook
import numpy as np

np.save('embeddings.npy', embeddings)

# Example lookup: the tokenizer's word -> id mapping selects the row for a word
word_index = tokenizer.word_index
machine_id = word_index['machine']
word_embedding = embeddings[machine_id]
print(word_embedding)


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Toy corpus: four short sentences, one training example each
texts = [
    "I love machine learning",
    "Deep learning is fascinating",
    "Natural language processing is a part of AI",
    "Embeddings capture semantic information",
]

# Dummy binary targets for demonstration, one per sentence
labels = torch.tensor([1, 1, 0, 0], dtype=torch.float32)

# Turn each sentence into a row of per-token counts over the fitted vocabulary
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow_counts = vectorizer.fit_transform(texts)
vocab_size = len(vectorizer.vocabulary_)

# Densify the count matrix and hand it to PyTorch as float32
X = torch.tensor(bow_counts.toarray(), dtype=torch.float32)


In [None]:
class EmbeddingModel(nn.Module):
    """Bag-of-words binary classifier on top of a learnable embedding table.

    Each input row is interpreted as a vector of per-token counts over the
    vocabulary. The row is reduced to the count-weighted mean of the token
    embeddings, passed through one linear unit and squashed with a sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output layer.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        # No sparse=True: sparse gradients are rejected by torch.optim.Adam.
        # mode='sum' is required by EmbeddingBag when per_sample_weights is used;
        # the mean is recovered in forward() by dividing by the total count.
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim, mode='sum')
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        """Score a batch of bag-of-words count vectors.

        Args:
            x: Tensor of shape (batch, vocab_size) holding token counts.
                Integer and floating dtypes are both accepted.

        Returns:
            Tensor of shape (batch, 1) with sigmoid outputs.
        """
        counts = x.to(self.embedding.weight.dtype)
        # Column j of the count matrix refers to embedding row j, so every bag
        # lists all token ids and lets the counts act as per-token weights
        # (a zero count contributes nothing).
        token_ids = torch.arange(counts.size(1), device=counts.device).expand_as(counts)
        summed = self.embedding(token_ids, per_sample_weights=counts)
        # Normalise to a mean; clamp guards rows with no tokens at all.
        totals = counts.sum(dim=1, keepdim=True).clamp(min=1.0)
        pooled = summed / totals
        return torch.sigmoid(self.fc(pooled))

# Width of each learned word vector
embedding_dim = 50

# Binary cross-entropy on the model's sigmoid outputs
criterion = nn.BCELoss()

# Instantiate the classifier and an Adam optimizer over all of its parameters
model = EmbeddingModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)


In [None]:
# Create DataLoader: pairs each feature row with its label and serves
# shuffled mini-batches of two examples
dataset = torch.utils.data.TensorDataset(X, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Sum of per-sample losses over the epoch, so the value printed below is
    # the epoch mean rather than whatever the last shuffled batch produced.
    epoch_loss = 0.0
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X.long())
        # Labels are (batch,), model outputs are (batch, 1): add the trailing dim
        loss = criterion(outputs, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        # criterion averages within the batch; weight by batch size so a
        # smaller final batch does not skew the epoch mean
        epoch_loss += loss.item() * batch_X.size(0)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(dataset)}')


In [None]:
# Pull the embedding table out of the trained model as a NumPy array
embedding_weights = model.embedding.weight.data
embeddings = embedding_weights.numpy()

# Persist the matrix so it can be reused outside this notebook.
# NOTE: this is the same path the Keras section saves to, so running both
# sections leaves only this export on disk.
import numpy as np

np.save('embeddings.npy', embeddings)

# Example lookup: the vectorizer's token -> column mapping selects the row
word_index = vectorizer.vocabulary_
machine_id = word_index['machine']
word_embedding = embeddings[machine_id]
print(word_embedding)
