In [1]:
!pip install gensim datasets transformers tensorflow scikit-learn matplotlib pandas




In [11]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertModel
import tensorflow as tf



In [29]:
from datasets import load_dataset

# CNN/DailyMail dataset - news articles
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
texts = [x['article'] for x in dataset if x['article']]
sentences = [t.lower().split() for t in texts]

README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [30]:
len(sentences)

2871

**Experiment 1: Creating Word Embedding using Word2Vec**

In [31]:
model_w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
words_w2v = list(model_w2v.wv.index_to_key)
embeddings_w2v = [model_w2v.wv[w].tolist() for w in words_w2v]
df_w2v = pd.DataFrame({'word': words_w2v, 'embedding': embeddings_w2v})

df_w2v.to_csv('word2vec_embeddings.csv', index=False)

In [32]:
word2vec_df = pd.read_csv('word2vec_embeddings.csv')
word2vec_df.shape
word2vec_df.head()

Unnamed: 0,word,embedding
0,the,"[0.7202968001365662, 0.8493690490722656, 0.903..."
1,to,"[0.29217055439949036, -1.58308744430542, 1.175..."
2,of,"[-1.050939679145813, 0.21787719428539276, -1.0..."
3,a,"[-1.1182552576065063, 1.6475926637649536, 0.54..."
4,and,"[-0.7194643616676331, 0.027531515806913376, -0..."


**Experiment 2: Creating Word Embeddings with Embedding Layer in Keras**

In [35]:
# ✅ Imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from datasets import load_dataset

# Load CNN/DailyMail dataset - news articles

dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")  # small sample for speed

# Extract articles (you can use 'article' or 'highlights' field)
texts = [x['article'] for x in dataset if x['article'] is not None]

# Filter out empty strings
texts = [text for text in texts if text.strip()]

print(f"Loaded {len(texts)} news articles")

# Convert list of strings to list of lists (sentences)
sentences_as_lists = [text.split() for text in texts]

# Convert list of lists to list of strings (sentences)
sentences_as_strings = [" ".join(sentence) for sentence in sentences_as_lists]

print(f"\nType of sentences_as_strings: {type(sentences_as_strings)}")
if sentences_as_strings:
    print(f"Type of first element in sentences_as_strings: {type(sentences_as_strings[0])}")
    print(f"Sample article preview: {sentences_as_strings[0][:200]}...")

# Define vocabulary size and embedding parameters
voc_size = 10000     # number of unique words to encode
sent_length = 30     # fixed input length per sample
embedding_dim = 100  # embedding vector size

# Convert sentences to one-hot encoded integers
print("\nEncoding sentences...")
onehot_repr = [one_hot(sentence, voc_size) for sentence in sentences_as_strings]

# Pad sequences
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

print("\n  Example of padded sequences:")
print(embedded_docs[:3])

# Build a simple Embedding model
print("\nBuilding model...")
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_dim, input_length=sent_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Compile and train briefly (with dummy labels)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
labels = np.random.randint(0, 2, size=(len(embedded_docs),))  # dummy binary labels

print("\nTraining model...")
model.fit(embedded_docs, labels, epochs=1, batch_size=32, verbose=1)

# Extract learned embedding matrix
embedding_matrix = model.layers[0].get_weights()[0]  # shape = (voc_size, embedding_dim)
print("\n Embedding matrix shape:", embedding_matrix.shape)

# Save embeddings to CSV
# Since `one_hot` doesn't preserve actual words, we'll create placeholder word labels
words = [f"word_{i}" for i in range(voc_size)]
df_embed = pd.DataFrame({'word': words, 'embedding': embedding_matrix.tolist()})
df_embed.to_csv('embedding_layer_cnn_dailymail.csv', index=False)

print(f"\n Embeddings saved to 'embedding_layer_cnn_dailymail.csv'")

Loaded 2871 news articles

Type of sentences_as_strings: <class 'list'>
Type of first element in sentences_as_strings: <class 'str'>
Sample article preview: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on ...

Encoding sentences...

  Example of padded sequences:
[[2581  542 4021 4550 6397 5452 9159 8982 2654 2378 7577  670 7609 8042
  9771 8099 2654 3915 9366 7011 4512 3405 3411  894 2581 2236 5838 1788
  5769 5295]
 [3069  239 8698 3080 4295 3069 4321 1508 8900 9809  735  894 7797 6175
  5886 6503 4241 4295 4241 4295  648 6796 9174 4335 6752 2378 7577  670
  7609 8042]
 [3069 8514 4074 4295 8839 4714  556 3021 2467 8385 5300 3641 9761 1522
  3069 6843  447 4295 3352 7687 4069 5680 8046 9159 3352 2378 7577  670
  7609 8042]]

Building model...

Training model...
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step 