In [9]:
!pip install gensim datasets transformers tensorflow scikit-learn matplotlib pandas


Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

## **I have used CNN News Dataset, becuase I was not able to import NewsQA dataset due to some issue.**

In [3]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertModel
import tensorflow as tf



In [4]:
from datasets import load_dataset

# CNN/DailyMail dataset - news articles
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
texts = [x['article'] for x in dataset if x['article']]
sentences = [t.lower().split() for t in texts]

In [5]:
len(sentences)

2871

**Experiment 1: Creating Word Embedding using Word2Vec**

In [6]:
model_w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
words_w2v = list(model_w2v.wv.index_to_key)
embeddings_w2v = [model_w2v.wv[w].tolist() for w in words_w2v]
df_w2v = pd.DataFrame({'word': words_w2v, 'embedding': embeddings_w2v})

df_w2v.to_csv('word2vec_embeddings.csv', index=False)

In [7]:
word2vec_df = pd.read_csv('word2vec_embeddings.csv')
word2vec_df.shape
word2vec_df.head()

Unnamed: 0,word,embedding
0,the,"[-0.5605956315994263, 0.023070737719535828, 0...."
1,to,"[-0.8302412629127502, -2.2545676231384277, 1.8..."
2,of,"[-0.31927967071533203, -0.09103371202945709, -..."
3,a,"[-2.663062572479248, 2.3355190753936768, 0.419..."
4,and,"[-0.8691964745521545, 0.5132676959037781, -0.9..."


**Experiment 2: Creating Word Embeddings with Embedding Layer in Keras**

In [8]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from datasets import load_dataset

# Load CNN/DailyMail dataset - news articles

dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

texts = [x['article'] for x in dataset if x['article'] is not None]

texts = [text for text in texts if text.strip()]

sentences_as_lists = [text.split() for text in texts]

sentences_as_strings = [" ".join(sentence) for sentence in sentences_as_lists]

print(f"\nType of sentences_as_strings: {type(sentences_as_strings)}")
if sentences_as_strings:
    print(f"Type of first element in sentences_as_strings: {type(sentences_as_strings[0])}")
    print(f"Sample article preview: {sentences_as_strings[0][:200]}...")

voc_size = 10000
sent_length = 30
embedding_dim = 100

onehot_repr = [one_hot(sentence, voc_size) for sentence in sentences_as_strings]

# Pad sequences
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

# Build a simple Embedding model

model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_dim, input_length=sent_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
labels = np.random.randint(0, 2, size=(len(embedded_docs),))  # dummy binary labels


model.fit(embedded_docs, labels, epochs=1, batch_size=32, verbose=1)

embedding_matrix = model.layers[0].get_weights()[0]  # shape = (voc_size, embedding_dim)

words = [f"word_{i}" for i in range(voc_size)]
df_embed = pd.DataFrame({'word': words, 'embedding': embedding_matrix.tolist()})
df_embed.to_csv('embedding_layer_cnn_dailymail.csv', index=False)



Type of sentences_as_strings: <class 'list'>
Type of first element in sentences_as_strings: <class 'str'>
Sample article preview: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on ...




[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4944 - loss: 0.6937


**Experiment 3**: Creating Embeddings using BERT

In [9]:
pip install transformers torch pandas




In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


In [11]:
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
import torch
import pandas as pd

# Load CNN/DailyMail dataset (for example)
dataset = load_dataset("cnn_dailymail", "3.0.0")

# For demo, take first few articles
articles = dataset['train']['article'][:20]

# Flatten into list of “text units” — you can treat each article or paragraph as one “sentence”
texts = articles  # here using whole articles for simplicity

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

word_embeddings = {}

for text in texts:
    # Tokenize (add special tokens [CLS], [SEP]) — like GfG suggests
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # last_hidden_state: shape [1, seq_len, hidden_dim]
    last_hidden = outputs.last_hidden_state.squeeze(0)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0))

    for token, emb in zip(tokens, last_hidden):
        word_embeddings[token] = emb.tolist()

# Save to CSV
df = pd.DataFrame(list(word_embeddings.items()), columns=['token', 'embedding'])
df.to_csv('bert_embeddings_gfg_style.csv', index=False)



Saved embeddings to bert_embeddings_gfg_style.csv


In [12]:
df_bert = pd.read_csv('bert_embeddings_gfg_style.csv')
df_bert.shape
df_bert.head()

Unnamed: 0,token,embedding
0,[CLS],"[-0.6213639378547668, -0.5218852758407593, 0.0..."
1,london,"[-0.6522814035415649, 0.12825751304626465, -0...."
2,",","[-0.38657742738723755, 0.025143858045339584, 1..."
3,england,"[-1.1262454986572266, -0.4578366279602051, 0.7..."
4,(,"[-1.205855369567871, 0.07306090742349625, 0.81..."
