In [1]:
# Install gensim, the package the import cell pulls Word2Vec from.
!pip install gensim



In [2]:
# Upgrade numpy and gensim in the current environment.
!pip install --upgrade numpy gensim

Collecting numpy
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)


In [3]:
!pip uninstall numpy gensim
!pip install numpy gensim

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Would remove:
    /usr/local/bin/f2py
    /usr/local/lib/python3.11/dist-packages/numpy-1.26.4.dist-info/*
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libgfortran-040039e1.so.5.0.0
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libquadmath-96973f99.so.0.0.0
    /usr/local/lib/python3.11/dist-packages/numpy/*
Proceed (Y/n)? y
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/gensim-4.3.3.dist-info/*
    /usr/local/lib/python3.11/dist-packages/gensim/*
Proceed (Y/n)? y
  Successfully uninstalled gensim-4.3.3
Collecting numpy
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manyli

In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import multiprocessing

In [7]:
# Load the dataset from a CSV in the working directory; later cells read its "text" column.
df = pd.read_csv('./bbc_encoded.csv')

In [8]:
def clean_text(text):
  """Delete punctuation and symbols from ``text``.

  Two passes are applied:

  1. Every run of characters outside the whitelist (space, newline, ASCII
     letters and digits, the accented Latin-1 ranges À-Ö / Ø-ö / ø-ÿ, and
     ``/``) is deleted.
  2. Backslash, ``/``, ``×``, ``^``, ``[``, ``]`` and ``÷`` are deleted.

  Characters are removed, not replaced by a space, so the text on either
  side of a removed character is joined together (``"and/or"`` -> ``"andor"``).

  Args:
    text: The string to clean.

  Returns:
    The cleaned string.
  """
  whitelisted = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/]+', '', text)
  return re.sub(r'[\\/×\^\]\[÷]', '', whitelisted)

In [9]:
def lower_case(text):
  """Return ``text`` converted to lowercase."""
  return text.lower()

In [10]:
# Download the NLTK "stopwords" corpus that stopwords.words("english") reads.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
stopwords_list = stopwords.words("english")
# Frozen-set snapshot of the list: membership tests in remover() become hash
# lookups instead of a linear scan of the whole stopword list for every token.
_stopwords_set = frozenset(stopwords_list)

def remover(text):
  """Remove English stopwords from a space-separated string.

  The text is split on single spaces and each piece is compared exactly
  (case-sensitively) against the stopword list. The surviving pieces are
  joined back with single spaces. Empty pieces produced by consecutive
  spaces are not stopwords, so the original spacing between kept words
  is preserved.

  Args:
    text: The string to filter.

  Returns:
    The string with all stopword tokens dropped.
  """
  kept_words = (word for word in text.split(" ") if word not in _stopwords_set)
  return ' '.join(kept_words)

In [12]:
# Normalise the "text" column in one chain: cast to str, lowercase,
# strip punctuation/symbols, then drop stopwords.
df["text"] = (
    df["text"]
    .astype(str)
    .apply(lower_case)
    .apply(clean_text)
    .apply(remover)
)

In [13]:
def get_w2vdf(df):
    """Tokenise the ``"text"`` column into one list of words per row.

    Splitting is done on any run of whitespace, so consecutive spaces do not
    produce empty-string tokens and words separated only by a newline are
    split apart. A row whose text is empty or all whitespace yields ``[]``.

    Args:
        df: DataFrame with a ``"text"`` column of strings.

    Returns:
        A list with one entry per row, each a list of token strings.
    """
    return [text.split() for text in df["text"]]

In [14]:
def train_w2v(w2v_df, vector_size=300, epochs=100, workers=None):
    """Build a vocabulary from a tokenised corpus and train a Word2Vec model on it.

    Args:
        w2v_df: The corpus as a list of sentences, each a list of token strings.
        vector_size: Dimensionality of the word vectors (default 300).
        epochs: Number of training passes over the corpus (default 100).
        workers: Number of training worker threads. When ``None`` (default),
            all CPU cores but one are used, with a minimum of one.

    Returns:
        The trained ``Word2Vec`` model.
    """
    if workers is None:
        # Leave one core free, but never go below one worker: on a single-core
        # machine cpu_count() - 1 is 0, which would request zero worker threads.
        workers = max(1, multiprocessing.cpu_count() - 1)

    w2v_model = Word2Vec(min_count=4,
                         window=4,
                         vector_size=vector_size,  # size is deprecated in favor of vector_size
                         alpha=0.03,
                         min_alpha=0.0007,
                         sg=1,  # Skip-gram model
                         workers=workers)

    w2v_model.build_vocab(w2v_df, progress_per=10000)
    w2v_model.train(w2v_df, total_examples=w2v_model.corpus_count, epochs=epochs, report_delay=1)
    return w2v_model

In [15]:
# Tokenise the cleaned text column, then train Word2Vec on the resulting token lists.
w2v_df = get_w2vdf(df)
w2v_model = train_w2v(w2v_df)

In [16]:
def sentence_to_vec(sentence_tokens, model, dim):
    """Average the word vectors of the tokens the model knows.

    Args:
        sentence_tokens: Iterable of token strings.
        model: Object exposing ``model.wv``, which supports ``token in model.wv``
            and ``model.wv[token]`` lookups.
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the known tokens' vectors, or a float32
        zero vector of length ``dim`` when none of the tokens is in ``model.wv``.
    """
    word_vectors = model.wv
    vectors = [word_vectors[word] for word in sentence_tokens if word in word_vectors]
    if not vectors:
        # float32 rather than numpy's float64 default: gensim stores its word
        # vectors as float32, so one empty sentence no longer upcasts the whole
        # stacked sentence matrix to float64.
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [17]:
# Take the dimensionality from the trained model instead of repeating the
# literal 300, so the zero-vector fallback always matches the real vector length.
embedding_dim = w2v_model.wv.vector_size
sentence_vectors = np.array([sentence_to_vec(sentence, w2v_model, embedding_dim) for sentence in w2v_df])

In [18]:
# Persist the sentence embeddings to disk and print the array shape as a sanity check.
np.save("sentence_vectors.npy", sentence_vectors)
print("Saved sentence vectors to 'sentence_vectors.npy' with shape:", sentence_vectors.shape)

Saved sentence vectors to 'sentence_vectors.npy' with shape: (2127, 300)
