# Word Embeddings

In this notebook, we try to build our very own Word Embedding model from scratch.

In [None]:
# installation of necessary packages
!pip install datasets
!pip install spacy torch wordninja contractions

In [None]:
# Import all packages needed for this notebook and connect to Google Drive for easier access to saved datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import ast
from tqdm import tqdm
from google.colab import drive
from datasets import load_dataset
import random
import re
import spacy
import wordninja
import contractions
from collections import defaultdict, Counter
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score, v_measure_score, adjusted_rand_score, classification_report
from sklearn.datasets import load_files
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

# Mount Google Drive at /content/drive; the CSV paths used in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

We get our raw data from the BookCorpus dataset on Hugging Face. It contains sentences from 7185 unique books spanning a wide range of genres, which should help our model learn richer semantics. The dataset also includes the books' preambles, which might be helpful: those sections do not follow a particular theme or style of writing and are written more conventionally, giving our model a wider variety of data to train on. The cell below loads a copy of the dataset that was saved as a CSV file on Google Drive.

In [None]:
# Load the raw corpus CSV from Google Drive and preview it

raw_csv_path = '/content/drive/MyDrive/WE_book_corpus_data.csv'
df = pd.read_csv(raw_csv_path)

# Print the (rows, columns) shape first, then the first few rows
for preview in (df.shape, df.head()):
    print(preview)

Due to the size of the dataset and our computational constraints, we work with a random 1% sample of it.

In [None]:
# Draw a seeded 1% random sample of the rows and store it on Drive without the row index
sample_csv_path = '/content/drive/MyDrive/WE_book_corpus_final_dataset.csv'
df_data = df.sample(random_state = 42, frac = 0.01)
df_data.to_csv(sample_csv_path, index = False)

In [None]:
# Read the saved 1% sample back from Drive (presumably so the notebook can resume here without re-sampling).
df_data = pd.read_csv('/content/drive/MyDrive/WE_book_corpus_final_dataset.csv')

In [None]:
# Text preprocessing: load the spaCy 'en_core_web_sm' pipeline, which preprocess_text uses for lemmatization

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Normalise one raw sentence into lowercase, lemmatized, letters-only text.

    Steps, in order:
      1. Re-attach contraction suffixes that are separated from their host
         word by whitespace ("is n't" -> "isn't", "it 's" -> "it's").
      2. Expand contractions with ``contractions.fix``.
      3. Lowercase the text and split run-together words with ``wordninja``.
      4. Lemmatize with the module-level spaCy pipeline ``nlp``.
      5. Remove every character that is not an ASCII letter or whitespace,
         lowercase again and collapse whitespace runs.

    Args:
        text: The raw sentence. Non-string values (e.g. ``None`` or the NaN
            pandas uses for a missing cell) are treated as empty input.

    Returns:
        str: The cleaned sentence with tokens separated by single spaces and
        no leading/trailing whitespace; ``""`` when the input is not a string.
    """
    # A missing cell would otherwise raise TypeError inside re.sub and abort
    # the whole progress_apply run.
    if not isinstance(text, str):
        return ''

    # Applied one suffix at a time, in this order, so that stacked suffixes
    # such as "I 'd 've" are each re-attached.
    for suffix in ("n't", "'ll", "'ve", "'re", "'m", "'d", "'s"):
        text = re.sub(rf"\b(\w+)\s+{suffix}\b", rf"\1{suffix}", text)

    text = contractions.fix(text)
    text = text.lower()  # Convert to lowercase
    text = ' '.join(wordninja.split(text))  # Split concatenated words

    lemmas = [token.lemma_ for token in nlp(text)]  # Lemmatize
    letters_only = re.sub(r'[^a-zA-Z\s]', '', ' '.join(lemmas))

    # Lowercase once more: the lemmatizer may hand back capitalised lemmas
    # (the pronoun "I" is a typical case - verify against the installed model).
    # split()/join collapses the gaps left where digits or punctuation were
    # removed, so an all-non-letter sentence becomes "" rather than "  ".
    return ' '.join(letters_only.lower().split())

# Register tqdm's progress_apply on pandas objects so the apply below shows a progress bar
tqdm.pandas(desc="Preprocessing filtered and limited text")
# Run preprocess_text on every entry of the 'text' column and store the result in 'processed_text'
df_data['processed_text'] = df_data['text'].progress_apply(preprocess_text)

# Preview the first rows, including the processed column
df_data.head()

In [None]:
# Save the preprocessed frame to Drive (row index omitted)
df_data.to_csv('/content/drive/MyDrive/WE_book_corpus_final_dataset_processed.csv', index = False)

Now we further sample the processed data to obtain subsets containing 50% and 25% of it. The aim is to compare how the models perform when trained on different amounts of data, so we end up with three dataset sizes: 100%, 50% and 25% of the processed sample.

In [None]:
# Reload the preprocessed frame from Drive
processed_csv = '/content/drive/MyDrive/WE_book_corpus_final_dataset_processed.csv'
df_data = pd.read_csv(processed_csv)

# Keep only the rows whose processed text is an actual string
is_text = df_data['processed_text'].apply(lambda value: isinstance(value, str))
df_data = df_data[is_text]

# Draw the 50% and 25% subsets, both with the same seed (random_state = 42)
df_half = df_data.sample(random_state = 42, frac = 0.5)
df_quarter = df_data.sample(random_state = 42, frac = 0.25)

# Write all three sizes to Drive; the filtered full frame overwrites the file it was read from
df_data.to_csv(processed_csv, index = False)
df_half.to_csv('/content/drive/MyDrive/WE_book_corpus_final_dataset_processed_half.csv', index = False)
df_quarter.to_csv('/content/drive/MyDrive/WE_book_corpus_final_dataset_processed_quarter.csv', index = False)