## One-Hot Vectors

One-hot encoding is a simple and popular method to represent words as vectors.
Each word in the vocabulary is represented by a vector where one element is `1` and the rest are `0`.

In [None]:
import numpy as np

vocabulary = ['cat', 'dog', 'mouse']

def one_hot(word, vocab):
    """Return the one-hot encoding of ``word`` with respect to ``vocab``.

    The result is a NumPy array with one slot per vocabulary entry: the slot
    at the word's position in ``vocab`` holds 1 and all other slots hold 0.
    ``vocab.index`` raises ``ValueError`` if the word is not in ``vocab``.
    """
    encoding = np.zeros(len(vocab))
    position = vocab.index(word)
    encoding[position] = 1
    return encoding

# Encode one sample word against the vocabulary and print the resulting vector.
word = 'cat'
vector = one_hot(word, vocabulary)
print(f"One-hot vector for '{word}': {vector}")

## Bag of Words (BoW)

The Bag of Words model represents text as the multiset of its words, disregarding syntax and word order but keeping multiplicity.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

## TF-IDF

Term Frequency-Inverse Document Frequency (TF-IDF) is a statistical measure used to evaluate the importance of a word to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus. TF-IDF is widely used in information retrieval and text mining as a weighting factor in searches, document similarity measures, and model features for machine learning algorithms.

**Components of TF-IDF:**
- **Term Frequency (TF):** This measures how frequently a term occurs in a document. Since documents differ in length, a term is likely to appear many more times in long documents than in short ones. Thus, the term frequency is often divided by the document length (i.e., the total number of terms in the document) as a way of normalization.
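- **Inverse Document Frequency (IDF):** This measures how important a term is. While computing TF, all terms are considered equally important. However, certain terms, such as "is", "of", and "that", may appear many times but carry little importance. IDF therefore weighs down terms that occur in many documents and scales up rare ones; a common form is $\mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}$, where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents that contain the term $t$.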

**Calculation of TF-IDF:**
The TF-IDF value of a term in a document is calculated by multiplying the TF and IDF scores of the term: $\text{tf-idf}(t, d) = \mathrm{tf}(t, d) \times \mathrm{idf}(t)$.

## Implementation from scratch

In [None]:
# Two short example sentences used as the Bag-of-Words corpus.
corpus = ['There is a nodule in the left lung.', 'There is a nodule in right breast.']

# CountVectorizer is used with its default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and build the document-term count matrix.
X = vectorizer.fit_transform(corpus)

# Print the learned feature names, then the counts as a dense array.
print(vectorizer.get_feature_names_out())
print(X.toarray())

In [None]:
import pandas as pd
import math
from collections import Counter

corpus = [
    'Patient showed no adverse reactions to the new medication regimen.',
    'The new clinical trial for cancer treatment shows promising results.',
    'Adverse effects are minimal with the current treatment protocol.',
    'Clinical reviews for the latest medication have been overwhelmingly positive.',
    'The patient reports adverse symptoms following the medication change.'
]

# Small hand-picked stop-word list used by the from-scratch TF-IDF example.
stop_words = {"the", "and", "is", "in", "to", "of"}

def preprocess_document(doc):
    """Tokenise ``doc`` into lower-cased words with stop words removed.

    The document is split on whitespace. Leading and trailing punctuation is
    stripped from every token so that, for example, "regimen." and "regimen"
    map to the same term and a stop word followed by punctuation is still
    filtered out. Tokens that consist only of punctuation are dropped.

    Args:
        doc: The raw document string.

    Returns:
        A list of tokens in their original order.
    """
    import string  # local import keeps this notebook cell self-contained

    tokens = (word.strip(string.punctuation).lower() for word in doc.split())
    return [token for token in tokens if token and token not in stop_words]

# Tokenise every document once; both the TF and the IDF steps below read this list.
preprocessed_corpus = [preprocess_document(doc) for doc in corpus]

def compute_tf(document):
    """Return a dict mapping each term to its relative frequency in ``document``.

    The relative frequency is the term's count divided by the total number of
    tokens in the document. An empty document yields an empty dict.
    """
    total_terms = len(document)
    frequencies = {}
    for term, occurrences in Counter(document).items():
        frequencies[term] = occurrences / total_terms
    return frequencies

def compute_idf(doc_list):
    """Return a dict mapping each term to its smoothed inverse document frequency.

    For a corpus of ``N`` tokenised documents and a term contained in ``df``
    of them, the value is ``log((N + 1) / (df + 1)) + 1``. A term is counted
    at most once per document, however often it occurs there.
    """
    total_docs = len(doc_list)
    # set(doc) collapses repeats so each document contributes one count per term.
    document_frequency = Counter(term for doc in doc_list for term in set(doc))
    return {
        term: math.log((total_docs + 1) / (df + 1)) + 1
        for term, df in document_frequency.items()
    }

# Term frequencies are computed per document; IDF is computed once over the whole corpus.
tf_per_doc = [compute_tf(doc) for doc in preprocessed_corpus]
idfs = compute_idf(preprocessed_corpus)

def compute_tfidf_and_normalize(tf, idfs):
    """Return the L2-normalised TF-IDF weights for one document.

    Args:
        tf: Dict of term -> term frequency for the document.
        idfs: Dict of term -> inverse document frequency; it must contain
            every term present in ``tf``.

    Returns:
        Dict of term -> ``tf * idf`` divided by the Euclidean norm of all of
        the document's ``tf * idf`` values.
    """
    weighted = {}
    for term, frequency in tf.items():
        weighted[term] = frequency * idfs[term]
    l2_norm = math.sqrt(sum(weight ** 2 for weight in weighted.values()))
    return {term: weight / l2_norm for term, weight in weighted.items()}

# One normalised TF-IDF dict per document.
tfidf_normalized = [compute_tfidf_and_normalize(doc, idfs) for doc in tf_per_doc]

# Build a frame from the per-document dicts; entries for terms a document
# does not contain are filled with 0. Columns are then sorted by term.
df_tfidf = pd.DataFrame(tfidf_normalized).fillna(0)
df_tfidf_sorted = df_tfidf.sort_index(axis=1)

print("TF-IDF Vectors stored in DataFrame, sorted alphabetically by words:")
df_tfidf_sorted.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'Patient showed no adverse reactions to the new medication regimen.',
    'The new clinical trial for cancer treatment shows promising results.',
    'Adverse effects are minimal with the current treatment protocol.',
    'Clinical reviews for the latest medication have been overwhelmingly positive.',
    'The patient reports adverse symptoms following the medication change.'
]

# Uses scikit-learn's built-in English stop-word list, whereas the from-scratch
# cell defines its own six-word list, so the two outputs are not expected to
# match exactly.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()

# Dense copy of the TF-IDF matrix with one labelled column per feature name.
import pandas as pd
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
df_tfidf.head()

## Co-occurrence Matrix

In [None]:
from collections import Counter, defaultdict
import pandas as pd

corpus = [
    'Patient showed no adverse reactions to the new medication regimen.',
    'The new clinical trial for cancer treatment shows promising results.',
    'Adverse effects are minimal with the current treatment protocol.',
    'Clinical reviews for the latest medication have been overwhelmingly positive.',
    'The patient reports adverse symptoms following the medication change.'
]

def preprocess_document(doc):
    """Tokenise ``doc`` into lower-cased words for the co-occurrence example.

    The document is split on whitespace and leading/trailing punctuation is
    stripped from each token, so a sentence-final word such as "regimen." is
    counted as "regimen". Tokens made up only of punctuation are dropped.
    Stop words are kept.

    Args:
        doc: The raw document string.

    Returns:
        A list of tokens in their original order.
    """
    import string  # local import keeps this notebook cell self-contained

    stripped = (word.strip(string.punctuation) for word in doc.split())
    return [word.lower() for word in stripped if word]

# Tokenise the corpus once up front.
preprocessed_corpus = [preprocess_document(doc) for doc in corpus]

# co_occurrence[w][c] counts how often c appears within `window_size`
# positions of w, on either side, across all documents.
co_occurrence = defaultdict(Counter)
window_size = 1

for doc in preprocessed_corpus:
    doc_length = len(doc)
    for position, word in enumerate(doc):
        window_start = max(position - window_size, 0)
        window_stop = min(position + window_size + 1, doc_length)
        # Left neighbours first, then right neighbours; the centre word itself is skipped.
        for neighbour in doc[window_start:position] + doc[position + 1:window_stop]:
            co_occurrence[word][neighbour] += 1

# Word-by-word frame; pairs never seen together become 0 and both axes are sorted.
co_occurrence_df = (
    pd.DataFrame.from_dict(co_occurrence, orient='index')
    .fillna(0)
    .sort_index()
    .sort_index(axis=1)
)

print("Co-occurrence Matrix:")
co_occurrence_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the co-occurrence counts as a heatmap; per-cell annotations are off
# and a colour bar is shown.
plt.figure(figsize=(10, 8))
sns.heatmap(co_occurrence_df, annot=False, cmap='YlGnBu', cbar=True)

plt.title('Word Co-occurrence Heatmap')
plt.xlabel('Word')
plt.ylabel('Word')
plt.show()

## Word2Vec (Skip-gram)

In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

clinical_notes = [
    'Patient showed no adverse reactions to the new medication regimen.',
    'The new clinical trial for cancer treatment shows promising results.',
    'Adverse effects are minimal with the current treatment protocol.',
    'Clinical reviews for the latest medication have been overwhelmingly positive.',
    'The patient reports adverse symptoms following the medication change.'
]

# Tokenise each note with gensim's simple_preprocess; the bare expression
# on the last line displays the token lists as the cell output.
processed_clinical_notes = [simple_preprocess(doc) for doc in clinical_notes]
processed_clinical_notes

In [None]:
# Train a skip-gram model (sg=1) on the tokenised notes; min_count=1 keeps
# words that occur only once.
# NOTE(review): seed and workers are left at gensim's defaults, so the scores
# printed below may not be reproducible across runs — confirm.
model = Word2Vec(sentences=processed_clinical_notes, vector_size=100, window=5, sg=1, min_count=1)

# Five nearest neighbours of 'medication' as (word, similarity) pairs.
similar_words = model.wv.most_similar('medication', topn=5)
print("Words similar to 'medication':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

In [None]:
def find_similar_words(word, topn=5):
    """Print the ``topn`` words most similar to ``word`` in the current ``model``.

    Reads the module-level ``model``. That name is bound to a ``Word2Vec``
    model when this cell first appears, but a later cell rebinds it to the
    object returned by ``gensim.downloader.load``. The vectors are therefore
    taken from ``model.wv`` when that attribute exists and from ``model``
    itself otherwise.

    Args:
        word: Query word.
        topn: Number of neighbours to print (default 5).

    If ``most_similar`` raises ``KeyError`` for the word, a "not found"
    message is printed instead of propagating the exception.
    """
    keyed_vectors = getattr(model, 'wv', model)
    try:
        # Only the lookup is expected to raise KeyError, so the try body stays minimal.
        neighbours = keyed_vectors.most_similar(word, topn=topn)
    except KeyError:
        print(f"Word '{word}' not found in the vocabulary.")
        return

    print(f"Words similar to '{word}':")
    # Loop names are kept distinct from the `word` parameter.
    for neighbour, similarity in neighbours:
        print(f"{neighbour}: {similarity:.4f}")

## PCA Visualization of Word Vectors

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

word_vectors = model.wv

# Every word listed here must be in the model's vocabulary; the lookup below
# is not guarded against missing words.
words_of_interest = ['medication', 'treatment', 'clinical', 'patient', 'adverse', 'symptoms', 'trial', 'protocol']

vectors = [word_vectors[word] for word in words_of_interest]

# Reduce the word vectors to two principal components for plotting.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

plt.figure(figsize=(10, 8))
plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], color='red')

# Label each point with its word.
for word, (x, y) in zip(words_of_interest, vectors_2d):
    plt.text(x, y, word, ha='right', va='bottom')

plt.title('Word Vectors Visualized with PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()

## Pre-trained GloVe Embeddings

In [None]:
from gensim.models import KeyedVectors
from gensim.downloader import load

# NOTE: this rebinds `model`, which earlier cells bound to the Word2Vec model
# and read through `model.wv`. The cells below index `model` directly and use
# `model.key_to_index` — presumably a gensim KeyedVectors object; confirm.
model = load('glove-wiki-gigaword-100')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib

# Set the global matplotlib font size.
matplotlib.rcParams.update({'font.size': 22})

words = ['king', 'queen']

# One subplot row per word.
fig, axs = plt.subplots(len(words), 1, figsize=(10, len(words)), constrained_layout=True)
# With a single word, wrap the lone Axes in a list so the zip() below still works.
if len(words) == 1:
    axs = [axs]

# Negative x position, left of the first stripe, where the word label is drawn.
label_offset = -5

for ax, word in zip(axs, words):
    if word in model:
        vector = model[word]
        # Min-max scale the components into [0, 1] before applying the colormap.
        # NOTE(review): the denominator is zero if all components are equal.
        normalized_vector = (vector - np.min(vector)) / (np.max(vector) - np.min(vector))
        cmap = plt.get_cmap('viridis')
        colors = cmap(normalized_vector)
        # One unit-wide coloured stripe per embedding dimension.
        for i, color in enumerate(colors):
            ax.fill_between([i, i+1], 0, 1, color=color)
        ax.set_xlim(label_offset, len(vector))
        ax.axis('off')
        ax.text(label_offset, 0.5, word, verticalalignment='center', horizontalalignment='right')
    else:
        # Word missing from the model: show a red notice instead of stripes.
        ax.axis('off')
        ax.text(label_offset, 0.5, f'"{ word}" - Not in vocabulary', verticalalignment='center', horizontalalignment='right', color='red')

plt.show()

## Embedding Subtraction Visualization

In [None]:
import matplotlib

# Set the global matplotlib font size for this visualization.
matplotlib.rcParams.update({'font.size': 26})

word_pairs = [('king', 'queen'), ('man', 'woman')]

# Difference vector (first minus second) for each pair whose two words are both in the model.
# The same membership filter is repeated where `labels` is built; keep the two in sync.
subtracted_embeddings = [(model[w1] - model[w2]) for w1, w2 in word_pairs if w1 in model.key_to_index and w2 in model.key_to_index]

def visualize_embeddings(embeddings, labels):
    """Draw each embedding as a horizontal strip of coloured cells, one strip per row.

    Each embedding is min-max scaled on its own and mapped through the
    'viridis' colormap; its label is written to the left of the strip.
    ``embeddings`` and ``labels`` are paired positionally.
    """
    row_count = len(embeddings)
    plt.figure(figsize=(10, row_count))
    for row, (values, caption) in enumerate(zip(embeddings, labels)):
        low = np.min(values)
        high = np.max(values)
        scaled = (values - low) / (high - low)
        palette = plt.get_cmap('viridis')
        cell_colors = palette(scaled)

        plt.subplot(row_count, 1, row + 1)
        # One unit-wide cell per embedding dimension.
        for column, cell_color in enumerate(cell_colors):
            plt.fill_between([column, column + 1], row, row + 1, color=cell_color)
        plt.xlim(0, len(values))
        plt.text(-5, row + 0.25, caption, va='center', ha='right', fontsize=22)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Labels use the same membership filter as the embeddings so the two lists stay aligned.
labels = [f'{w1} - {w2}' for w1, w2 in word_pairs if w1 in model.key_to_index and w2 in model.key_to_index]

visualize_embeddings(subtracted_embeddings, labels)

## Word2Vec and GloVe Training with Gensim

In [None]:
!pip install gensim
!pip install nltk

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import os

# Tokenizer data and the stop-word list used by the preprocessing cell.
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource,
# so fetch it as well to keep word_tokenize working on both old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
import gensim.downloader as api

# Load the text8 corpus through gensim's downloader API, then materialise it
# as a list so it can be indexed and iterated more than once.
dataset = api.load('text8')
dataset = list(dataset)

In [None]:
# Index of the document to preview. The heading is built from it so the
# printed label always matches the document that is shown.
doc_index = 10
print(f"Document at index {doc_index} in the text8 dataset:")
print(' '.join(dataset[doc_index]))

In [None]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    """Lower-case ``text``, tokenise it with ``word_tokenize``, and return the
    tokens that are alphanumeric and not in ``stop_words``."""
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

## Train Word2Vec Model

In [None]:
from gensim.models import Word2Vec

# Each dataset entry is joined into one string so preprocess() can
# lower-case, tokenise and filter it.
# NOTE(review): the entries look already tokenised, so this re-tokenises them — confirm it is intended.
preprocessed_texts = [preprocess(' '.join(doc)) for doc in dataset]

# min_count=5 ignores words seen fewer than five times; training uses 4 workers.
word2vec_model = Word2Vec(preprocessed_texts, vector_size=100, window=5, min_count=5, workers=4)
word2vec_model.save("word2vec.model")

In [None]:
width = 45
print("Word2Vec Similarity Tests:")
# Four word pairs, printed in this order; only the label text varies per line.
for first, second in (('king', 'queen'), ('man', 'woman'), ('man', 'king'), ('woman', 'queen')):
    label = f"{first} and {second} similarity:"
    print(f"{label:<30} {word2vec_model.wv.similarity(first, second):>{width}.4f}")

## Word2Vec and GloVe Analogy & Similarity Tests

In [None]:
print("Word2Vec Analogy Tests:")
# Each triple (a, b, c) asks: "a is to b as c is to ?"
word2vec_analogies = [
    ("king", "man", "queen"),
    ("paris", "france", "berlin"),
    ("doctor", "hospital", "teacher"),
    ("nurse", "hospital", "student")
]

for source, counterpart, query in word2vec_analogies:
    # Query is positive=[c, b], negative=[a]; the top-ranked word is the answer.
    ranked = word2vec_model.wv.most_similar(positive=[query, counterpart], negative=[source])
    best_word = ranked[0][0]
    print(f"{source} is to {counterpart} as {query} is to {best_word}")

## Train FastText Model

In [None]:
from gensim.models import FastText

# Train FastText on the same preprocessed texts, with the same vector size,
# window, min_count and worker settings as the Word2Vec model.
fasttext_model = FastText(preprocessed_texts, vector_size=100, window=5, min_count=5, workers=4)
fasttext_model.save("fasttext.model")

## Compare Embeddings: Word2Vec, FastText, GloVe

In [None]:
width = 30

def print_similarity_tests(model, model_name, width=30):
    """Print the king/queen and man/woman similarity scores for ``model``.

    Args:
        model: Object exposing ``similarity(word1, word2)``, for example the
            ``.wv`` attribute of a trained model.
        model_name: Name shown in the heading line.
        width: Column width of the right-aligned score. It is a parameter
            rather than a module-level lookup because the name ``width`` is
            assigned by several cells in this notebook, which would otherwise
            make the alignment depend on cell execution order.
    """
    print(f"\n{model_name} Similarity Tests:")
    for first, second in (('king', 'queen'), ('man', 'woman')):
        label = f"{first} and {second} similarity:"
        print(f"{label:<30} {model.similarity(first, second):>{width}.4f}")

print_similarity_tests(word2vec_model.wv, "Word2Vec")
print_similarity_tests(fasttext_model.wv, "FastText")

In [None]:
def print_analogy_tests(model, model_name):
    """Print the top answer to four fixed analogies of the form "a is to b as c is to ?".

    ``model`` must expose ``most_similar(positive=..., negative=...)``. For
    each triple ``(a, b, c)`` the query is ``positive=[c, b], negative=[a]``
    and the word of the first returned entry is printed.
    """
    print(f"\n{model_name} Analogy Tests:")
    analogy_triples = (
        ("king", "man", "queen"),
        ("paris", "france", "berlin"),
        ("doctor", "hospital", "teacher"),
        ("nurse", "hospital", "student"),
    )
    for source, counterpart, query in analogy_triples:
        ranked = model.most_similar(positive=[query, counterpart], negative=[source])
        best_word = ranked[0][0]
        print(f"{source} is to {counterpart} as {query} is to {best_word}")

print_analogy_tests(word2vec_model.wv, "Word2Vec")
print_analogy_tests(fasttext_model.wv, "FastText")

In [None]:
# Clinical-domain word pairs whose similarity is reported.
clinical_examples = [
    ("nurse", "patient"),
    ("hospital", "clinic"),
    ("diagnosis", "treatment"),
    ("medication", "prescription"),
    ("doctor", "physician")
]
# Column widths for the score and for the label text.
width = 10
text_width = 60

def print_clinical_similarity_tests(model, model_name):
    """Print the similarity score of every pair in ``clinical_examples``.

    ``model`` must expose ``similarity(word1, word2)``. The label is
    left-aligned in ``text_width`` columns and the score is right-aligned in
    ``width`` columns; both are read from module level at call time.
    """
    print(f"\n{model_name} Clinical Domain Similarity Tests:")
    for first_term, second_term in clinical_examples:
        label = f"{first_term} and {second_term} similarity: "
        score = model.similarity(first_term, second_term)
        print(f"{label:<{text_width}} {score:>{width}.4f}")

print_clinical_similarity_tests(word2vec_model.wv, "Word2Vec")
print_clinical_similarity_tests(fasttext_model.wv, "FastText")