# Develop trigram model

## Import libraries

In [46]:
import json
from nltk.tokenize import sent_tokenize

import glob
import os
import re
import argparse
import random
from collections import defaultdict, Counter
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

# Set random seed for reproducibility
random.seed(42)

## Load data

In [47]:
# Set the path to your dataset directory
path = os.path.expanduser("/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/*.json")

# Load training file names
files = glob.glob(path)

# Randomly select 20 files
sample_files = random.sample(files, min(20, len(files)))

# Read and store randomly sampled documents
sample_data = []
for path in sample_files:
    with open(path) as f:
        data = json.load(f)
        for entry in data:
            sample_data.append(entry.get("text", ""))

print("Number of sampled JSON files:", len(sample_files))
print("Snippet of first doc:\n", sample_data[0][:300], "...")

Number of sampled JSON files: 20
Snippet of first doc:
 alzheimer's disease and other types of dementia are the top cause for disabilities in later life and various types of experiments have been performed to understand the underlying mechanisms of the disease with the aim of coming up with potential drug targets. these experiments have been carried out  ...


### Set up training and heldout data

In [48]:
# Split data to train and heldout sets
def train_heldout_split(corpus, heldout_ratio=0.1, seed=42):
    random.seed(seed)
    random.shuffle(corpus)
    split_idx = int(len(corpus) * (1 - heldout_ratio))
    train = corpus[:split_idx]
    heldout = corpus[split_idx:]
    return train, heldout

# Implement splitting function
sample_train, sample_heldout = train_heldout_split(sample_data, heldout_ratio=0.1)

# Aggregate training and heldout corpora into single strings
train_corpus = " ".join(sample_train)
heldout_corpus = " ".join(sample_heldout)

# Check text content
print("Training corpus snippet:\n", train_corpus[:300], "...")
print("Heldout corpus snippet:\n", heldout_corpus[:300], "...")

Training corpus snippet:
 Yu et al. investigated metabolic changes in the urine of APP/ PS1 transgenic mice prior to cognitive impairment (132) . At 2 months of age, the spatial working memory of APP/PS1 mice showed no significant differences when compared to NTG controls (132) . However, metabolomics analysis of urine from  ...
Heldout corpus snippet:
 Approximately 41 percent of financially independent undergraduates received financial aid, and the average amount they received was about $3,500. More than one-third of independent students (36 percent) received grants (averaging about $2,000), and 18 percent received loans (averaging $3,500) (table ...


### Set up test data

In [49]:
# Generate test data outside initial sample data
test_path = os.path.expanduser("/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/*.json")

# Remove files already in sample_files
sample_file_set = set(sample_files)
all_test_files = glob.glob(test_path)
test_files = [f for f in all_test_files if f not in sample_file_set]

# Randomly select 10 test files
test_sample_files = random.sample(test_files, min(10, len(test_files)))
test_data = []
for path in test_sample_files:
    with open(path) as f:
        data = json.load(f)
        for entry in data:
            test_data.append(entry.get("text", ""))

# Aggregate test corpus into a single string
test_corpus = " ".join(test_data)

print("Number of sampled test JSON files:", len(test_sample_files))
print("Snippet of first test doc:\n", test_corpus[:300], "...")

Number of sampled test JSON files: 10
Snippet of first test doc:
 Scales with varying degrees of measurement reliability are often used in the context of multistage sampling, where variance exists at multiple levels of analysis (e.g., individual and group). Because methodological guidance on assessing and reporting reliability at multiple levels of analysis is cur ...


## Develop algorithm

### Set up sentence segmenter

In [50]:
# Sentence splitting using NLTK with markers for sentence boundaries
def sentence_split(corpus,n=3):
    sent = sent_tokenize(corpus)
    start = "<s> " * (n-1)
    end = " </s>"
    sentences = [start + " " + s + " " + end for s in sent]
    sentences_clean = [re.sub(r'\s+', ' ', s).strip() for s in sentences]
    return sentences_clean

# Test
print(sentence_split(train_corpus)[:3])

['<s> <s> Yu et al. </s>', '<s> <s> investigated metabolic changes in the urine of APP/ PS1 transgenic mice prior to cognitive impairment (132) . </s>', '<s> <s> At 2 months of age, the spatial working memory of APP/PS1 mice showed no significant differences when compared to NTG controls (132) . </s>']


### Set up N-gram counters

In [51]:
# Set up N-gram counters
def n_gram_counter(corpus, n=3):

    unigram_counts = Counter()
    bigram_counts = Counter()
    trigram_counts = Counter()
    sentences = sentence_split(corpus,n=n)

    for sent in sentences:
        tokens = sent.split()

        for i, tok in enumerate(tokens):
            # Unigrams
            if tok not in {"<s>", "</s>"}: # skip <s> and </s>
                unigram_counts[tok] += 1

            # Bigrams
            if i > 0:
                w1, w2 = tokens[i-1], tok
                if not (w1 == "<s>" and w2 == "<s>"): # skip <s>, <s>
                    bigram_counts[tokens[i-1], tok] += 1

            # Trigrams
            if i > 1:
                trigram_counts[tokens[i-2], tokens[i-1], tok] += 1

    return unigram_counts, bigram_counts, trigram_counts

# Test
unigram_counts, bigram_counts, trigram_counts = n_gram_counter(train_corpus)
print("Sample unigram counts:", list(unigram_counts.items())[:5])
print("Sample bigram counts:", list(bigram_counts.items())[:5])
print("Sample trigram counts:", list(trigram_counts.items())[:5])

Sample unigram counts: [('Yu', 2), ('et', 352), ('al.', 142), ('investigated', 11), ('metabolic', 89)]
Sample bigram counts: [(('<s>', 'Yu'), 1), (('Yu', 'et'), 2), (('et', 'al.'), 141), (('al.', '</s>'), 142), (('<s>', 'investigated'), 3)]
Sample trigram counts: [(('<s>', '<s>', 'Yu'), 1), (('<s>', 'Yu', 'et'), 1), (('Yu', 'et', 'al.'), 2), (('et', 'al.', '</s>'), 141), (('<s>', '<s>', 'investigated'), 3)]


### Interpolate probability

In [52]:
# Implement add-k smoothing for interpolated trigram model
def interpolated_prob(w, h1, h2, unigram, bigram, trigram, vocab_size, lambdas, add_k=1e-5):

    lambda_tri, lambda_bi, lambda_uni = lambdas
    
    # unigram MLE with add-k
    uni_count = unigram.get(w, 0) + add_k
    uni_denom = sum(unigram.values()) + add_k * vocab_size
    p_uni = uni_count / uni_denom

    # bigram P(w|h2) = count(h2,w)/count(h2) smoothed with add_k
    bi_num = bigram.get((h2, w), 0) + add_k
    bi_den = unigram.get(h2, 0) + add_k * vocab_size
    p_bi = bi_num / bi_den

    # trigram P(w|h1,h2) = count(h1,h2,w)/count(h1,h2)
    tri_num = trigram.get((h1, h2, w), 0) + add_k
    tri_den = bigram.get((h1, h2), 0) + add_k * vocab_size
    p_tri = tri_num / tri_den

    return lambda_tri * p_tri + lambda_bi * p_bi + lambda_uni * p_uni

# Test
vocab = set(unigram_counts.keys())
vocab_size = len(vocab)
lambdas = (0.7, 0.2, 0.1)
test_word = "the"
test_h1 = "of"
test_h2 = "in"
prob = interpolated_prob(test_word, test_h1, test_h2, unigram_counts, bigram_counts, trigram_counts, vocab_size, lambdas)
print(f"P({test_word}|{test_h1},{test_h2}) =", prob)

P(the|of,in) = 0.06311815745025817


### Tune lambda

In [53]:
# Grid search to tune lambdas
def tune_lambdas(heldout_corpus, unigram, bigram, trigram, vocab, grid_step=0.05):
        
    sentences = sentence_split(heldout_corpus)
    vocab_size = len(vocab)
    best = None
    best_ll = -float("inf")

    # Generate lambda triplets
    lambdas_list = []
    steps = np.arange(0, 1 + 1e-9, grid_step)
    for l_tri in steps:
        for l_bi in steps:
            if l_tri + l_bi <= 1.0 + 1e-9:
                l_uni = 1.0 - l_tri - l_bi
                lambdas_list.append((l_tri, l_bi, l_uni))

    # Evaluate each lambda set
    for lambdas in tqdm(lambdas_list, desc="grid lambdas"):
        ll = 0.0
        N = 0
        for sent in sentences:
            tokens = sent.split()
            for i in range(2, len(tokens)):
                w = tokens[i]
                h1 = tokens[i - 2]
                h2 = tokens[i - 1]
                p = interpolated_prob(w, h1, h2, unigram, bigram, trigram, vocab_size, lambdas)
                if p > 0:
                    ll += math.log(p)
                    N += 1
        if N > 0:
            ll_per_word = ll / N
            if ll_per_word > best_ll:
                best_ll = ll_per_word
                best = lambdas

    return best if best else (0.33, 0.33, 0.34)


# Test
vocab = set(unigram_counts.keys())
best_lambdas = tune_lambdas(heldout_corpus, unigram_counts, bigram_counts, trigram_counts, vocab)
print("Best lambdas:", best_lambdas)

grid lambdas: 100%|██████████| 231/231 [10:33<00:00,  2.74s/it]

Best lambdas: (0.25, 0.4, 0.35)





## Generate sentences

In [54]:
def generate_sentence(unigram, bigram, trigram, vocab, lambdas, max_len=30):
    vocab_list = sorted(vocab)
    vocab_size = len(vocab_list)
    prob_cache = {}

    sent = ['<s>', '<s>']
    for _ in range(max_len):
        h1, h2 = sent[-2], sent[-1]
        key = (h1, h2)

        if key not in prob_cache:
            probs = np.array([
                interpolated_prob(w, h1, h2, unigram, bigram, trigram, vocab_size, lambdas)
                for w in vocab_list
            ])
            probs = probs / probs.sum()
            prob_cache[key] = probs

        probs = prob_cache[key]
        w = np.random.choice(vocab_list, p=probs)

        if w == '</s>':
            break
        sent.append(w)

    output = [t for t in sent if t not in ('<s>', '</s>')]
    return ' '.join(output)

# Generate sample sentences
for _ in range(2):
    sentence = generate_sentence(unigram_counts, bigram_counts, trigram_counts, vocab, best_lambdas)
    print("Generated sentence:", sentence)

Generated sentence: However, an cortical thickness is a testament to Years diagnosis, which these risks and limitations of out-of-district age reflect differing generations of AD. patients. all (0.07) circles that methyl path
Generated sentence: It has provided written informed consent. not 61.5% . of the Gulf of blurring stream-wise dynamics 56 subjects as an 3 brain atlas. (20.0%). found to 5 with in 1956).


## Calculate perplexity

In [55]:
def perplexity(corpus, unigram, bigram, trigram, vocab, lambdas):
    sentences = sentence_split(corpus)
    docs_tokenized = [s.split() for s in sentences]
    vocab_size = len(vocab)
    
    log_prob = 0.0
    N = 0
    
    for doc in docs_tokenized:
        sent = []
        for tok in doc:
            sent.append(tok)
            if tok in ('.', '!', '?'):
                tokens = ['<s>', '<s>'] + sent + ['</s>']
                for i in range(2, len(tokens)):
                    w = tokens[i]
                    h1, h2 = tokens[i-2], tokens[i-1]
                    p = max(interpolated_prob(w, h1, h2, unigram, bigram, trigram, vocab_size, lambdas), 1e-12)
                    log_prob += math.log(p)
                    N += 1
                sent = []
        
        # Handle leftover (no punctuation)
        if sent:
            tokens = ['<s>', '<s>'] + sent + ['</s>']
            for i in range(2, len(tokens)):
                w = tokens[i]
                h1, h2 = tokens[i-2], tokens[i-1]
                p = max(interpolated_prob(w, h1, h2, unigram, bigram, trigram, vocab_size, lambdas), 1e-12)
                log_prob += math.log(p)
                N += 1

    return math.exp(-log_prob / N) if N > 0 else float('inf')

# Compute perplexity score on test set
print("List of sampled test files:")
display(test_sample_files)
vocab = set(unigram_counts.keys())
pp = perplexity(test_corpus, unigram_counts, bigram_counts, trigram_counts, vocab, best_lambdas)
print("Perplexity on test set:", pp)

List of sampled test files:


['/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/e72aa200-5e2b-4775-b155-d983cdf46f46.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/cf0223e8-f25e-4ea4-b63e-e971d72a9caa.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/a1043cef-47a8-4865-97dd-1e284c3ef160.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/68878dd8-162e-4dbd-a595-ca959dabe09b.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/fbcafd40-a96c-4f2e-b9ad-94937ba02cb3.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/796f35c1-ba6b-4552-8a7f-5d8b61164fb0.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/3570db8e-7968-4dc6-a080-c2a62a629e5d.json',
 '/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/c5d0d6b0-692c-495d-9382-31fbaba4ff22.json',


Perplexity on test set: 2021.0180522874603


## Conclusion

Due to machine limitation, only 20 samples are selected for training and heldout. This exercise highlights the limitation of n-gram language models with limited number of training datasets. This is shown by the high perplexity value.