# N-Gram model for job description generation

In [1]:
import os
import sys
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import numpy as np


# Resolve the project root as three directory levels above the current working
# directory; change the number of os.pardir hops if the notebook lives deeper.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
# Make the project importable, without adding a duplicate entry on re-runs.
if project_root not in sys.path:
    sys.path.append(project_root)

### Download necessary resources for nltk

In [2]:
# Each entry: (resource path looked up by NLTK, package to download when the
# lookup fails, message printed just before downloading).
for resource_path, package_name, notice in (
    ('tokenizers/punkt', 'punkt', "Downloading NLTK 'punkt' tokenizer..."),
    ('corpora/stopwords', 'stopwords', "Downloading NLTK 'stopwords'..."),
    # For punkt_tab, check for the English directory specifically.
    ('tokenizers/punkt_tab/english/', 'punkt_tab', "Downloading NLTK 'punkt_tab'..."),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(notice)
        nltk.download(package_name)

print("NLTK resources checked/downloaded.")

Downloading NLTK 'punkt' tokenizer...


[nltk_data] Downloading package punkt to /Users/Gabriel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Gabriel/nltk_data...


Downloading NLTK 'punkt_tab'...
NLTK resources checked/downloaded.


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# Load the cleaned postings used for modeling from <project_root>/data/processed.
descriptions_df = pd.read_parquet(
    os.path.join(
        project_root,
        'data',
        'processed',
        'cleaned_postings_modeling.parquet',
    )
)
# Bare expression so the notebook renders the DataFrame as the cell output.
descriptions_df

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,job descriptiona leading real estate firm in n...,"Princeton, NJ"
1,The National Exemplar,Assitant Restaurant Manager,the national exemplar is accepting application...,"Cincinnati, OH"
2,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...,"New Hyde Park, NY"
3,Downtown Raleigh Alliance,Economic Development and Planning Intern,job summarythe economic development planning i...,"Raleigh, NC"
4,Raw Cereal,Producer,company descriptionraw cereal is a creative de...,United States
...,...,...,...,...
122119,Lozano Smith,Title IX/Investigations Attorney,our walnut creek office is currently seeking a...,"Walnut Creek, CA"
122120,Pinterest,"Staff Software Engineer, ML Serving Platform",about pinterest millions of people across the ...,United States
122121,EPS Learning,"Account Executive, Oregon/Washington",company overview eps learning is a leading k12...,"Spokane, WA"
122122,Trelleborg Applied Technologies,Business Development Manager,the business development manager is a hunter t...,"Texas, United States"


In [4]:
def normalize_text(text):
    """Split *text* into sentences and return each one as lower-cased word tokens.

    Non-string input (for example a missing value) yields an empty list.
    The result is a list of token lists, one per sentence.
    """
    if isinstance(text, str):
        # Sentences are detected on the original casing; each sentence is
        # lower-cased only right before word tokenization.
        return [word_tokenize(piece.lower()) for piece in sent_tokenize(text)]
    return []

In [5]:
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter

# Flatten all non-null descriptions into one list of tokenized sentences.
corpus = []
for posting_text in descriptions_df['description'].dropna():
    corpus.extend(normalize_text(posting_text))

# N-gram order (3 = trigram model).
n = 3

# Build the padded everygram training stream and the vocabulary stream.
train_data, padded_vocab = padded_everygram_pipeline(n, corpus)

# Fit a Maximum Likelihood Estimation model of that order.
model = MLE(n)
model.fit(train_data, padded_vocab)

### Text generation

In [7]:
def generate_job_description(model, num_words=100, text_seed=None):
    """Generate a single sentence-like snippet by sampling from an n-gram model.

    Args:
        model: Fitted language model exposing ``order`` and
            ``generate(num_words, text_seed)``.
        num_words: Maximum number of words generated after the seed.
        text_seed: Optional seed string; it is lower-cased and word-tokenized.
            When ``None``, the seed "we are looking for" is used.

    Returns:
        The seed tokens followed by the generated tokens, joined with spaces.
        Generation stops early after '.', '!' or '?', or as soon as the model
        emits the '</s>' end-of-sentence padding token (which is not included
        in the output).
    """
    if text_seed is None:
        text_seed = ['we', 'are', 'looking', 'for']
    else:
        text_seed = word_tokenize(text_seed.lower())

    output = list(text_seed)
    context = list(text_seed)
    # An order-n model conditions on the previous n-1 words.
    history = model.order - 1

    for _ in range(num_words):
        # Guard the unigram case: a plain ``context[-0:]`` slice would keep the
        # whole history instead of none of it.
        context = context[-history:] if history > 0 else []

        next_word = model.generate(1, context)

        # '</s>' is the padding token marking the end of a training sentence;
        # treat it as a stop signal rather than emitting it as text.
        if next_word == '</s>':
            break

        output.append(next_word)
        context.append(next_word)

        # Stop once sentence-final punctuation has been produced.
        if next_word in ('.', '!', '?'):
            break

    return ' '.join(output)

def _counts_with_backoff(model, context):
    """Return next-word counts for ``context``, dropping leading words until some exist."""
    context = tuple(context)
    dist = model.context_counts(context)
    while context and not dist:
        context = context[1:]
        dist = model.context_counts(context)
    return dist


def _choose_next_word(dist, method, temp, k, p):
    """Pick one word from a non-empty ``{word: count}`` mapping with the given strategy."""
    if method == 'greedy':
        return max(dist.items(), key=lambda item: item[1])[0]

    if method in ('random', 'temperature'):
        candidates = list(dist.items())
    else:
        ranked = sorted(dist.items(), key=lambda item: item[1], reverse=True)
        if method == 'topk':
            candidates = ranked[:k]
        else:
            # Nucleus: keep the most frequent words until they cover mass p.
            total = sum(count for _, count in ranked)
            cumulative = 0.0
            candidates = []
            for word, count in ranked:
                candidates.append((word, count))
                cumulative += count / total
                if cumulative >= p:
                    break

    words = [word for word, _ in candidates]
    weights = np.array([count for _, count in candidates], dtype=float)
    if method == 'temperature':
        # Scale in log space; subtracting the max keeps exp() from overflowing
        # when temp is small.
        logits = np.log(weights) / temp
        weights = np.exp(logits - logits.max())
    probs = weights / weights.sum()

    # Sample an index rather than the word itself so a plain ``str`` is returned.
    return words[np.random.choice(len(words), p=probs)]


def generate_with_sampling(model, num_words=100, text_seed=None,
                           method='greedy', temp=1.0, k=10, p=0.9):
    """
    Generate text using different sampling methods:
    - 'greedy': Always choose the most likely next word
    - 'random': Sample from the full probability distribution
    - 'topk': Sample from the k most likely words
    - 'nucleus': Sample from the top words that comprise p probability mass
    - 'temperature': Apply temperature to soften/sharpen the distribution

    Args:
        model: Fitted language model exposing ``order`` and a
            ``context_counts(context)`` method returning ``{word: count}``.
        num_words: Maximum number of words generated after the seed.
        text_seed: Optional seed string (lower-cased and word-tokenized);
            defaults to "we are looking for".
        method: One of the sampling methods listed above.
        temp: Temperature for 'temperature' sampling; must be > 0.
        k: Number of candidates kept by 'topk'; must be >= 1.
        p: Cumulative probability mass kept by 'nucleus'.

    Returns:
        The seed tokens followed by the generated tokens, joined with spaces.
        Generation stops after '.', '!' or '?', when the '</s>' padding token
        is selected (it is not emitted), or when no counts are available even
        for the empty context.

    Raises:
        ValueError: If ``method`` is unknown, or ``temp``/``k`` is out of range
            for the selected method.
    """
    known_methods = ('greedy', 'random', 'topk', 'nucleus', 'temperature')
    if method not in known_methods:
        raise ValueError(
            f"Unknown sampling method {method!r}; expected one of {known_methods}"
        )
    if method == 'temperature' and temp <= 0:
        raise ValueError("temp must be > 0 for temperature sampling")
    if method == 'topk' and k < 1:
        raise ValueError("k must be >= 1 for top-k sampling")

    if text_seed is None:
        text_seed = ['we', 'are', 'looking', 'for']
    else:
        text_seed = word_tokenize(text_seed.lower())

    output = list(text_seed)
    context = list(text_seed)
    # An order-n model conditions on the previous n-1 words.
    history = model.order - 1

    for _ in range(num_words):
        # Guard the unigram case: ``context[-0:]`` would keep the whole history.
        context = context[-history:] if history > 0 else []

        # ``context_counts`` is a method, so it has to be called. Contexts the
        # model never saw are backed off to shorter ones.
        dist = _counts_with_backoff(model, context)
        if not dist:
            # Nothing to sample from, even for the empty context.
            break

        next_word = _choose_next_word(dist, method, temp, k, p)

        # '</s>' marks the end of a training sentence; stop instead of emitting it.
        if next_word == '</s>':
            break

        output.append(next_word)
        context.append(next_word)

        if next_word in ('.', '!', '?'):
            break

    return ' '.join(output)

### Evaluation metrics

In [8]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.lm.vocabulary import Vocabulary

def calculate_perplexity(model, test_sentences):
    """Return the mean per-sentence perplexity of ``model`` on ``test_sentences``.

    Args:
        model: Fitted language model exposing ``order`` and
            ``perplexity(ngrams)``.
        test_sentences: Iterable of tokenized sentences (sequences of tokens).

    Returns:
        The mean of the per-sentence perplexities, or ``float('inf')`` when no
        sentence could be scored. Sentences with fewer than ``model.order``
        tokens produce no n-grams and are ignored.

    Notes:
        Scoring stays best-effort: a sentence whose scoring raises is skipped,
        but a ``RuntimeWarning`` now reports how many were skipped and the last
        error, instead of hiding failures completely.
        NOTE(review): the n-grams here are built without sentence padding; if
        the model was trained on padded sentences, sentence-start/end
        predictions are not part of this score - confirm this is intended.
    """
    import warnings

    perplexities = []
    skipped = 0
    last_error = None

    for sentence in test_sentences:
        try:
            # Only full-order n-grams from the test sentence are scored.
            test_ngrams = list(ngrams(sentence, model.order))
            if test_ngrams:
                perplexities.append(model.perplexity(test_ngrams))
        except Exception as exc:  # deliberate best-effort; reported below
            skipped += 1
            last_error = exc

    if skipped:
        warnings.warn(
            f"calculate_perplexity skipped {skipped} sentence(s); "
            f"last error: {last_error!r}",
            RuntimeWarning,
            stacklevel=2,
        )

    return np.mean(perplexities) if perplexities else float('inf')

def evaluate_model(model, test_data, num_samples=5):
    """Evaluate the model using perplexity and BLEU score.

    Args:
        model: Fitted language model, passed to ``calculate_perplexity`` and
            ``generate_job_description``.
        test_data: List of tokenized sentences (each a list of tokens), used
            both for perplexity and as the BLEU reference set.
        num_samples: Number of texts to generate and score with BLEU.

    Returns:
        A dict with keys ``"perplexity"``, ``"avg_bleu"`` (mean BLEU over the
        generated samples) and ``"samples"`` (the generated strings).
    """
    perplexity = calculate_perplexity(model, test_data)

    # sentence_bleu expects the references as a flat list of token lists.
    # Wrapping every sentence in its own list (the corpus_bleu layout) would
    # turn each "token" into an unhashable list and break n-gram counting.
    references = list(test_data)
    # Neither the references nor the smoothing function change per sample,
    # so build them once.
    smoothing = SmoothingFunction().method1

    bleu_scores = []
    samples = []

    for _ in range(num_samples):
        generated = generate_job_description(model, num_words=50)
        samples.append(generated)

        hypothesis = word_tokenize(generated)
        bleu = sentence_bleu(references, hypothesis, smoothing_function=smoothing)
        bleu_scores.append(bleu)

    return {
        "perplexity": perplexity,
        "avg_bleu": np.mean(bleu_scores),
        "samples": samples
    }

### Train and evaluate

In [None]:
# Split data into train and test
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore the reported metrics, is reproducible.
train_df, test_df = train_test_split(descriptions_df, test_size=0.2, random_state=42)

# Tokenize the training descriptions into one flat list of sentences
train_corpus = []
for desc in train_df['description'].dropna():
    train_corpus.extend(normalize_text(desc))

train_data, padded_vocab = padded_everygram_pipeline(n, train_corpus)

# Start from a fresh model: `model` was already fitted on the full corpus, and
# fit() adds to the existing counts, so re-fitting it would leave the held-out
# descriptions inside the model being evaluated.
model = MLE(n)
model.fit(train_data, padded_vocab)

# Tokenize the held-out descriptions the same way
test_corpus = []
for desc in test_df['description'].dropna():
    test_corpus.extend(normalize_text(desc))

eval_results = evaluate_model(model, test_corpus)
print(f"Perplexity: {eval_results['perplexity']:.2f}")
print(f"Average BLEU score: {eval_results['avg_bleu']:.4f}")
print("\nSample generations:")
for i, sample in enumerate(eval_results['samples']):
    print(f"\n[{i+1}] {sample}")

In [None]:
from nltk.lm import Laplace, KneserNeyInterpolated

# Try different smoothing methods.
# padded_everygram_pipeline returns one-shot lazy iterators. The pair stored in
# train_data / padded_vocab was already consumed by the earlier fit(), so
# reusing it would train these models on nothing. Build a fresh pair from
# train_corpus for each model instead.
laplace_train, laplace_vocab = padded_everygram_pipeline(n, train_corpus)
laplace_model = Laplace(n)
laplace_model.fit(laplace_train, laplace_vocab)

kn_train, kn_vocab = padded_everygram_pipeline(n, train_corpus)
kn_model = KneserNeyInterpolated(n)
kn_model.fit(kn_train, kn_vocab)

### Compare sampling methods

In [None]:
def compare_sampling_methods(model, test_data):
    """Generate three 50-word samples per sampling strategy.

    For 'temperature' the three samples use temperatures 0.5, 1.0 and 2.0 and
    each is prefixed with the temperature used; every other strategy is simply
    run three times with its defaults. ``test_data`` is accepted but not used.

    Returns a dict mapping strategy name to its list of three sample strings.
    """
    temperature_grid = (0.5, 1.0, 2.0)
    comparison = {}

    for strategy in ('greedy', 'random', 'topk', 'nucleus', 'temperature'):
        print(f"\nEvaluating {strategy} sampling...")

        if strategy == 'temperature':
            # One sample per temperature, labelled with the value used.
            generated = []
            for temperature in temperature_grid:
                text = generate_with_sampling(model, num_words=50,
                                              method=strategy, temp=temperature)
                generated.append(f"Temperature {temperature}: {text}")
        else:
            generated = [
                generate_with_sampling(model, num_words=50, method=strategy)
                for _ in range(len(temperature_grid))
            ]

        comparison[strategy] = generated

    return comparison

# Generate samples with every sampling strategy
sampling_results = compare_sampling_methods(model, test_corpus)

# Show each strategy's samples, numbered from 1, with a divider after each one
for strategy_name, texts in sampling_results.items():
    print(f"\n== {strategy_name.upper()} SAMPLING ==")
    for number, text in enumerate(texts, start=1):
        print(f"[{number}] {text}")
        print("-" * 50)