# N-Gram model for job description generation

In [45]:
import os
import sys
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import numpy as np


# Resolve the directory three levels above the current working directory and
# make it importable (add/remove os.pardir entries if the notebook depth changes).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
if project_root not in sys.path:
    sys.path.append(project_root)

### Download the required NLTK resources

In [46]:
# Each entry: (path probed with nltk.data.find, package passed to nltk.download,
# message printed right before downloading).
required_resources = [
    ('tokenizers/punkt', 'punkt', "Downloading NLTK 'punkt' tokenizer..."),
    ('corpora/stopwords', 'stopwords', "Downloading NLTK 'stopwords'..."),
    # For punkt_tab, probe the English directory specifically.
    ('tokenizers/punkt_tab/english/', 'punkt_tab', "Downloading NLTK 'punkt_tab'..."),
]

for resource_path, package_name, notice in required_resources:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource is not installed locally yet: fetch it.
        print(notice)
        nltk.download(package_name)

print("NLTK resources checked/downloaded.")

NLTK resources checked/downloaded.


In [47]:
# Load the cleaned job postings used for modeling and display the frame.
postings_path = os.path.join(
    project_root, 'data', 'processed', 'cleaned_postings_modeling.parquet'
)
descriptions_df = pd.read_parquet(postings_path)
descriptions_df

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,job description a leading real estate firm in ...,"Princeton, NJ"
1,The National Exemplar,Assitant Restaurant Manager,the national exemplar is accepting application...,"Cincinnati, OH"
2,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...,"New Hyde Park, NY"
3,Downtown Raleigh Alliance,Economic Development and Planning Intern,job summary the economic development planning ...,"Raleigh, NC"
4,Raw Cereal,Producer,company description raw cereal is a creative d...,United States
...,...,...,...,...
122119,Lozano Smith,Title IX/Investigations Attorney,our walnut creek office is currently seeking a...,"Walnut Creek, CA"
122120,Pinterest,"Staff Software Engineer, ML Serving Platform",about pinterest millions of people across the ...,United States
122121,EPS Learning,"Account Executive, Oregon/Washington",company overview eps learning is a leading k 1...,"Spokane, WA"
122122,Trelleborg Applied Technologies,Business Development Manager,the business development manager is a hunter t...,"Texas, United States"


In [48]:
# Text normalization function
def normalize_text(text):
    """Split text into sentences and tokenize each lowercased sentence.

    Returns a list with one token list per sentence. Any non-string input
    yields an empty list.
    """
    if isinstance(text, str):
        # Sentence-split first, then lowercase and word-tokenize each sentence.
        return [word_tokenize(chunk.lower()) for chunk in sent_tokenize(text)]
    return []

In [None]:
from nltk.lm import MLE, Laplace, KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter

# Instantiate the two smoothed n-gram language models. They are only created
# here; fitting happens later, in the "Train and evaluate" cell.
# NOTE(review): `n` (the n-gram order) is not assigned in any cell visible in
# this notebook — confirm where it is defined so Restart & Run All works.
laplace_model = Laplace(n)  # Laplace smoothing
kn_model = KneserNeyInterpolated(n)  # Kneser-Ney smoothing

### Text generation

In [50]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

def _sample_word(candidates):
    """Draw one word from (word, weight) pairs with probability proportional to weight."""
    words, weights = zip(*candidates)
    probs = np.asarray(weights, dtype=float)
    probs = probs / probs.sum()  # Normalize to sum to 1
    # Sample an index rather than the word itself so a plain Python str is returned.
    return words[np.random.choice(len(words), p=probs)]


def generate_with_sampling(model, num_words=100, text_seed=None, 
                          method='greedy', temp=1.0, k=10, p=0.9):
    """
    Generate text from an n-gram language model, one token at a time.

    Sampling methods:
    - 'greedy': Always choose the most likely next word
    - 'random': Sample from the full probability distribution
    - 'topk': Sample from the k most likely words
    - 'nucleus': Sample from the top words that comprise p probability mass
    - 'temperature': Apply temperature to soften/sharpen the distribution

    Args:
        model: Object exposing `order`, an iterable `vocab` and
            `score(word, context)`.
        num_words: Maximum number of tokens appended after the seed.
        text_seed: Seed string (lowercased and word-tokenized). When None,
            the tokens "we are looking for" are used.
        method: One of the sampling methods listed above.
        temp: Temperature for method='temperature'; must be > 0.
        k: Number of candidates kept for method='topk'; must be >= 1.
        p: Cumulative probability mass kept for method='nucleus'.

    Returns:
        The seed followed by the generated tokens, detokenized into a string.
        Generation stops early after a '.', '!' or '?' token.

    Raises:
        ValueError: If `method` is unknown, or `temp`/`k` is out of range for
            the selected method.
    """
    valid_methods = ('greedy', 'random', 'topk', 'nucleus', 'temperature')
    if method not in valid_methods:
        raise ValueError(f"Unknown sampling method {method!r}; expected one of {valid_methods}")
    if method == 'temperature' and temp <= 0:
        raise ValueError("temp must be > 0 for temperature sampling")
    if method == 'topk' and k < 1:
        raise ValueError("k must be >= 1 for top-k sampling")

    if text_seed is None:
        output = ['we', 'are', 'looking', 'for']
    else:
        output = word_tokenize(text_seed.lower())

    # Number of preceding tokens the model conditions on (0 for a unigram model).
    history_len = model.order - 1
    # The vocabulary does not change during generation, so list it once.
    vocab = list(model.vocab)

    for _ in range(num_words):
        # Explicit guard: output[-0:] would be the whole history, not an empty context.
        context = output[-history_len:] if history_len > 0 else []

        # Score every vocabulary word given the context; keep non-zero entries.
        dist = {}
        for word in vocab:
            try:
                score = model.score(word, context)
            except Exception:
                # Best effort: skip words the model cannot score, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except would.
                continue
            if score > 0:
                dist[word] = score

        if not dist:
            # Nothing has probability mass: fall back to a uniformly random word.
            next_word = vocab[np.random.randint(len(vocab))]

        elif method == 'greedy':
            next_word = max(dist.items(), key=lambda item: item[1])[0]

        elif method == 'random':
            next_word = _sample_word(dist.items())

        elif method == 'topk':
            ranked = sorted(dist.items(), key=lambda item: item[1], reverse=True)
            next_word = _sample_word(ranked[:k])

        elif method == 'nucleus':
            # Keep the smallest prefix of the ranked words whose normalized
            # cumulative mass reaches p, then renormalize within that prefix.
            ranked = sorted(dist.items(), key=lambda item: item[1], reverse=True)
            total = sum(item[1] for item in ranked)
            cumulative = 0.0
            nucleus = []
            for word, score in ranked:
                nucleus.append((word, score))
                cumulative += score / total
                if cumulative >= p:
                    break
            next_word = _sample_word(nucleus)

        else:  # method == 'temperature'
            words = list(dist)
            logits = np.log(np.array([dist[w] for w in words], dtype=float)) / temp
            # Subtract the max before exponentiating so low temperatures do not
            # underflow every weight to zero.
            weights = np.exp(logits - logits.max())
            next_word = _sample_word(zip(words, weights))

        output.append(next_word)

        if next_word in ('.', '!', '?'):
            break

    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(output)

### Evaluation metrics

In [51]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.lm.vocabulary import Vocabulary


def calculate_perplexity(model, test_sentences):
    """Return the mean per-sentence perplexity of `model` over `test_sentences`.

    Each sentence is cut into n-grams of size `model.order`. Sentences that
    yield no n-grams, or that raise while being scored, are skipped. When no
    sentence could be scored the result is infinity.
    """
    scores = []
    for tokens in test_sentences:
        try:
            grams = list(ngrams(tokens, model.order))
            if not grams:
                continue
            scores.append(model.perplexity(grams))
        except Exception:
            # Best effort: ignore sentences that cannot be scored.
            continue

    if not scores:
        return float('inf')
    return np.mean(scores)

def evaluate_model(model, test_data, num_samples=20, seed_length=5, method='greedy'):
    """
    Evaluate the model using perplexity and BLEU score

    Uses the first seed_length words of reference sentences to 
    generate samples, then compares against the original reference.

    Args:
        model: Language model passed to calculate_perplexity and
            generate_with_sampling.
        test_data: List of tokenized sentences (lists of tokens).
        num_samples: Number of reference sentences to draw for generation.
        seed_length: Number of leading tokens used as the generation seed.
        method: Sampling method forwarded to generate_with_sampling.

    Returns:
        Dict with "perplexity", "avg_bleu", and the index-aligned lists
        "samples", "references" and "bleu_scores".
    """
    # Calculate perplexity
    perplexity = calculate_perplexity(model, test_data)

    bleu_scores = []
    samples = []
    references = []

    # Only sentences with more than seed_length tokens leave something to predict
    valid_sentences = [sent for sent in test_data if len(sent) > seed_length]

    if len(valid_sentences) < num_samples:
        print(f"Warning: Only {len(valid_sentences)} valid sentences found. Using all of them.")
        sample_sentences = valid_sentences
    else:
        # Sample indices: np.random.choice on the ragged token lists themselves is unreliable
        indices = np.random.choice(len(valid_sentences), num_samples, replace=False)
        sample_sentences = [valid_sentences[i] for i in indices]

    # The smoothing function does not depend on the sentence; build it once
    smoothing = SmoothingFunction().method1

    for reference in sample_sentences:
        # Get the seed (first few words) as a string
        seed = ' '.join(reference[:seed_length])

        # Generate text using this seed
        generated = generate_with_sampling(model, num_words=50, text_seed=seed, method=method)
        hypothesis = word_tokenize(generated)

        try:
            # Reference needs to be in a list
            bleu = sentence_bleu([reference], hypothesis, smoothing_function=smoothing)
        except Exception as e:
            print(f"Error calculating BLEU: {e}")
            continue

        # Record all three together so the lists stay index-aligned even when
        # BLEU fails for a sentence (callers zip them).
        bleu_scores.append(bleu)
        samples.append(generated)
        references.append(' '.join(reference))

    return {
        "perplexity": perplexity,
        "avg_bleu": np.mean(bleu_scores) if bleu_scores else 0.0,
        "samples": samples,
        "references": references,
        "bleu_scores": bleu_scores
    }

### Train and evaluate

In [52]:
# Split data into train and test
from sklearn.model_selection import train_test_split

# Take a random subset of the data (adjust size as needed).
# Seeded so the subset, and therefore the train/test split, is reproducible.
subset_size = 20000  # Try 5k-20k descriptions for a good balance
descriptions_subset = descriptions_df.sample(
    n=min(subset_size, len(descriptions_df)), random_state=42
)

train_df, test_df = train_test_split(descriptions_subset, test_size=0.2, random_state=42)

# Build the training corpus: one token list per sentence
train_corpus = []
for desc in train_df['description'].dropna():
    train_corpus.extend(normalize_text(desc))

# padded_everygram_pipeline returns single-use lazy iterators. Build a fresh
# pair for each model; reusing one pair would leave the second model fitted
# on already-exhausted (empty) data.
train_data, padded_vocab = padded_everygram_pipeline(n, train_corpus)
laplace_model.fit(train_data, padded_vocab)

train_data, padded_vocab = padded_everygram_pipeline(n, train_corpus)
kn_model.fit(train_data, padded_vocab)

### Evaluate Laplace Model

In [53]:
# Build the held-out corpus: one token list per sentence
test_corpus = []
for desc in test_df['description'].dropna():
    test_corpus.extend(normalize_text(desc))

# Score the Laplace model with top-k sampling on five seeds
eval_results = evaluate_model(laplace_model, test_corpus, num_samples=5, method='topk')
print(f"Perplexity: {eval_results['perplexity']:.2f}")
print(f"Average BLEU score: {eval_results['avg_bleu']:.4f}")
print("\nSample generations:")

# Walk the generations alongside their references and BLEU scores
scored_examples = zip(
    eval_results['samples'],
    eval_results['references'],
    eval_results['bleu_scores'],
)
for i, (sample, reference, bleu) in enumerate(scored_examples):
    print(f"\n[{i+1}] BLEU: {bleu:.4f}")
    print(f"Reference: {reference[:100]}...")
    print(f"Generated: {sample}")
    print("-" * 50)

Perplexity: 18595.71
Average BLEU score: 0.1325

Sample generations:

[1] BLEU: 0.1747
Reference: we do not discriminate or allow discrimination on the basis of race , color , religion , creed , sex...
Generated: we do not discriminate or allow discrimination on position level position with a focus on our careers page on instagram, linked in top companies list and 1 among financial services company that serves and is publicly traded on the basis of race, color, religion, or a related industry preferred, but
--------------------------------------------------

[2] BLEU: 0.2601
Reference: vertiv will only employ those who are legally authorized to work in the united states ....
Generated: vertiv will only employ those who did.
--------------------------------------------------

[3] BLEU: 0.0533
Reference: ready mix driver cdl required front discharge location 4101 spring run pkwy , eagle mountain , ut 84...
Generated: ready mix driver cdl required within 6 months of employment, including 

### Compare sampling methods

In [55]:
def _score_generations(model, sample_sentences, seed_length, method, **generation_kwargs):
    """Generate one continuation per reference sentence and score each with BLEU.

    Returns a dict with "avg_bleu" and the index-aligned lists "samples",
    "references" and "bleu_scores". Sentences whose BLEU computation raises
    are reported and left out of all three lists.
    """
    smoothing = SmoothingFunction().method1
    bleu_scores = []
    samples = []
    references = []

    for reference in sample_sentences:
        # Seed generation with the first seed_length tokens of the reference
        seed = ' '.join(reference[:seed_length])
        generated = generate_with_sampling(model, num_words=50,
                                           text_seed=seed,
                                           method=method,
                                           **generation_kwargs)
        hypothesis = word_tokenize(generated)

        try:
            bleu = sentence_bleu([reference], hypothesis, smoothing_function=smoothing)
        except Exception as e:
            print(f"Error calculating BLEU: {e}")
            continue

        bleu_scores.append(bleu)
        samples.append(generated)
        references.append(' '.join(reference))

    return {
        "avg_bleu": np.mean(bleu_scores) if bleu_scores else 0.0,
        "samples": samples,
        "references": references,
        "bleu_scores": bleu_scores
    }


def evaluate_all_sampling_methods(model, test_data, num_samples=5, seed_length=5,
                                  temperatures=(0.5, 1.0, 2.0)):
    """
    Evaluate all sampling methods and compare their performance using perplexity and BLEU scores

    Args:
        model: Language model passed to calculate_perplexity and
            generate_with_sampling.
        test_data: List of tokenized sentences (lists of tokens).
        num_samples: Number of reference sentences shared by every method.
        seed_length: Number of leading tokens used as the generation seed.
        temperatures: Temperatures tried for the 'temperature' method.

    Returns:
        {"perplexity": float, "methods": {...}}. Each method maps to a result
        dict ("avg_bleu", "samples", "references", "bleu_scores"), except
        'temperature', which maps to a list of such dicts, one per
        temperature, each with an extra "temperature" key.
    """
    # Calculate model perplexity (this is independent of sampling method)
    perplexity = calculate_perplexity(model, test_data)
    print(f"Overall model perplexity: {perplexity:.2f}")

    methods = ['greedy', 'random', 'topk', 'nucleus', 'temperature']
    results = {}

    # Only sentences with more than seed_length tokens leave something to predict
    valid_sentences = [sent for sent in test_data if len(sent) > seed_length]

    if len(valid_sentences) < num_samples:
        print(f"Warning: Only {len(valid_sentences)} valid sentences found. Using all of them.")
        sample_sentences = valid_sentences
    else:
        # Use indices to select random sentences
        indices = np.random.choice(len(valid_sentences), num_samples, replace=False)
        sample_sentences = [valid_sentences[i] for i in indices]

    # Evaluate each method on the same sampled sentences
    for method in methods:
        print(f"\nEvaluating {method} sampling...")

        if method == 'temperature':
            # Temperature is swept over several values; keep one result per value
            method_results = []
            for temp in temperatures:
                scored = _score_generations(model, sample_sentences, seed_length,
                                            method, temp=temp)
                print(f"Temperature {temp} - Average BLEU: {scored['avg_bleu']:.4f}")
                method_results.append({"temperature": temp, **scored})
            results[method] = method_results
        else:
            scored = _score_generations(model, sample_sentences, seed_length, method)
            print(f"Average BLEU: {scored['avg_bleu']:.4f}")
            results[method] = scored

    # Print summary comparison
    print("\n==== SAMPLING METHODS COMPARISON ====")
    print(f"Model perplexity: {perplexity:.2f}")
    for method in methods:
        if method == 'temperature':
            for temp_result in results[method]:
                print(f"Temperature {temp_result['temperature']} - Average BLEU: {temp_result['avg_bleu']:.4f}")
        else:
            print(f"{method.capitalize()} - Average BLEU: {results[method]['avg_bleu']:.4f}")

    return {"perplexity": perplexity, "methods": results}

In [56]:
# Compare every sampling strategy on the Laplace model
evaluation_results = evaluate_all_sampling_methods(laplace_model, test_corpus, num_samples=3)

# Print up to two example generations per method
print("\n==== EXAMPLE GENERATIONS BY METHOD ====")
for method, result in evaluation_results["methods"].items():
    print(f"\n== {method.upper()} SAMPLING ==")

    # 'temperature' holds a list of runs (one per temperature); every other
    # method holds a single run, wrapped here so both share one printing loop.
    is_temperature = method == 'temperature'
    runs = result if is_temperature else [result]

    for run in runs:
        if is_temperature:
            temp = run["temperature"]
            print(f"\nTemperature: {temp}")

        examples = zip(run["samples"][:2], run["references"][:2])
        for i, (sample, reference) in enumerate(examples):
            bleu = run["bleu_scores"][i]
            print(f"\n[{i+1}] BLEU: {bleu:.4f}")
            print(f"Reference: {reference[:100]}...")
            print(f"Generated: {sample}")
            print("-" * 50)

Overall model perplexity: 18595.71

Evaluating greedy sampling...
Average BLEU: 0.2042

Evaluating random sampling...
Average BLEU: 0.0624

Evaluating topk sampling...
Average BLEU: 0.0732

Evaluating nucleus sampling...
Average BLEU: 0.0619

Evaluating temperature sampling...
Temperature 0.5 - Average BLEU: 0.0619
Temperature 1.0 - Average BLEU: 0.0619
Temperature 2.0 - Average BLEU: 0.0619

==== SAMPLING METHODS COMPARISON ====
Model perplexity: 18595.71
Greedy - Average BLEU: 0.2042
Random - Average BLEU: 0.0624
Topk - Average BLEU: 0.0732
Nucleus - Average BLEU: 0.0619
Temperature 0.5 - Average BLEU: 0.0619
Temperature 1.0 - Average BLEU: 0.0619
Temperature 2.0 - Average BLEU: 0.0619

==== EXAMPLE GENERATIONS BY METHOD ====

== GREEDY SAMPLING ==

[1] BLEU: 0.3485
Reference: ensure applicability of current quality policies , procedures , and objectives ....
Generated: ensure applicability of current quality and safety of our team.
--------------------------------------------------
