# N-Gram model for job description generation

In [22]:
import os
import sys
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import numpy as np


project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..')) # Adjust '..' if your notebook is deeper
if project_root not in sys.path:
    sys.path.append(project_root)

### Download necessary resources for nltk

In [23]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK 'punkt' tokenizer...")
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading NLTK 'stopwords'...")
    nltk.download('stopwords')

# Add this check and download for punkt_tab
try:
    # Check for the specific English directory within punkt_tab
    nltk.data.find('tokenizers/punkt_tab/english/') 
except LookupError:
    print("Downloading NLTK 'punkt_tab'...")
    nltk.download('punkt_tab')

print("NLTK resources checked/downloaded.")

NLTK resources checked/downloaded.


In [24]:
descriptions_df = pd.read_parquet(os.path.join(project_root, 'data', 'processed', 'cleaned_postings_modeling.parquet'))
descriptions_df

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,job description a leading real estate firm in ...,"Princeton, NJ"
1,The National Exemplar,Assitant Restaurant Manager,the national exemplar is accepting application...,"Cincinnati, OH"
2,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...,"New Hyde Park, NY"
3,Downtown Raleigh Alliance,Economic Development and Planning Intern,job summary the economic development planning ...,"Raleigh, NC"
4,Raw Cereal,Producer,company description raw cereal is a creative d...,United States
...,...,...,...,...
122119,Lozano Smith,Title IX/Investigations Attorney,our walnut creek office is currently seeking a...,"Walnut Creek, CA"
122120,Pinterest,"Staff Software Engineer, ML Serving Platform",about pinterest millions of people across the ...,United States
122121,EPS Learning,"Account Executive, Oregon/Washington",company overview eps learning is a leading k 1...,"Spokane, WA"
122122,Trelleborg Applied Technologies,Business Development Manager,the business development manager is a hunter t...,"Texas, United States"


In [25]:
def normalize_text(text, handle_eod=True):
    """
    Tokenize text with special handling for <EOD> tokens
    
    Parameters:
    - text: The input text to tokenize
    - handle_eod: How to handle EOD tokens. 
      If True, preserves them as special end-of-description markers
    """
    if not isinstance(text, str):
        return []
    
    # Handle EOD token before sentence tokenization
    if handle_eod:
        # Replace <EOD> with a special marker that won't be split
        text = text.replace('<EOD>', ' __EOD__ ')
    
    # Sentence tokenization
    sentences = sent_tokenize(text)
    
    # Word tokenization for each sentence
    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        
        # Handle the EOD special token
        if handle_eod:
            # Convert back our special marker
            tokens = ['<EOD>' if token == '__eod__' else token for token in tokens]
            
            # If this sentence contains EOD, make it the final token
            if '<EOD>' in tokens:
                eod_index = tokens.index('<EOD>')
                # Keep all tokens up to and including the EOD
                tokens = tokens[:eod_index+1]
        
        tokenized_sentences.append(tokens)
    
    return tokenized_sentences

In [26]:
from nltk.lm import Laplace, KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter
n = 1
# Create and train the model
laplace_model = Laplace(n)  # Laplace smoothing
# model.fit(train_data, padded_vocab)

### Text generation

In [27]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

def generate_with_sampling(model, num_words=100, text_seed=None, 
                          method='greedy', temp=1.0, k=10, p=0.9):
    """
    Generate text using different sampling methods:
    - 'greedy': Always choose the most likely next word
    - 'random': Sample from the full probability distribution
    - 'topk': Sample from the k most likely words
    - 'nucleus': Sample from the top words that comprise p probability mass
    - 'temperature': Apply temperature to soften/sharpen the distribution
    """
    if text_seed is None:
        text_seed = ['we', 'are', 'looking', 'for']
    else:
        text_seed = word_tokenize(text_seed.lower())
    
    context = text_seed.copy()
    output = context.copy()
    
    for _ in range(num_words):
        # Get context (last n-1 words)
        context = context[-(model.order-1):]
        
        # Get all words in vocabulary
        vocab = list(model.vocab)
        
        # Create distribution manually
        dist = {}
        for word in vocab:
            try:
                # Get score for this word given context
                score = model.score(word, context)
                if score > 0:  # Only include words with non-zero probability
                    dist[word] = score
            except:
                continue
        
        # If no words found with score > 0, try backing off
        if not dist:
            # Generate a random word from vocab as fallback
            next_word = np.random.choice(vocab)
            output.append(next_word)
            context.append(next_word)
            continue
            
        # Different sampling methods
        if method == 'greedy':
            # Get the most likely next word
            next_word = max(dist.items(), key=lambda x: x[1])[0]
            
        elif method == 'random':
            # Sample according to distribution
            words, probs = zip(*dist.items())
            total = sum(probs)
            probs = [p/total for p in probs]  # Normalize to sum to 1
            next_word = np.random.choice(words, p=probs)
            
        elif method == 'topk':
            # Sample from top k most likely words
            top_k = sorted(dist.items(), key=lambda x: x[1], reverse=True)[:min(k, len(dist))]
            words, scores = zip(*top_k)
            total = sum(scores)
            probs = [s/total for s in scores]  # Normalize to sum to 1
            next_word = np.random.choice(words, p=probs)
            
        elif method == 'nucleus':
            # Nucleus (top-p) sampling
            items = sorted(dist.items(), key=lambda x: x[1], reverse=True)
            total = sum(item[1] for item in items)
            cumulative = 0
            nucleus = []
            
            for word, score in items:
                nucleus.append((word, score))
                cumulative += score/total
                if cumulative >= p:
                    break
                    
            words, scores = zip(*nucleus)
            nucleus_total = sum(scores)
            probs = [s/nucleus_total for s in scores]
            next_word = np.random.choice(words, p=probs)
            
        elif method == 'temperature':
            # Temperature sampling
            words, scores = zip(*dist.items())
            # Convert scores to log probabilities for numerical stability
            logits = np.array([np.log(score) for score in scores])
            # Apply temperature
            logits = logits / temp
            # Convert back to probabilities
            probs = np.exp(logits)
            probs = probs / np.sum(probs)  # Normalize
            next_word = np.random.choice(words, p=probs)
        
        output.append(next_word)
        context.append(next_word)
        
        # if next_word in ['.', '!', '?']:
        #     break
        if next_word == '<EOD>':
            output.append(next_word)
            break
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(output)

### Evaluation metrics

In [28]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.lm.vocabulary import Vocabulary


def calculate_perplexity(model, test_sentences):
    """Calculate perplexity of the model on test data"""
    perplexities = []
    for sentence in test_sentences:
        try:
            # Use n-grams from the test sentence
            test_ngrams = list(ngrams(sentence, model.order))
            if test_ngrams:
                perplexity = model.perplexity(test_ngrams)
                perplexities.append(perplexity)
        except Exception as e:
            # print(f"Error calculating perplexity for sentence: {sentence}. Error: {e}")
            continue
    
    return np.mean(perplexities) if perplexities else float('inf')

from rouge_score import rouge_scorer
import bert_score

def evaluate_model(model, test_data, num_samples=20, seed_length=5, method='greedy'):
    """
    Evaluate the model using multiple metrics:
    - Perplexity: How well the model predicts the test data
    - BLEU: N-gram precision between generated and reference text
    - ROUGE: N-gram recall between generated and reference text
    - BERT Score: Semantic similarity using contextual embeddings
    
    Uses the first seed_length words of reference sentences to 
    generate samples, then compares against the original reference.
    """
    # Calculate perplexity
    perplexity = calculate_perplexity(model, test_data)
    
    # Initialize scorers
    rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)
    
    # Prepare for scores
    bleu_scores = []
    rouge_scores = {metric: [] for metric in rouge_metrics}
    bert_scores = {'precision': [], 'recall': [], 'f1': []}
    samples = []
    references = []
    
    # Randomly select sentences with sufficient length
    valid_sentences = [sent for sent in test_data if len(sent) > seed_length]
    
    if len(valid_sentences) < num_samples:
        print(f"Warning: Only {len(valid_sentences)} valid sentences found. Using all of them.")
        sample_sentences = valid_sentences
    else:
        # Use indices for random selection
        indices = np.random.choice(len(valid_sentences), num_samples, replace=False)
        sample_sentences = [valid_sentences[i] for i in indices]
    
    # Generate and evaluate each sample
    for reference in sample_sentences:
        # Get the seed (first few words)
        seed = ' '.join(reference[:seed_length])
        
        # Generate text using this seed
        generated = generate_with_sampling(model, num_words=50, text_seed=seed, method=method)
        
        # Store samples and references
        samples.append(generated)
        reference_text = ' '.join(reference)
        references.append(reference_text)
        
        # Calculate BLEU score
        hypothesis = word_tokenize(generated)
        smoothing = SmoothingFunction().method1
        
        try:
            # BLEU score calculation
            bleu = sentence_bleu([reference], hypothesis, smoothing_function=smoothing)
            bleu_scores.append(bleu)
            
            # ROUGE score calculation
            rouge_results = scorer.score(reference_text, generated)
            for metric in rouge_metrics:
                rouge_scores[metric].append(rouge_results[metric].fmeasure)
                
            # Batch BERT scores for efficiency
            if len(samples) % 8 == 0 or len(samples) == len(sample_sentences):
                # Calculate BERT score in batches to improve efficiency
                P, R, F1 = bert_score.score(samples[-8:] if len(samples) % 8 == 0 else samples, 
                                           references[-8:] if len(samples) % 8 == 0 else references,
                                           lang="en", verbose=False)
                bert_scores['precision'].extend(P.tolist())
                bert_scores['recall'].extend(R.tolist())
                bert_scores['f1'].extend(F1.tolist())
                
        except Exception as e:
            print(f"Error calculating metrics: {e}")
    
    # Calculate averages
    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0
    avg_rouge = {metric: np.mean(scores) if scores else 0.0 for metric, scores in rouge_scores.items()}
    avg_bert = {metric: np.mean(scores) if scores else 0.0 for metric, scores in bert_scores.items()}
    
    return {
        "perplexity": perplexity,
        "avg_bleu": avg_bleu,
        "avg_rouge": avg_rouge,
        "avg_bert": avg_bert,
        "samples": samples,
        "references": references,
        "bleu_scores": bleu_scores,
        "rouge_scores": rouge_scores,
        "bert_scores": bert_scores
    }

### Train and evaluate on subset of data

In [29]:
# Split data into train and test
from sklearn.model_selection import train_test_split

# Take a random subset of the data (adjust size as needed)
subset_size = 30000  # Try 5k-20k descriptions for a good balance
descriptions_subset = descriptions_df.sample(n=min(subset_size, len(descriptions_df)))

train_df, test_df = train_test_split(descriptions_subset, test_size=0.1, random_state=42)

# Train model on training data
train_corpus = []
for desc in train_df['description'].dropna():
    tokenized_text = normalize_text(desc)
    for sentence in tokenized_text:
        train_corpus.append(sentence)

train_data, padded_vocab = padded_everygram_pipeline(n, train_corpus)
laplace_model.fit(train_data, padded_vocab)

### Evaluate Laplace Model with `Rouge` / `Bleu` / `Bert` Scores

In [30]:
# Evaluate on test data
test_corpus = []
for desc in test_df['description'].dropna():
    tokenized_text = normalize_text(desc)
    for sentence in tokenized_text:
        test_corpus.append(sentence)

eval_results = evaluate_model(laplace_model, test_corpus, num_samples=5, method='topk')

# Display overall metrics with all scores
print(f"Perplexity: {eval_results['perplexity']:.2f}")
print(f"Average BLEU score: {eval_results['avg_bleu']:.4f}")
print(f"Average ROUGE-1: {eval_results['avg_rouge']['rouge1']:.4f}")
print(f"Average ROUGE-2: {eval_results['avg_rouge']['rouge2']:.4f}")
print(f"Average ROUGE-L: {eval_results['avg_rouge']['rougeL']:.4f}")
print(f"Average BERT-Precision: {eval_results['avg_bert']['precision']:.4f}")
print(f"Average BERT-Recall: {eval_results['avg_bert']['recall']:.4f}")
print(f"Average BERT-F1: {eval_results['avg_bert']['f1']:.4f}")

print("\nSample generations:")

# Display individual sample results with detailed metrics
for i, (sample, reference, bleu) in enumerate(zip(
    eval_results['samples'], 
    eval_results['references'], 
    eval_results['bleu_scores']
)):
    print(f"\n[{i+1}] Metrics:")
    print(f"BLEU: {bleu:.4f}")
    
    # Add ROUGE scores for this sample
    for metric in ['rouge1', 'rouge2', 'rougeL']:
        if i < len(eval_results['rouge_scores'][metric]):
            print(f"ROUGE-{metric[-1] if metric != 'rougeL' else 'L'}: {eval_results['rouge_scores'][metric][i]:.4f}")
    
    # Add BERT scores for this sample
    for metric in ['precision', 'recall', 'f1']:
        if i < len(eval_results['bert_scores'][metric]):
            print(f"BERT-{metric.capitalize()}: {eval_results['bert_scores'][metric][i]:.4f}")
    
    print(f"\nReference: {reference[:100]}...")
    print(f"Generated: {sample}")
    print("-" * 50)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Perplexity: 2649.00
Average BLEU score: 0.0640
Average ROUGE-1: 0.1307
Average ROUGE-2: 0.1061
Average ROUGE-L: 0.1307
Average BERT-Precision: 0.7215
Average BERT-Recall: 0.8141
Average BERT-F1: 0.7648

Sample generations:

[1] Metrics:
BLEU: 0.0619
ROUGE-1: 0.1250
ROUGE-2: 0.1026
ROUGE-L: 0.1250
BERT-Precision: 0.7241
BERT-Recall: 0.8104
BERT-F1: 0.7648

Reference: overview the territory field admissions representatives will have the distinct pleasure of finding p...
Generated: overview the territory field admissions store denver 1001 colorado 2744086 2744086 1001 80203, colorado store, colorado 80203 broadway store colorado 1001 1001 2744086 colorado availability 80203, 1001 shift, colorado 1001 1001 80203 store store 1001 availability, 2744086 denver colorado broadway colorado store, 1001 availability shift shift 2744086 broadway 2744086
--------------------------------------------------

[2] Metrics:
BLEU: 0.0696
ROUGE-1: 0.1020
ROUGE-2: 0.0833
ROUGE-L: 0.1020
BERT-Precision: 0.730

### Compare sampling methods

In [31]:
def evaluate_all_sampling_methods(model, test_data, num_samples=5, seed_length=5):
    """
    Evaluate all sampling methods and compare their performance using multiple metrics
    """
    # Calculate model perplexity (independent of sampling method)
    perplexity = calculate_perplexity(model, test_data)
    print(f"Overall model perplexity: {perplexity:.2f}")
    
    methods = ['greedy', 'random', 'topk', 'nucleus', 'temperature']
    results = {}
    
    # Select common test sentences across all methods
    valid_sentences = [sent for sent in test_data if len(sent) > seed_length]
    
    if len(valid_sentences) < num_samples:
        print(f"Warning: Only {len(valid_sentences)} valid sentences found. Using all of them.")
        sample_sentences = valid_sentences
    else:
        indices = np.random.choice(len(valid_sentences), num_samples, replace=False)
        sample_sentences = [valid_sentences[i] for i in indices]
    
    # Initialize ROUGE scorer once
    rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)
    
    # Evaluate each method
    for method in methods:
        print(f"\nEvaluating {method} sampling...")
        
        # Special handling for temperature
        if method == 'temperature':
            temps = [0.5, 1.0, 2.0]
            method_results = []
            
            for temp in temps:
                result = evaluate_with_metrics(model, sample_sentences, seed_length, method, temp, scorer)
                print(f"Temperature {temp} - Metrics: BLEU={result['avg_bleu']:.4f}, ROUGE-L={result['avg_rouge']['rougeL']:.4f}, BERT-F1={result['avg_bert']['f1']:.4f}")
                method_results.append(result)
            
            results[method] = method_results
            
        else:
            result = evaluate_with_metrics(model, sample_sentences, seed_length, method, scorer=scorer)
            print(f"Average scores: BLEU={result['avg_bleu']:.4f}, ROUGE-L={result['avg_rouge']['rougeL']:.4f}, BERT-F1={result['avg_bert']['f1']:.4f}")
            results[method] = result
    
    # Print summary comparison
    print("\n==== SAMPLING METHODS COMPARISON ====")
    print(f"Model perplexity: {perplexity:.2f}")
    
    for method in methods:
        if method == 'temperature':
            for temp_result in results[method]:
                print(f"Temperature {temp_result['temperature']}: " + 
                     f"BLEU={temp_result['avg_bleu']:.4f}, " + 
                     f"ROUGE-L={temp_result['avg_rouge']['rougeL']:.4f}, " +
                     f"BERT-F1={temp_result['avg_bert']['f1']:.4f}")
        else:
            res = results[method]
            print(f"{method.capitalize()}: " + 
                 f"BLEU={res['avg_bleu']:.4f}, " + 
                 f"ROUGE-L={res['avg_rouge']['rougeL']:.4f}, " +
                 f"BERT-F1={res['avg_bert']['f1']:.4f}")
    
    return {"perplexity": perplexity, "methods": results}

def evaluate_with_metrics(model, sample_sentences, seed_length, method, temp=1.0, scorer=None):
    """Helper function that evaluates a specific method with all metrics"""
    bleu_scores = []
    rouge_scores = {metric: [] for metric in ['rouge1', 'rouge2', 'rougeL']}
    bert_scores = {'precision': [], 'recall': [], 'f1': []}
    samples = []
    references = []
    
    # Create ROUGE scorer if not provided
    if scorer is None:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Generate and evaluate for each selected sentence
    for reference in sample_sentences:
        # Get seed
        seed = ' '.join(reference[:seed_length])
        
        # Generate text
        generated = generate_with_sampling(model, num_words=50, 
                                         text_seed=seed, 
                                         method=method,
                                         temp=temp)
        
        # Calculate metrics
        hypothesis = word_tokenize(generated)
        smoothing = SmoothingFunction().method1
        reference_text = ' '.join(reference)
        
        try:
            # BLEU score
            bleu = sentence_bleu([reference], hypothesis, smoothing_function=smoothing)
            bleu_scores.append(bleu)
            
            # ROUGE scores
            rouge_results = scorer.score(reference_text, generated)
            for metric in rouge_scores.keys():
                rouge_scores[metric].append(rouge_results[metric].fmeasure)
            
            samples.append(generated)
            references.append(reference_text)
            
        except Exception as e:
            print(f"Error calculating metrics: {e}")
            continue
    
    # Calculate BERT scores (batch computation for efficiency)
    if samples:
        P, R, F1 = bert_score.score(samples, references, lang="en", verbose=False)
        bert_scores['precision'] = P.tolist()
        bert_scores['recall'] = R.tolist()
        bert_scores['f1'] = F1.tolist()
    
    # Calculate averages
    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0
    avg_rouge = {metric: np.mean(scores) if scores else 0.0 for metric, scores in rouge_scores.items()}
    avg_bert = {metric: np.mean(scores) if scores else 0.0 for metric, scores in bert_scores.items()}
    
    result = {
        "temperature": temp if method == 'temperature' else None,
        "avg_bleu": avg_bleu,
        "avg_rouge": avg_rouge,
        "avg_bert": avg_bert,
        "samples": samples,
        "references": references,
        "bleu_scores": bleu_scores,
        "rouge_scores": rouge_scores,
        "bert_scores": bert_scores
    }
    
    return result

In [32]:
# Evaluate all sampling methods
evaluation_results = evaluate_all_sampling_methods(laplace_model, test_corpus, num_samples=3)

def display_evaluation_results(results, num_examples=2):
    """Display evaluation results with all metrics"""
    for method, result in results["methods"].items():
        print(f"\n== {method.upper()} SAMPLING ==")
        
        if method == 'temperature':
            for temp_result in result:
                temp = temp_result["temperature"]
                print(f"\nTemperature: {temp}")
                print(f"BLEU: {temp_result['avg_bleu']:.4f}")
                print(f"ROUGE-1: {temp_result['avg_rouge']['rouge1']:.4f}")
                print(f"ROUGE-2: {temp_result['avg_rouge']['rouge2']:.4f}")
                print(f"ROUGE-L: {temp_result['avg_rouge']['rougeL']:.4f}")
                print(f"BERT-F1: {temp_result['avg_bert']['f1']:.4f}")
                
                for i, (sample, reference) in enumerate(zip(temp_result["samples"][:num_examples], 
                                                        temp_result["references"][:num_examples])):
                    print(f"\n[{i+1}]")
                    print(f"Reference: {reference[:100]}...")
                    print(f"Generated: {sample}")
                    print("-" * 50)
        else:
            print(f"BLEU: {result['avg_bleu']:.4f}")
            print(f"ROUGE-1: {result['avg_rouge']['rouge1']:.4f}")
            print(f"ROUGE-2: {result['avg_rouge']['rouge2']:.4f}")
            print(f"ROUGE-L: {result['avg_rouge']['rougeL']:.4f}")
            print(f"BERT-F1: {result['avg_bert']['f1']:.4f}")
            
            for i, (sample, reference) in enumerate(zip(result["samples"][:num_examples], 
                                                    result["references"][:num_examples])):
                print(f"\n[{i+1}]")
                print(f"Reference: {reference[:100]}...")
                print(f"Generated: {sample}")
                print("-" * 50)

# Display evaluation results
display_evaluation_results(evaluation_results, num_examples=2)

Overall model perplexity: 2649.00

Evaluating greedy sampling...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average scores: BLEU=0.0619, ROUGE-L=0.1257, BERT-F1=0.7409

Evaluating random sampling...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average scores: BLEU=0.0619, ROUGE-L=0.1092, BERT-F1=0.7548

Evaluating topk sampling...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average scores: BLEU=0.0696, ROUGE-L=0.1324, BERT-F1=0.7443

Evaluating nucleus sampling...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average scores: BLEU=0.0619, ROUGE-L=0.1237, BERT-F1=0.7637

Evaluating temperature sampling...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Temperature 0.5 - Metrics: BLEU=0.0619, ROUGE-L=0.1064, BERT-F1=0.7639


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Temperature 1.0 - Metrics: BLEU=0.0624, ROUGE-L=0.0984, BERT-F1=0.7599


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Temperature 2.0 - Metrics: BLEU=0.0619, ROUGE-L=0.1056, BERT-F1=0.7540

==== SAMPLING METHODS COMPARISON ====
Model perplexity: 2649.00
Greedy: BLEU=0.0619, ROUGE-L=0.1257, BERT-F1=0.7409
Random: BLEU=0.0619, ROUGE-L=0.1092, BERT-F1=0.7548
Topk: BLEU=0.0696, ROUGE-L=0.1324, BERT-F1=0.7443
Nucleus: BLEU=0.0619, ROUGE-L=0.1237, BERT-F1=0.7637
Temperature 0.5: BLEU=0.0619, ROUGE-L=0.1064, BERT-F1=0.7639
Temperature 1.0: BLEU=0.0624, ROUGE-L=0.0984, BERT-F1=0.7599
Temperature 2.0: BLEU=0.0619, ROUGE-L=0.1056, BERT-F1=0.7540

== GREEDY SAMPLING ==
BLEU: 0.0619
ROUGE-1: 0.1257
ROUGE-2: 0.1032
ROUGE-L: 0.1257
BERT-F1: 0.7409

[1]
Reference: all qualified applicants will receive consideration for employment without regard to race , color , ...
Generated: all qualified applicants will receive store store store store store store store store store store store store store store store store store store store store store store store store store store store store store store store store store store s