In [5]:
import csv

# Filter the raw compression CSV: keep a row only when its fifth column is
# non-empty and its prompt (first column) has not been seen before, then write
# the surviving rows, under the original header, to a new CSV.
input_path = "training_data/dolly-15k-compressions-gpt-test.csv"
output_path = "training_data/dolly-summarization-data.csv"

unique_prompts = set()
cleaned_rows = []

# newline="" hands line-ending handling to the csv module, so line breaks that
# sit inside quoted fields are read back unchanged (see the csv module docs).
with open(input_path, "r", encoding="utf-8", newline="") as infile:
    reader = csv.reader(infile)
    header = next(reader)
    for row in reader:
        # csv.reader yields [] for blank lines, and a truncated row has no
        # fifth column. Such rows are incomplete by definition, so skip them
        # rather than crash with IndexError on row[0] / row[4].
        if len(row) < 5:
            continue
        prompt = row[0]
        # Only keep if fifth column is not empty and prompt is unique
        if row[4].strip() and prompt not in unique_prompts:
            cleaned_rows.append(row)
            unique_prompts.add(prompt)

# QUOTE_ALL quotes every field on output.
with open(output_path, "w", encoding="utf-8", newline="") as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
    writer.writerow(header)
    writer.writerows(cleaned_rows)

print(f"Cleaned data written to {output_path}. {len(cleaned_rows)} unique, complete rows retained.")

Cleaned data written to training_data/dolly-summarization-data.csv. 14779 unique, complete rows retained.


In [8]:
import csv

# Sum the integer token counts found in the third and fourth columns of the
# cleaned dataset and report the overall ratio (fourth-column total divided by
# third-column total).
input_path = "training_data/dolly-summarization-data.csv"

row_count = 0
total_original_tokens = 0
total_compression_tokens = 0

with open(input_path, "r", encoding="utf-8") as infile:
    reader = csv.reader(infile)
    header = next(reader)  # consume the header record so it is not counted
    for row in reader:
        try:
            # Parse both counts before touching any running total, so a bad
            # row contributes nothing at all.
            orig_tokens, comp_tokens = int(row[2]), int(row[3])
        except (ValueError, IndexError):
            # skip rows with missing or invalid token counts
            continue
        total_original_tokens += orig_tokens
        total_compression_tokens += comp_tokens
        row_count += 1

# With no original tokens the ratio is undefined; report None rather than
# dividing by zero.
overall_compression_ratio = (
    round(total_compression_tokens / total_original_tokens, 4)
    if total_original_tokens > 0
    else None
)

for summary_line in (
    f"Total rows: {row_count}",
    f"Total original tokens: {total_original_tokens}",
    f"Total compression tokens: {total_compression_tokens}",
    f"Overall compression ratio (column 5): {overall_compression_ratio}",
):
    print(summary_line)

Total rows: 14779
Total original tokens: 236923
Total compression tokens: 177070
Overall compression ratio (column 5): 0.7474


In [None]:
# Install rouge-score if not already available
!pip install --quiet rouge-score

In [9]:
import csv
from rouge_score import rouge_scorer

# Score each row's first column (passed to the scorer as the reference text)
# against its second column (passed as the candidate) with ROUGE-1, ROUGE-2 and
# ROUGE-L, and write every row back out with the three F-measures appended.
input_path = "training_data/dolly-summarization-data.csv"
output_path = "training_data/dolly-summarization-data-rouge.csv"

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Both files are opened with newline="" so the csv module controls line
# endings. Without it on the input side, "\r\n" or "\r" inside a quoted field
# is translated to "\n" on read and the copied row no longer matches its
# source.
with open(input_path, "r", encoding="utf-8", newline="") as infile, \
        open(output_path, "w", encoding="utf-8", newline="") as outfile:
    reader = csv.reader(infile)
    header = next(reader)
    # Add new columns for ROUGE scores
    new_header = header + ["rouge_1", "rouge_2", "rouge_l"]
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
    writer.writerow(new_header)
    for row in reader:
        original = row[0]
        compressed = row[1]
        # Compute ROUGE scores; only the F-measure of each metric is kept,
        # rounded to four decimal places.
        scores = scorer.score(original, compressed)
        rouge_1 = round(scores['rouge1'].fmeasure, 4)
        rouge_2 = round(scores['rouge2'].fmeasure, 4)
        rouge_l = round(scores['rougeL'].fmeasure, 4)
        writer.writerow(row + [rouge_1, rouge_2, rouge_l])

print(f"ROUGE scores added. Output written to {output_path}.")

ROUGE scores added. Output written to training_data/dolly-summarization-data-rouge.csv.


In [10]:
import csv

# Average the three ROUGE columns of the scored dataset. The columns are
# located by header name, so their position in the file does not matter.
input_path = "training_data/dolly-summarization-data-rouge.csv"

total_rouge_1 = 0.0
total_rouge_2 = 0.0
total_rouge_l = 0.0
row_count = 0

# newline="" lets the csv module handle line endings itself, as its
# documentation asks for files passed to csv.reader.
with open(input_path, "r", encoding="utf-8", newline="") as infile:
    reader = csv.reader(infile)
    header = next(reader)
    # header.index raises ValueError if an expected ROUGE column is absent.
    rouge_1_idx = header.index("rouge_1")
    rouge_2_idx = header.index("rouge_2")
    rouge_l_idx = header.index("rouge_l")
    for row in reader:
        try:
            # Parse all three scores BEFORE updating any total. Accumulating
            # inside the try would let a row with a valid rouge_1 but a bad or
            # missing rouge_2 / rouge_l inflate total_rouge_1 without being
            # counted in row_count, skewing the averages.
            score_1 = float(row[rouge_1_idx])
            score_2 = float(row[rouge_2_idx])
            score_l = float(row[rouge_l_idx])
        except (ValueError, IndexError):
            continue  # skip rows with missing or invalid scores
        total_rouge_1 += score_1
        total_rouge_2 += score_2
        total_rouge_l += score_l
        row_count += 1

# With no usable rows the averages are undefined; report None instead of
# dividing by zero.
if row_count > 0:
    avg_rouge_1 = round(total_rouge_1 / row_count, 4)
    avg_rouge_2 = round(total_rouge_2 / row_count, 4)
    avg_rouge_l = round(total_rouge_l / row_count, 4)
else:
    avg_rouge_1 = avg_rouge_2 = avg_rouge_l = None

print(f"Total rows: {row_count}")
print(f"Average ROUGE-1: {avg_rouge_1}")
print(f"Average ROUGE-2: {avg_rouge_2}")
print(f"Average ROUGE-L: {avg_rouge_l}")

Total rows: 14779
Average ROUGE-1: 0.722
Average ROUGE-2: 0.5207
Average ROUGE-L: 0.6755
