In [2]:
import pandas as pd
import numpy as np
import re
import json
from pathlib import Path
from huggingface_hub import login
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv
import os

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1" 
device = "cuda"

In [4]:
df = pd.read_csv("../data/combined_text.csv")
df.head()

Unnamed: 0,folder_path,combined_cleaned_whisper_text,named_entities
0,1272/128104,mr quilter is the apostle of the middle classe...,"[('quilter', 'PERSON'), ('quilters', 'PERSON')..."
1,1272/135031,because you were sleeping instead of conquerin...,"[('poroshaegi', 'PERSON'), ('dove', 'PERSON'),..."
2,1272/141231,a man said to the universe sir i exist sweatco...,"[('brians', 'NORP'), ('thousands', 'CARDINAL')..."
3,1462/170138,he had written a number of books himself among...,"[('shakespeare', 'PERSON'), ('ernest dousen', ..."
4,1462/170142,the last two days of the voyage bartley found ...,"[('the last two days', 'DATE'), ('bartley', 'P..."


In [6]:
load_dotenv('../.env')
hf_token = os.getenv("HUGGINGFACE_TOKEN")
login(hf_token)

# Load Mistral-7B Model & Tokenizer
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16,  
    device_map=device
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.06s/it]


In [21]:
def summarize_text(text: str, model, tokenizer, style: str = "short", max_tokens_map: dict = None):
    """
    Generate a fully-paraphrased summary in one of three styles.
    Args:
        text : Source document.
        model / tokenizer: HuggingFace model & tokenizer.
        style : {'tiny', 'short', 'long'}
            tiny  -> one sentence
            short -> one paragraph
            long  -> multi-paragraph detailed
        max_tokens_map : dict, optional. can override the default max token, e.g.
            {'tiny': 40, 'short': 170, 'long': 400}
    Return: 
        The requested summary.
    """
        
    # ------------- length-------------
    default_max = {"tiny": 40, "short": 170, "long": 400}
    if max_tokens_map:
        default_max.update(max_tokens_map)
    budget = default_max.get(style, 120)
    
    # -------------prompt-------------
    # Force explicit format to avoid "Here is a summary" phrasing
    format_example = {
        "tiny": "Example of correct start: \"Art critic Mr. Quilter questions various artists' work...\" (NOT \"Here is a summary: Art critic...\")",
        "short": "Example of correct start: \"Mr. Quilter argues that Sir Frederick Layton's work...\" (NOT \"Here is a summary: Mr. Quilter argues...\")",
        "long": "Example of correct paragraph start: \"Mr. Quilter, an advocate for the middle class...\" (NOT \"Here is a detailed summary: Mr. Quilter...\")"
    }
    
    instruction = {
        "tiny": (
            f"Write ONE straightforward sentence (15–25 words) that captures the main point of the article. "
            f"Use plain, neutral language and simple words that are easy to understand. "
            f"Be direct and concise. Do not use figurative or poetic expressions. {format_example[style]}"
        ),

        "short": (
            f"Write ONE clear paragraph (3–6 sentences) summarizing the main points. "
            f"Use factual, paraphrased language and simple, easy-to-understand words. "
            f"Begin directly with the content. Do not include figurative or poetic expressions. {format_example[style]}"
        ),

        "long": (
            f"Write a detailed summary in 2–3 paragraphs. Each paragraph should have 3–5 sentences and follow a logical flow. "
            f"Use paraphrased, factual language with simple, clear words that are easy to understand. "
            f"Avoid figurative or poetic expressions. Begin directly with the content. {format_example[style]}"
        )
        }
    
    
    messages = [
        {"role": "system", "content": 
         "You are an expert summarizer who creates direct, concise summaries without any introductory phrases. "
         "You NEVER start with phrases like 'Here is a summary' or 'This text is about' - you jump straight into the content. "
         "Write in simple words for easy understanding. "
         "Always complete sentences with proper punctuation and never cut off mid-sentence."
        },
        
        {"role": "user", "content": f"""{instruction[style]}

        CRITICAL INSTRUCTION: DO NOT start with any of these phrases:
        - "Here is..."
        - "This is a summary..."
        - "In this summary..."
        - "The text discusses..."
        - "This article is about..."
        - "This passage explains..."
        - Or ANY similar introductory phrase

        INSTEAD, begin directly with the content itself.

        **Key requirements:**
        1. Paraphrase completely—do not copy phrases.
        2. Highlight the most important points and context.
        3. DIRECTLY start with the actual content - no introductions whatsoever.
        4. Always end with a complete sentence and proper punctuation.
        5. For tiny summaries, keep it to ONE sentence under 30 words.

        **Article to summarize:**
        {text}"""
        }
    ]
    
    # Create prompt and generate summary
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(device)
    
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=budget,  
            num_beams=5,          
            temperature=0.4,      
            repetition_penalty=1.4, 
            min_length=int(budget * 0.5),
            do_sample=True,       
            top_p=0.85,  
            top_k=40,
            return_dict_in_generate=True,
            output_scores=False
        )
    
    generated_tokens = output["sequences"][:, inputs["input_ids"].shape[1]:]
    summary = tokenizer.decode(generated_tokens[0], skip_special_tokens=True).strip()
    
    # Ensure the summary ends with a complete sentence
    def ensure_complete_sentence(text):
        if not text:
            return text
            
        # If it ends with a period, question mark, or exclamation point, it's likely complete
        if text[-1] in ['.', '!', '?']:
            return text
            
        # Find the last complete sentence
        sentence_endings = [m.end() for m in re.finditer(r'[.!?]', text)]
        if sentence_endings:
            return text[:sentence_endings[-1]]
        
        # If no sentence ending is found but the text exists, return the full text
        # and add a period if it's not empty
        if text.strip():
            return text.strip() + "."
        return text
    
    # Actually use the ensure_complete_sentence function
    summary = ensure_complete_sentence(summary)
    
    return summary

In [22]:
for i in range(5):
    text = df['combined_cleaned_whisper_text'].iloc[i]  # take text from dataframe
    summary_tiny = summarize_text(text, model, tokenizer, style = "tiny")  # generate summary
    summary_short = summarize_text(text, model, tokenizer, style = "short")
    summary_long = summarize_text(text, model, tokenizer, style = "long")
    print(f"Text {i+1}: \n{text}")
    print(f"\nSummary Tiny: \n{summary_tiny}")
    print(f"\nSummary Short: \n{summary_short}")
    print(f"\nSummary Long: \n{summary_long}")
    print("\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Text 1: 
mr quilter is the apostle of the middle classes and we are glad to welcome his gospel nor is mr quilters manner less interesting than his matter he tells us that at this festive season of the year with christmas and roast beef looming before us similarly drawn from eating and its results occur most readily to the mind he has graved doubts whether sir frederick laytons work is really greek after all and can discover in it but little of rocky ithaca lynelles pictures are a sort of upgards and atom paintings and masons exquisite ittles are as national as a jingo poem mr birkett fosters landscapes smile at one much in the same way that mr carker used to flash his teeth and mr john collier gives his sitter a cheerful slap on the back before he says like a shampoo or a turkish bath next man it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression on the general principles of art and mr krilter writes with equal lucidity painting he

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Text 2: 
because you were sleeping instead of conquering the lovely rose princess has become a fiddle without a bow while poroshaegi sits there accuing dove he has gone and gone for good answered polychrome who had managed to squeeze into the room beside the dragon and had witnessed the occurrences with much interest i have remained a prisoner only because i wish to be one and with this he stepped forward and burst the stout chains as easily as if they had been threads the little girl had been asleep but she heard the raps and opened the door the king is flooded disgrace and your friends are asking for you i begged ruggedo a long ago to send him away but he would not do so i also offered to help your brother to escape but he would not go he eats and sleeps very steadily replay the nooking i hope he doesnt work too hard since shaggy he doesnt work at all in fact there is nothing he can do in these dominions as well as our norms whose numbers are so great that it worries us to keep them 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Text 3: 
a man said to the universe sir i exist sweatcovered brians body trickling into the titling class that was the only girl in the world the cut on his chest is still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators retrievalities not worth thinking about his instant panic was followed by a small sharp blow high on his chest one minute a voice said and the timbers are sounded a minute is not a very large measure of time and his body needed every fraction of it the buzzers were triggered as muscles in the complete relaxation o lees heart and lungs worked on at a strong measured rate he was in reverie sliding along the borders of consciousness the contestants in the s needed undisturbed rest therefore knights in the dormitories were as quiet as death particularly so on this last night when only two of the little cubicles were occupied the thousands of others standing with dark empty doors the other voice snapped with

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Text 4: 
he had written a number of books himself among them a history of dancing a history of costume a key to shakespeare sonnets a study of the poetry of ernest dousen etc hughes written a delightful part for her and shes quite inexpressible i happen to have mcconnells box for tonight or thered be no chance of our getting places alexander exclaimed mildly myself i always knew she had it in her do you know alexander maynall looked with perplexity up into the top of the handsome and rubbed his pink cheek with his gloved finger do you know i sometimes think of taking to criticism seriously myself when they entered the stage box on the left the first act was well underway the scene being the interior of a cabin in the south of ireland as they sat down a burst of applause drew alexanders attention to the stage of course hilda is irish their bergoins have been stage people for generations and she has the irish voice its delightful to hear it in a london theatre when she began to dance by 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Text 5: 
the last two days of the voyage bartley found almost intolerable emerging at houston at half past three oclock in the afternoon alexander had his luggage sent to the savoy and drove it once to bedford square she blushed and smiled and fumbled his card in her confusion before she ran upstairs the room was empty when he entered a coal fire was crackling in the grate and the lamps were lit for it was already beginning to grow dark outside she calls his name on the threshold but in her swift flight across the room she felt a change in him and caught herself up so deftly that he could not tell just when she did it she merely brushed his cheek with her lips and put a hand lightly and joyously on either shoulder i never dreamed it would be you bartley when did you come bartley and how did it happen you havent spoken a word she looked at his heavy shoulders and big determined head frustre forward like a catapult in leash ill do anything you wish me to bartley she said chumylisly he pu

In [23]:
from tqdm.auto import tqdm

src_col = "combined_cleaned_whisper_text"
styles = ["tiny", "short", "long"]

for style in styles:
    df[f"summary_{style}"] = [
        summarize_text(text, model, tokenizer, style=style)
        for text in tqdm(df[src_col], desc=f"Summarizing [{style}]", unit="doc")
    ]

df.to_csv("summarize_text.csv", index=False)


Summarizing [tiny]:   0%|          | 0/97 [00:00<?, ?doc/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Summarizing [short]:   0%|          | 0/97 [00:00<?, ?doc/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Summarizing [long]:   0%|          | 0/97 [00:00<?, ?doc/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for