https://metatext.io/models/remi-bertabs-finetuned-cnndm-extractive-abstractive-summarization

# Preparation


In [None]:
# Install the third-party packages used by the cells below:
#   transformers               - Hugging Face tokenizers, models and pipelines
#   bert-extractive-summarizer - provides the `summarizer` module (extractive BERT summaries)
#   sentencepiece              - tokenization support (presumably required by the T5 tokenizer; confirm)
!pip install transformers
!pip install bert-extractive-summarizer
!pip install sentencepiece

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably the ticket data and fine-tuned model live on Drive - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# NOTE(review): the timing code in this notebook imports the standard-library
# `timeit` module; confirm whether the pytest-timeit plugin is actually needed.
!pip install pytest-timeit

# Summarize with a chosen model

In [None]:
# Import essential libraries
import json              # for reading JSON comment data
import os                # for file path handling
import regex as re       # for text processing
import timeit            # for optional performance benchmarking

# Hugging Face and Summarizer libraries
from transformers import AutoTokenizer, pipeline, AutoConfig, AutoModel, BertTokenizer
from summarizer import Summarizer  # For extractive BERT-based summarization

# Prompt until the user enters one of the supported (case-sensitive) model keys
user_input = ""
while user_input not in ["T5", "BART", "bert-ext", "bert-ext-sci", "bert-ext-bug"]:
    user_input = input("Choose model: T5, BART, bert-ext-sci, bert-ext, bert-ext-bug")

# Placeholder for processed sentence inputs
sentencelist = []

# `path` is needed below (fine-tuned model location and `pathmodel`), but it is
# otherwise only assigned inside main(), which runs after this cell. Ask for it
# here when it is not defined yet so this cell does not fail with a NameError.
if "path" not in globals():
    path = input("What is the filepath to your tickets descriptions? ")

# Model initialization based on selected model type.
# Every branch defines `sum_model` (the summarizer) and `tokenizer`
# (used by clean_data() to measure sentence length in tokens).
if user_input == "T5":
    # T5 abstractive summarization model via Hugging Face pipeline
    sum_model = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="pt")
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

elif user_input == "BART":
    # Abstractive summarization with the pipeline's default checkpoint.
    # NOTE(review): no model is passed to pipeline(), while the tokenizer below
    # is facebook/bart-large - confirm the two are meant to differ.
    sum_model = pipeline("summarization")
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')

elif user_input == "bert-ext-sci":
    # Extractive summarization using SciBERT (AllenAI)
    custom_config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased')
    custom_config.output_hidden_states = True  # enabled for the custom model handed to Summarizer
    tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    custom_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', config=custom_config)
    sum_model = Summarizer(custom_model=custom_model, custom_tokenizer=tokenizer)

elif user_input == "bert-ext":
    # Extractive summarization using the Summarizer defaults
    sum_model = Summarizer()
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

elif user_input == "bert-ext-bug":
    # Custom fine-tuned BERT model, loaded from <path>/Fine_Tuned_BertForMaskedLM
    custom_config = AutoConfig.from_pretrained(path + '/Fine_Tuned_BertForMaskedLM/config.json')
    custom_config.output_hidden_states = True  # enabled for the custom model handed to Summarizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    custom_model = AutoModel.from_pretrained(path + '/Fine_Tuned_BertForMaskedLM/', config=custom_config)
    sum_model = Summarizer(custom_model=custom_model, custom_tokenizer=tokenizer)

# Save path configuration for chosen model
pathmodel = path + "/" + user_input

In [None]:
def clean_data(text, tokenize=None):
    """
    Clean raw input text before it is passed to a summarization model.

    The text is split into sentences on ". ", "! ", "? " and newlines.
    A sentence is dropped when it:
    - contains noise such as <, >, {, }, two digits followed by ":" (12:30),
      size-like tokens (1080x1920), "::", ":" followed by a digit, or "@@"
    - is 8 characters or shorter once the removals below have been applied
    - produces more than 512 tokens

    From each kept sentence a leading prefix that contains a "/" (up to a
    following space), "==", "--" and @mentions are removed. Every kept sentence
    is emitted terminated by ". ", so consecutive sentences never run together.

    Parameters:
        text (str): Raw text to clean.
        tokenize (callable, optional): Function mapping a sentence to a sequence
            of tokens. Defaults to the `tokenize` method of the module-level
            `tokenizer`.

    Returns:
        str: cleaned, filtered, and formatted text
    """
    if tokenize is None:
        # Fall back to the tokenizer chosen when the model was initialised.
        tokenize = tokenizer.tokenize

    # Sentence boundaries: punctuation followed by a space, or a newline
    splitter = re.compile('[.!?] |\n')
    # Sentences containing any of these are discarded entirely
    noise = re.compile("[<>{}]|[0-9][0-9]:|[0-9]+x[0-9]+|::|:[0-9]|@@")
    # These fragments are removed from sentences that are kept
    removable = re.compile("^(.*/)([^/]*) |==|--|@[A-Za-z0-9]+")

    kept = []
    for sentence in splitter.split(text):
        if noise.search(sentence):
            continue

        sentence = removable.sub('', sentence)

        # Cheap length check first so the tokenizer only runs on real candidates
        if len(sentence) <= 8:
            continue
        if len(tokenize(sentence)) > 512:
            continue

        # A sentence followed by a newline keeps its own period; it still needs
        # the trailing space, otherwise it is glued to the next sentence.
        if sentence[-1] == ".":
            kept.append(sentence + " ")
        else:
            kept.append(sentence + ". ")

    return "".join(kept)

In [None]:
def _format_commit_messages(results, bug_pattern):
    """Return the last three commit messages, one per line, with bug metadata removed."""
    # Slicing with [-3:] already yields every message when fewer than three exist.
    recent = list(results.values())[-3:]
    return "".join(re.sub(bug_pattern, '', message) + "\n" for message in recent)


def summarize(filenr, description, links):
    """
    Generates a formatted summary based on a bug description and related commit messages.

    The commit messages are read from ``<pathsolution>/<filenr>.json`` (key
    ``commit_messages``). The description is summarized with the module-level
    ``sum_model``; which call style is used depends on the module-level
    ``user_input`` model key.

    Parameters:
        filenr (str): File identifier used to load the corresponding commit JSON.
        description (str): Natural language bug report or issue description.
        links (list): List of URLs related to the bug (not used in current version).

    Returns:
        str: "Description" and "Solution" sections combined into one text;
        "no commit" when the commit file cannot be parsed or summarization
        fails; an empty string when the abstractive model raises IndexError
        or ``user_input`` is not a known model key.

    Raises:
        OSError: if the commit JSON file cannot be opened.
    """
    # Cleans metadata like "Bug 1234 - " or reviewer IDs
    bug_pattern = re.compile(r'Bug [0-9]+ - |r=[A-Za-z]+')

    # Keys offered by the model-selection prompt, followed by the older short
    # names this function used to check (kept for backward compatibility).
    abstractive_models = ("T5", "BART", "abs", "t5", "absft")
    extractive_models = ("bert-ext", "bert-ext-sci", "bert-ext-bug",
                         "ext", "ext2", "sci", "ft", "ft2", "ft3")

    summary = ""

    # Load commit messages related to this file number
    with open(pathsolution + '/' + filenr + ".json", 'r') as json_file:
        try:
            data = json.load(json_file)
            results = data['commit_messages']

            # Abstractive summarization branch
            if user_input in abstractive_models:
                try:
                    # Summarize the issue description with the transformer pipeline
                    summary += 'Description: \n' + sum_model(description, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
                    summary += "\n Solution: \n"
                    summary += _format_commit_messages(results, bug_pattern)
                except IndexError:
                    print("too long")

            # Extractive summarization branch
            elif user_input in extractive_models:
                summary += 'Description: \n' + sum_model(description, num_sentences=2)
                summary += "\n Solution: \n"
                summary += _format_commit_messages(results, bug_pattern)

        except Exception as error:
            # Best effort: report the problem and let the caller skip this ticket
            print("no summary for", filenr, "-", repr(error))
            summary = "no commit"

    return summary


In [None]:
def generate_json(file, summary, header, destination):
    """
    Creates a structured JSON file containing a summarized bug report.

    Parameters:
        file (str): Base filename (typically an ID or index). A trailing
            ".json" extension is tolerated and removed.
        summary (str): Generated summary text for the issue.
        header (str): Original issue title or bug report header.
        destination (str): Path to the directory where the JSON file should be saved.

    Output:
        Writes ``<destination>/<id>.json`` with structure:
        {
            "id": <file without ".json">,
            "header": <header>,
            "summary": <summary>
        }
    """
    # str.strip('.json') would remove any of the characters ".", "j", "s", "o",
    # "n" from BOTH ends (e.g. "son123" -> "123"); only drop a real suffix.
    extension = '.json'
    base_name = file[:-len(extension)] if file.endswith(extension) else file

    data = {
        'id': base_name,
        'header': header,
        'summary': summary,
    }

    # Write to output file in specified directory
    with open(destination + '/' + base_name + extension, 'w') as json_file:
        json.dump(data, json_file)


In [None]:
def summarize_ticket(json_file):
    """
    Turn one bug-report JSON into a summary file.

    Parameters:
        json_file (file-like object): Opened JSON file containing bug report and comments.

    Behavior:
        - Reads the raw text of the first comment and uses it as the description
        - Strips http/https URLs from it and cleans it with clean_data()
        - Passes the result, together with the ticket id, to summarize()
        - Unless summarize() reports "no commit", stores the summary and the
          ticket's 'summary' field (used as header) via generate_json() in
          <path>/<user_input>
    """
    ticket = json.load(json_file)

    # The first comment serves as the description; URLs are removed from it
    url_pattern = re.compile(r"(?P<url>https?://[^\s]+)")
    first_comment = ticket['comments'][0]['raw_text']
    washed_description = clean_data(url_pattern.sub('', first_comment))

    # Links are not collected; the empty value only satisfies summarize()'s signature
    links = ''

    ticket_id = str(ticket['id'])
    summary = summarize(ticket_id, washed_description, links)

    # Nothing is written when no commit data was available
    if summary == "no commit":
        return

    output_dir = path + '/' + user_input
    generate_json(ticket_id, summary, ticket['summary'], output_dir)



In [None]:
file = ""  # Tracks current filename being processed (globally scoped for reuse in summarizer)


def main():
    """
    Main control flow:
    - Prompts user for filepaths to description and solution datasets
      (stored in the globals `path` and `pathsolution`)
    - Creates an output directory named after the selected summarization model
      inside the description path
    - Iterates over all regular files in the description path
    - Processes each using summarize_ticket()
    """
    global file
    global path
    global pathsolution

    # Prompt user to input paths for descriptions and commit solutions
    path = input("What is the filepath to your tickets descriptions? ")
    pathsolution = input("What is the filepath to your tickets solutions? ")

    # Create output directory for summaries if it doesn't already exist
    try:
        os.mkdir(path + "/" + user_input)
    except FileExistsError:
        print('Directory already created')

    # Iterate through all entries in the description path
    for file in os.listdir(path):
        ticket_path = os.path.join(path, file)

        # Skip folders (including the output directory created above) with an
        # explicit check: the exception raised when opening a directory is
        # platform-dependent, so catching IsADirectoryError is not portable.
        if not os.path.isfile(ticket_path):
            continue

        with open(ticket_path) as json_file:
            summarize_ticket(json_file)

# Time one complete run of main() and report how long it took
starttime = timeit.default_timer()
main()
elapsed = timeit.default_timer() - starttime
print("The time difference is :", elapsed)
