In [1]:
import fitz
from transformers import pipeline, AutoTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk

def remove_header_footer(input_pdf_path, output_pdf_path, header_height, footer_height):
    # Open the input PDF
    document = fitz.open(input_pdf_path)
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        rect = page.rect
        # Define the crop box to remove the header and footer
        crop_box = fitz.Rect(
            rect.x0,
            rect.y0 + header_height,
            rect.x1,
            rect.y1 - footer_height
        )
        page.set_cropbox(crop_box)

    # Save the cropped PDF
    document.save(output_pdf_path)


In [2]:
def remove_captions(input_pdf_path, output_pdf_path, caption_keywords):
    # Open the input PDF
    document = fitz.open(input_pdf_path)
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        # Iterate over text blocks
        for block in blocks:
            if block["type"] == 0:  # this is a text block
                block_text = block["lines"]
                for line in block_text:
                    spans = line["spans"]
                    for span in spans:
                        text = span["text"]
                        if any(keyword.lower() in text.lower() for keyword in caption_keywords):
                            # Redact the caption
                            rect = fitz.Rect(span["bbox"])
                            page.add_redact_annot(rect, text="")
                            page.apply_redactions()

    # Save the modified PDF
    document.save(output_pdf_path)
    document.close()

In [3]:
def remove_headings(input_pdf, output_pdf, font_size_threshold=12):
    # Open the PDF file
    doc = fitz.open(input_pdf)
    
    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("dict")["blocks"]
        
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        if span["size"] > font_size_threshold:
                            # Remove the span if it is considered a heading
                            page.add_redact_annot(span["bbox"], fill=(1, 1, 1))

        # Apply redactions to the page
        page.apply_redactions()

    # Save the modified PDF to a new file
    doc.save(output_pdf)

In [4]:
input_pdf = "well_student.pdf"
cropped_pdf = "cropped_well_student.pdf"
output_pdf = "modified_well_student.pdf"
final_pdf = "final_well_student.pdf"
# Define header and footer heights (example values, adjust as needed)
header_height = 70  # height to remove from the top
footer_height = 80  # height to remove from the bottom

remove_header_footer(input_pdf, cropped_pdf, header_height, footer_height)

# Define caption keywords to identify captions (add more keywords as needed)
caption_keywords = ["Figure", "Table", "Fig.", "Tbl.", "Image"]

# Then, remove the captions from the cropped PDF
remove_captions(cropped_pdf, output_pdf, caption_keywords)

remove_headings(output_pdf, final_pdf)

In [5]:
def extract_text_from_pdf(pdf_path):
    # Open the PDF
    document = fitz.open(pdf_path)
    text = ""

    # Iterate over each page
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        # Extract text from the page
        text += page.get_text()

    return text

# Path to the modified PDF
pdf_path = "final_well_student.pdf"

# Extract text from the modified PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Print the extracted text
print(extracted_text)

Country 
India 
State/ District 
Tripura, West Tripura 
Village/ Mandal/Tehsil 
Bishalgarh 
Basin 
A & AA Basin 
Area/ Structure 
Agartala Dome 
Proposed as 
(including EPMB/ File details) 
B-AD-54 
13th BEXB dt 22-23.12.23 
Released as 
(Including order no. & date) 
ADAV 
(AAAB/BMG/Loc.Rel./30th EPMB/2021-22 dated 
31.03.2022) 
Modified order 
(AAAB/BMG/EPMB/LRO_Mod/2022-23 dated 
21.10.2022) 
Well No. (Well Name) 
AD#66 (ADAV) 
WBS Element 
AG.23E.PMLAD.AD#ADAV 
Well Type / Category 
Exploratory / “B” 
Expendable / Non-Expendable 
Non-Expendable 
Well Profile 
Inclined L-Profile 
COORDINATES 
Released_modified 
(Sub-Surface) 
Latitude:  
PROJECTION SYSTEM 
WGS-84 
Longitude:   
Northing:  
Easting:  
Staked 
Latitude:    
Longitude:  
Northing:  
Easting:  
Actual (Final Surface Location) 
Latitude:   3°3‟11.831”N 
LAMBERT26_90 
Longitude: 1°6‟1.2”E 
Northing: 72613.65 
Easting: 6297.74 
Seismic Reference 
IL: 35 XL: 㤀97181  
(Tripura Mega Merge vol) 
Offset from Released Location 
3

In [6]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Function to chunk text
def chunk_text(text, tokenizer, max_chunk_length=1024):
    tokens = tokenizer(text, return_tensors='pt', truncation=False)
    input_ids = tokens['input_ids'].squeeze().tolist()
    
    chunks = []
    for i in range(0, len(input_ids), max_chunk_length):
        chunk_ids = input_ids[i:i + max_chunk_length]
        chunks.append(tokenizer.decode(chunk_ids, skip_special_tokens=True))
    
    return chunks

# Function to summarize text
def summarize_text(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    
    if inputs['input_ids'].shape[1] > 1024:
        raise ValueError("Input text length exceeds model's max input length.")
    
    # Generate summary
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, length_penalty=2.0, max_length=1024, min_length=100, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary


# Chunk the text
text_chunks = chunk_text(extracted_text, tokenizer)

# Summarize each chunk
summaries = []
for chunk in text_chunks:
    try:
        summary = summarize_text(chunk, model, tokenizer)
        summaries.append(summary)
    except Exception as e:
        print(f"Error summarizing chunk: {e}")
        summaries.append("")

# Combine the summaries into a final summary
summary = " ".join(summaries)

print(summary)


Agartala Dome structure is the most prolific hydrocarbon accumulation in Tripura. No conventional core was cut in this well. No production was carried out in the well. The well was completed on 12:05:2023 at 17:45:45 hrs. It is located in the Bishalgarh region of West Tripura, India. It was drilled by Jindal Drilling & Industries Ltd (Unit no. 2K-1334) in the AP-36 area of Upper Bhuban. In Tripura, through remote sensing studies and systematic geological mapping, 24 structures (18 exposed and 6 concealed) have been identified. So far 18 structures have been probed. In total 11 structures (07 fields are put on production & 04 fields yet to be put onProduction) have established gas reserves in Bokabil, Upper Bhuban, Middle Bhuban and Lower Bhubans. Proven gas producing reservoirs are mainly within the facies of Mio-Pliocene and younger rocks exposed on the surface. amura Thrust, which has a spectacularly long N-S trending Baramura structure has developed on the hanging wall of this major

In [14]:
def limit_summary(text):
    # Split the text into words
    words = text.split()
    
    # Join the first 800 words into a single string
    first_800_words = ' '.join(words[:800])
    
    # Print the first 800 words
    print(first_800_words)

limited_summary = limit_summary(summary)

print(limited_summary)

Agartala Dome structure is the most prolific hydrocarbon accumulation in Tripura. No conventional core was cut in this well. No production was carried out in the well. The well was completed on 12:05:2023 at 17:45:45 hrs. It is located in the Bishalgarh region of West Tripura, India. It was drilled by Jindal Drilling & Industries Ltd (Unit no. 2K-1334) in the AP-36 area of Upper Bhuban. In Tripura, through remote sensing studies and systematic geological mapping, 24 structures (18 exposed and 6 concealed) have been identified. So far 18 structures have been probed. In total 11 structures (07 fields are put on production & 04 fields yet to be put onProduction) have established gas reserves in Bokabil, Upper Bhuban, Middle Bhuban and Lower Bhubans. Proven gas producing reservoirs are mainly
None


In [12]:
def evaluate_summary(reference_text, summary_text):
    # Tokenize the reference and summary texts
    reference_tokens = [reference_text.split()]
    summary_tokens = summary_text.split()
    # Calculate BLEU score
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(reference_tokens, summary_tokens, smoothing_function=smoothie)
    return bleu_score

bleu_score = evaluate_summary(extracted_text, limited_summary)

print("\nBLEU Score:", bleu_score)


BLEU Score: 1.1547772185971148e-30
