<a href="https://colab.research.google.com/github/fhariyaaaaa/Analysis-and-prediction-of-homevalues/blob/main/TensorFlow_with_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TensorFlow with GPU

This notebook provides an introduction to computing on a [GPU](https://cloud.google.com/gpu) in Colab. In this notebook you will connect to a GPU, and then run some basic TensorFlow operations on both the CPU and a GPU, observing the speedup provided by using the GPU.


## Enabling and testing the GPU

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

Next, we'll confirm that we can connect to the GPU with TensorFlow:

In [None]:
import tensorflow as tf
# Look up the GPU device name and stop the notebook unless it is the
# expected first GPU ('/device:GPU:0').
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

TensorFlow 2.x selected.
Found GPU at: /device:GPU:0


## Observe TensorFlow speedup on GPU relative to CPU

This example constructs a typical convolutional neural network layer over a
random image and manually places the resulting ops on either the CPU or the GPU
to compare execution speed.

In [None]:
import tensorflow as tf
import timeit

# Refuse to benchmark without a GPU: print a hint about where to enable one,
# then abort the cell.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  no_gpu_hint = (
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  print(no_gpu_hint)
  raise SystemError('GPU device not found')

def cpu():
  """Convolve a random image batch on the CPU and return the summed output.

  A fresh Conv2D layer (32 filters, kernel size 7) is built on every call
  and applied to a random-normal tensor of shape (100, 100, 100, 3), all
  placed on '/cpu:0'.
  """
  with tf.device('/cpu:0'):
    image_batch = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    conv_output = conv_layer(image_batch)
    return tf.math.reduce_sum(conv_output)

def gpu():
  """Convolve a random image batch on the GPU and return the summed output.

  A fresh Conv2D layer (32 filters, kernel size 7) is built on every call
  and applied to a random-normal tensor of shape (100, 100, 100, 3), all
  placed on '/device:GPU:0'.
  """
  with tf.device('/device:GPU:0'):
    image_batch = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    conv_output = conv_layer(image_batch)
    return tf.math.reduce_sum(conv_output)

# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
# The warm-up calls happen before any timing so that one-off start-up cost
# is not included in the measurements below.
cpu()
gpu()

# Run the op several times.
# timeit is given number=10, so each reported figure is the total wall time
# of ten calls (not a per-call average), as the header line states.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
# The setup string imports the function into timeit's own namespace, where
# the statement string 'cpu()' is evaluated.
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
# int() truncates the ratio, so the speedup is reported as a whole number.
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images (batch x height x width x channel). Sum of ten runs.
CPU (s):
3.862475891000031
GPU (s):
0.10837535100017703
GPU speedup over CPU: 35x


In [1]:
pip install tensorflow transformers torch



In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)

In [3]:
import os
import pdfplumber
import re
from typing import List, Dict

# Step 1: Text Extraction from PDFs

def extract_text_from_pdfs(pdf_folder: str) -> List[Dict]:
    """
    Extract text from all PDF files in the given folder.

    Files are visited in sorted name order so the result is reproducible,
    and the ".pdf" extension is matched case-insensitively. Pages for which
    no text is extracted are skipped. A file that fails to process is
    reported on stdout and does not stop the remaining files; pages already
    collected from that file are kept.

    Args:
        pdf_folder (str): Path to the folder containing PDF files.

    Returns:
        List[Dict]: A list of dictionaries, each containing the file name
            ("file_name"), 1-based page number ("page_number"), and
            extracted text ("text").
    """
    extracted_text = []
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive match so files named e.g. "SCAN.PDF" are included.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(pdf_folder, filename)
        try:
            with pdfplumber.open(file_path) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text()
                    if text:
                        extracted_text.append({
                            "file_name": filename,
                            "page_number": page_number,
                            "text": text
                        })
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the batch.
            print(f"Error processing file {filename}: {e}")
    return extracted_text

In [14]:
# Step 2: Chunking Text with Metadata
import re

def chunk_text_with_metadata(extracted_text: List[Dict], chunk_size: int = 2000) -> List[Dict]:
    """
    Chunk the extracted text into larger, meaningful sections with metadata.

    Each entry's text is split into lines; lines matching the boilerplate
    keywords (case-insensitive, whole word) are dropped, and the remaining
    lines are joined with single spaces into chunks. Chunks never span two
    entries. Empty or whitespace-only chunks are never emitted.

    Args:
        extracted_text (List[Dict]): A list of dictionaries containing extracted text with metadata
            (keys "file_name", "page_number", "text").
        chunk_size (int): The maximum character length for each chunk. A single line
            longer than this is kept intact as its own oversized chunk rather than
            being split mid-line.

    Returns:
        List[Dict]: A list of dictionaries, each containing the file name, page number, and chunked text.
    """
    # Compiled once up front instead of being re-evaluated for every line.
    boilerplate = re.compile(
        r'\b(APPEARANCES|COUNSEL PRESENT|INDEX|EXHIBIT|COURT|TRANSCRIPT)\b',
        re.IGNORECASE,
    )
    chunks = []

    def flush(entry: Dict, buffer: str) -> None:
        """Append the buffered text as a chunk unless it is blank."""
        chunk_text = buffer.strip()
        if chunk_text:
            chunks.append({
                "file_name": entry["file_name"],
                "page_number": entry["page_number"],
                "chunk_text": chunk_text
            })

    for entry in extracted_text:
        current_chunk = ""
        for line in entry["text"].split("\n"):
            # Skip lines that are boilerplate or procedural
            if boilerplate.search(line):
                continue

            if len(current_chunk) + len(line) <= chunk_size:
                current_chunk += line + " "
            else:
                # The buffer may still be empty here (first line already too
                # long); flush() ignores it instead of emitting an empty chunk.
                flush(entry, current_chunk)
                current_chunk = line + " "

        # Add the last chunk if it's not empty
        flush(entry, current_chunk)

    return chunks



In [15]:
# Step 3 : Cleaning
def clean_text(chunks: List[Dict], min_length: int = 50) -> List[Dict]:
    """
    Clean the text by removing non-ASCII characters, extra whitespace, and short chunks.

    Each run of non-ASCII characters is replaced by a space, runs of
    whitespace are collapsed to a single space, and the result is stripped.
    The input dictionaries are not modified; each kept chunk is returned as a
    new dictionary with the same keys and the cleaned "chunk_text".

    Args:
        chunks (List[Dict]): A list of dictionaries containing chunked text with metadata.
        min_length (int): Chunks whose cleaned text is not strictly longer than
            this many characters are dropped. Defaults to 50.

    Returns:
        List[Dict]: A list of dictionaries with cleaned text.
    """
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    whitespace = re.compile(r'\s+')

    cleaned_chunks = []
    for chunk in chunks:
        cleaned_text = non_ascii.sub(' ', chunk["chunk_text"])  # Remove non-ASCII characters
        cleaned_text = whitespace.sub(' ', cleaned_text).strip()  # Remove extra spaces
        if len(cleaned_text) > min_length:
            # Copy rather than assign into the caller's dict so the input
            # list can still be inspected or reprocessed afterwards.
            cleaned_chunks.append({**chunk, "chunk_text": cleaned_text})
    return cleaned_chunks


In [16]:
# Main Function to Execute All Steps
def preprocess_pdfs(pdf_folder: str) -> List[Dict]:
    """
    Run the full PDF pipeline: extract page text, chunk it, then clean it.

    Args:
        pdf_folder (str): Path to the folder containing PDF files.

    Returns:
        List[Dict]: The cleaned chunks, each with its file name and page number.
    """
    pages = extract_text_from_pdfs(pdf_folder)
    return clean_text(chunk_text_with_metadata(pages))

# Example Usage - text chunking
if __name__ == "__main__":
    # Process every PDF under the Colab working directory and preview the
    # first five chunks, each followed by a "---" separator.
    pdf_folder = "/content"
    processed_chunks = preprocess_pdfs(pdf_folder)
    for chunk in processed_chunks[:5]:
        print(chunk, "\n---\n", sep="\n")

{'file_name': '2006-08-29 Basile v. Honda_Depo Transcript of Paul LeCour.pdf', 'page_number': 1, 'chunk_text': '0001 PENNSYLVANIA 2 - - - 3 KATHERINE M. BASILE, the Executrix)CIVIL DIVISION of the Estate of FRED DALBO, SR., )ASBESTOS 4 Deceased, and VIOLA IMOGENE COEN ) DALBO, his wife, in her own right,)NO. 5 )11484 CD 2005 Plaintiffs, ) 6 ) vs. ) 7 ) AMERICAN HONDA MOTOR COMPANY, ) 8 INC., et al., ) ) 9 Defendants. ) 10 - - - 11 Video Deposition of PAUL LeCOUR 12 Tuesday, August 29, 2006 13 - - - 14 The video deposition of PAUL LeCOUR, called as a witness on behalf of the Plaintiff, pursuant 15 to notice and the Pennsylvania Rules of Civil Procedure pertaining to the taking of 16 depositions, taken before me, the undersigned, Terri J. Urbash, a Notary Public in and for the 17 Commonwealth of Pennsylvania, held at RoData, 1207 Muriel Street, Pittsburgh, Pennsylvania 18 15203, commencing at 9:32 a.m., the day and date above set forth. 19 - - - 20 21 - - - 22 NETWORK DEPOSITION SERVICES