# Step 1: Setting Up the Environment

Connect to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install Necessary Libraries

In [7]:
!pip install PyPDF2
!pip install pytesseract
!pip install pdf2image
!sudo apt-get install poppler-utils
!sudo apt-get install tesseract-ocr

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils am

Import Necessary Libraries

In [8]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os
import pickle
import itertools
from sklearn.metrics import accuracy_score
import joblib
from PyPDF2 import PdfFileReader
import pytesseract
from PyPDF2 import PdfReader
from pdf2image import convert_from_path

# Step 2: Data Preparation

Import the extracted data of deeds and other documents from PKL files stored in Google Drive.

In [17]:
# Define the directory paths where the pickle files are stored
deed_documents_dir = '/content/drive/MyDrive/Zitles/Task 01/Deed_Documents_Train'
other_documents_dir = '/content/drive/MyDrive/Zitles/Task 01/Other_Documents_Train'

# Initialize dictionaries or lists to store the loaded content
deed_document_contents = {}
other_document_contents = {}

# Load deed document dictionaries
for i in range(1, 6):
    file_name = f'deed_documents{i}.pkl'
    file_path = os.path.join(deed_documents_dir, file_name)
    with open(file_path, 'rb') as f:
        deed_document_contents[f'deed_document_content_{i}'] = pickle.load(f)

# Load other document dictionaries
for i in range(1, 6):
    file_name = f'other_documents{i}.pkl'
    file_path = os.path.join(other_documents_dir, file_name)
    with open(file_path, 'rb') as f:
        other_document_contents[f'other_document_content_{i}'] = pickle.load(f)

Print to check if the deeds and other documents have been successfully imported.

In [18]:
# Print deed document contents
print("Deed Document Contents:")
for key, value in deed_document_contents.items():
    print(f"Key: {key}")
    print(f"Content:\n{value}\n")

# Print other document contents
print("Other Document Contents:")
for key, value in other_document_contents.items():
    print(f"Key: {key}")
    print(f"Content:\n{value}\n")

Deed Document Contents:
Key: deed_document_content_1
Content:
{'Document_1': '(Printed Text):\nN=XT\n\nABSTRACTING, LLC\n\nDate: September 15, 2022\n\nClient: Nelson Galbreath\nRE: Invoice for Abstracting Services\n\nPlease see the invoice details below for title abstracting services. Please make checks payable to Next\nAbstracting, LLC. Thank you for your business!\n\nClient Matter: 3242935\nProperty Address: 2224 Skyler Drive\nSearch Type: Up from policy\nInvoice Number: 22-1055\n\nInvoice Amount: $125\n\nDate Due: 9/30/2022\n\nVendor Information:\nNext Abstracting, LLC\nAttn: Joe Fabie\n769 Wakendaw Blvd.\nMt. Pleasant, SC 29464\n\nEmail: Joseph.A.Fabie@gmail.com\nCell: (717) 818 -3760\n\nThank you again for your business. Should there be any comments or questions, please do not hesitate\nto reach out to me directly.\n\nBest regards,\n\nfoe Fabte\n\nJoe Fabie\n\x0c\n\n(Handwritten Text):\n=\n\x0c\n\n\n(Printed Text):\nRealTitle of the Carolinas, LLC\n\nTitle Search Order Form - File

Store all the Deed documents together in one dictionary, with each document renamed to facilitate identification.

In [19]:
# Initialize a dictionary to store all small documents together
all_deed_documents = {}

# Iterate over each deed document dictionary
for i in range(1, 6):
    # Get the current deed document dictionary
    current_deed_documents = deed_document_contents[f'deed_document_content_{i}']

    # Iterate over each document in the current deed document dictionary
    for key, value in current_deed_documents.items():
        # Rename the key and store the content in the new dictionary
        new_key = f'D{i}_{key}'  # Rename the key
        all_deed_documents[new_key] = value  # Store the content

In [None]:
print(all_deed_documents.keys())

dict_keys(['D1_Document_1', 'D1_Document_5', 'D1_Document_6', 'D1_Document_8', 'D1_Document_9', 'D1_Document_11', 'D1_Document_12', 'D1_Document_13', 'D1_Document_15', 'D1_Document_25', 'D2_Document_2', 'D2_Document_3', 'D2_Document_4', 'D2_Document_6', 'D2_Document_7', 'D2_Document_8', 'D2_Document_10', 'D2_Document_11', 'D3_Document_1', 'D3_Document_2', 'D3_Document_3', 'D3_Document_4', 'D3_Document_5', 'D3_Document_6', 'D3_Document_7', 'D3_Document_11', 'D3_Document_12', 'D4_Document_1', 'D4_Document_2', 'D4_Document_4', 'D4_Document_5', 'D4_Document_6', 'D5_Document_1', 'D5_Document_2'])


Store all the Other documents together in one dictionary, with each document renamed to facilitate identification.

In [20]:
# Initialize a dictionary to store all small documents together
all_other_documents = {}

# Iterate over each deed document dictionary
for i in range(1, 6):
    # Get the current deed document dictionary
    current_other_documents = other_document_contents[f'other_document_content_{i}']

    # Iterate over each document in the current deed document dictionary
    for key, value in current_other_documents.items():
        # Rename the key and store the content in the new dictionary
        new_key = f'D{i}_{key}'  # Rename the key
        all_other_documents[new_key] = value  # Store the content

In [None]:
print(all_other_documents.keys())

dict_keys(['D1_Document_2', 'D1_Document_3', 'D1_Document_4', 'D1_Document_7', 'D1_Document_10', 'D1_Document_14', 'D1_Document_16', 'D1_Document_17', 'D1_Document_18', 'D1_Document_19', 'D1_Document_20', 'D1_Document_21', 'D1_Document_22', 'D1_Document_23', 'D1_Document_24', 'D1_Document_26', 'D1_Document_27', 'D1_Document_28', 'D1_Document_29', 'D2_Document_1', 'D2_Document_5', 'D2_Document_9', 'D3_Document_8', 'D3_Document_9', 'D3_Document_10', 'D4_Document_3', 'D5_Document_3', 'D5_Document_4'])


Check the count of each Deed document and the count of other documents to ensure that the training dataset is balanced.

In [None]:
print(len(all_deed_documents.keys()))
print(len(all_other_documents.keys()))

34
28



If it's unbalanced, remove additional data from the training set to achieve balance.

In [21]:
all_deed_documents = dict(itertools.islice(all_deed_documents.items(), 28))

In [22]:
print(len(all_deed_documents.keys()))
print(len(all_other_documents.keys()))

28
28


## Combine each deed and other document together.

This code combines the content of deed documents and other documents into one dataset. It creates labels to identify each document type (1 for deed documents, 0 for other documents). Then, it converts the content of all documents into a list for further processing.

In [23]:
# Combine the deed and other document content dictionaries into one dataset
all_documents = {}
all_documents.update(all_deed_documents)
all_documents.update(all_other_documents)

# Create labels for the documents (1 for deed documents, 0 for other documents)
labels = np.concatenate([np.ones(len(all_deed_documents)), np.zeros(len(all_other_documents))])

# Convert the dictionary values (document content) into a list
document_texts = list(all_documents.values())

In [None]:
print(labels)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


# Step 3: Feature Extraction

This code turns the text from documents into numbers so computers can understand them better. It's like translating the words into a language the computer can read and analyze. This technique helps in preparing the text data for further analysis or machine learning tasks.

In [24]:
# Convert text data into numerical vectors using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = tfidf_vectorizer.fit_transform(document_texts)

In [None]:
print(X)

  (0, 749)	0.002387712171672988
  (0, 1143)	0.002017004621567151
  (0, 7990)	0.005455812911666726
  (0, 1971)	0.0016377790049898323
  (0, 5604)	0.0018061462016476018
  (0, 7659)	0.0010373578068406226
  (0, 2479)	0.001717746375161372
  (0, 2574)	0.001760782595091165
  (0, 3598)	0.0016004829831002803
  (0, 8377)	0.0016004829831002803
  (0, 8491)	0.0014659519174872273
  (0, 4868)	0.0013775520910009973
  (0, 8120)	0.0034722678349726754
  (0, 6876)	0.0010952443673813903
  (0, 6618)	0.003056751039224683
  (0, 7974)	0.0012482162267601717
  (0, 6493)	0.003435492750322744
  (0, 5472)	0.0015305239503797786
  (0, 4022)	0.002146340485807977
  (0, 5031)	0.002299312345186758
  (0, 9072)	0.001760782595091165
  (0, 3205)	0.0016768103374067763
  (0, 7918)	0.0018061462016476018
  (0, 8411)	0.0012725374178046494
  (0, 5882)	0.0014354386515417648
  :	:
  (55, 8585)	0.016025430388367082
  (55, 9140)	0.008854546846433409
  (55, 2788)	0.02533903920766164
  (55, 3228)	0.005067807841532328
  (55, 344)	0.027963

# Step 4: Model Selection

This code selects a machine learning algorithm called Multinomial Naive Bayes (NB) classifier for the task. Naive Bayes is a simple but effective algorithm commonly used for text classification tasks. It works well with text data represented as numerical vectors, like the ones obtained from TF-IDF vectorization.

In [25]:
# Choose a machine learning algorithm
model = MultinomialNB()  # Naive Bayes classifier

In [None]:
print(model)

MultinomialNB()


# Step 5: Model Training

This code divides the data into two parts: training set and testing set. The training set is used to teach the model, while the testing set is kept separate to evaluate how well the model performs on new, unseen data. After splitting the data, the model is trained using the training set.

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Step 6: Model Evaluation

This code checks how well the model performs on new data. It predicts labels for the testing data and then compares those predictions to the actual labels. The classification report summarizes the model's accuracy and performance in a few key metrics.

In [None]:
# Evaluate the model on the testing set
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.60      0.75         5
         1.0       0.78      1.00      0.88         7

    accuracy                           0.83        12
   macro avg       0.89      0.80      0.81        12
weighted avg       0.87      0.83      0.82        12



Display only The Accuracy

In [None]:
# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8333333333333334


# Save Trained Model for Future Uses

Save the trained model in Google Drive so it can be easily accessed and used for applications whenever needed.

In [None]:
# Define the path to save the trained model
# model_path = '/content/drive/MyDrive/Zitles/Task 01/trained_model_all.pkl'

# Save the trained model to the specified path
joblib.dump(model, model_path)

print("Model saved successfully to:", model_path)

Model saved successfully to: /content/drive/MyDrive/Zitles/Task 01/trained_model_all.pkl


# Import Pre-Trained Model From Google Drive

In [26]:
# Define the path to the saved model
model_path = '/content/drive/MyDrive/Zitles/Task 01/trained_model_all.pkl'

# Load the saved model
model = joblib.load(model_path)

print("Model loaded successfully from:", model_path)

Model loaded successfully from: /content/drive/MyDrive/Zitles/Task 01/trained_model_all.pkl


# Step 7: Classify the Document

Define OCR and Classification Functions

In [27]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfFileReader(file)
        num_pages = pdf_reader.numPages
        for page_num in range(num_pages):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
    return text

def classify_document(text):
    pred = model.predict([text])
    return "deed" if pred[0] == 1 else "other"

Define the file path for a particular document.

In [1]:
pdf_path = '/content/drive/MyDrive/Zitles/Task 01/others/otherdoc_test1.pdf'

Read the Document

In [31]:
def read_pdf(pdf_path):
    try:
        # Convert each page of the PDF to an image
        images = convert_from_path(pdf_path)

        # Extract text from each image using OCR
        text = ""
        i = 1
        for image in images:
            print('Page', i, 'is Scanning..')
            text += pytesseract.image_to_string(image)
            i += 1

        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None

# Example usage:
pdf_path =  '/content/drive/MyDrive/Zitles/Task 01/split/0616_483.pdf'
pdf_text = read_pdf(pdf_path)
if pdf_text:
    print("PDF content:")
    print(pdf_text)
else:
    print("Failed to read PDF.")

Page 1 is Scanning..
Page 2 is Scanning..
Page 3 is Scanning..
Page 5 is Scanning..
Page 6 is Scanning..
PDF content:
Recorded by and to be

 

 

returned to:

ee es

STATE OF SOUTH CAROLINA ) TITLE TO REAL ESTATE
) (Title Not Examined by

COUNTY OF CHARLESTON ) Harvey & Vallini, LLC)

KNOW ALL MEN BY THESE PRESENTS, that D.R. HORTON, INC.. herein
“Grantor(s)”, in the State aforesaid. for and in consideration of the sum of Five Hundred Fifty-
Nine Thousand and 00/100 Dollars ($559,000.00), to Grantor(s) in hand paid by ROBERT G.
GRILLI and MELISSA GRILLI, herein “Grantee(s)”, in the State aforesaid, the receipt whereof
is hereby acknowledged, has granted, bargained, sold, and released, and by these presents does
grant, bargain, sell, and release, unto the said ROBERT G. GRILLI and MELISSA GRILLI, as
joint tenants with right of survivorship and not as tenants in common, the following
described property, to wit:

ALL that certain piece. parcel or lot of land, together with any improveme

Classify a document as either a deed document or another type using the trained model, and returns a human-readable classification label.

In [32]:
def classify_document(text):
    # Convert the text into a numerical vector using TF-IDF vectorization
    text_vector = tfidf_vectorizer.transform([text])

    # Predict the class using the trained model
    predicted_class = model.predict(text_vector)[0]

    # Map the predicted class label to a human-readable format
    class_label = "Deed Document" if predicted_class == 1 else "Other Document"

    return class_label

# Example usage:
# pdf_text = "..."  # Replace "..." with the text content of the new PDF document
document_type = classify_document(pdf_text)
print(f"The document is classified as: {document_type}")


The document is classified as: Deed Document
