In [4]:
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load spaCy model for preprocessing
nlp = spacy.load("en_core_web_sm")

# Load Hugging Face model and tokenizer (using DistilBERT for simplicity)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

In [5]:
# Function to preprocess text using spaCy
def preprocess_text_spacy(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return " ".join(tokens), entities

# Function to generate BERT embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of the last hidden state as the embedding
    embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
    return embeddings

In [6]:
# Prepare training data (mock data for illustration)
documents = [
    "Account Number: 123456789, Transaction Date: 2024-03-15, Description: Online transfer, Amount: $500.00, Balance: $1,500.00",  # Bank statement
    "Invoice Number: INV-4567, Date: 2024-02-20, Billed To: Jane Doe, Total Amount: $2,000.00, Due Date: 2024-03-15",  # Invoice
    "Name: John Smith, License Number: S123456789, Date of Birth: 1990-05-10, Expiry Date: 2026-05-10, Issuing State: California"  # Driver's license
]
labels = ["bank statement", "invoice", "driver's license"]  # Corresponding labels

# Extract features for training
X = []
y = labels
for doc in documents:
    cleaned_text, _ = preprocess_text_spacy(doc)
    embeddings = get_embeddings(cleaned_text)
    X.append(embeddings)

# Convert to numpy array
X = np.array(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Function to classify new documents
def classify_document(text):
    cleaned_text, entities = preprocess_text_spacy(text)
    embeddings = get_embeddings(cleaned_text)
    classification = clf.predict([embeddings])[0]

    return {
        "type": classification,
        "confidence": "high" if classification != "unknown" else "low",
        "entities": entities
    }

# Classify a new document
new_text = """
Account Number: 987654321
Transaction Date: 2024-03-25
Description: Salary deposit
Amount: $3,000.00
Balance: $8,500.00
"""

result = classify_document(new_text)
print(result)


                precision    recall  f1-score   support

bank statement       0.00      0.00      0.00       1.0
       invoice       0.00      0.00      0.00       0.0

      accuracy                           0.00       1.0
     macro avg       0.00      0.00      0.00       1.0
  weighted avg       0.00      0.00      0.00       1.0

{'type': 'invoice', 'confidence': 'high', 'entities': [('987654321', 'CARDINAL'), ('2024-03-25', 'DATE'), ('3,000.00', 'MONEY'), ('8,500.00', 'MONEY')]}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# TODO load files, make embeddings, train a classifier, test it out