In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ===============================
# SECTION 1: IMPORTS AND SETUP
# ===============================

import pandas as pd
import json
import os
import glob

# ===============================
# SECTION 2: DATA EXTRACTION
# ===============================
# Paths and glob for CSV and JSON

# Paths to the directories
csv_folder = '/Users/fathimazajel/Documents/GitHub/image_invoice_reader/data/box'
json_folder = '/Users/fathimazajel/Documents/GitHub/image_invoice_reader/data/key'

# List all files
csv_files = glob.glob(os.path.join(csv_folder, '*.csv'))
json_files = glob.glob(os.path.join(json_folder, '*.json'))

# Create a dictionary of JSON files
json_file_dict = {os.path.splitext(os.path.basename(jf))[0]: jf for jf in json_files}

# Initialize lists to hold texts and labels
texts = []
labels = []
# Process each CSV file
for csv_file in csv_files:
    csv_filename_without_extension = os.path.splitext(os.path.basename(csv_file))[0]

    json_file = json_file_dict.get(csv_filename_without_extension)
    if json_file:
        # Load data
        df_csv = pd.read_csv(csv_file, header=None, sep="\t")  # Adjust parameters if needed

        with open(json_file, 'r') as file:
            json_data = json.load(file)

        # Extract and label data
        for index, row in df_csv.iterrows():
            if len(row) > 8:  # Make sure the row has at least 9 columns
                text_segment = str(row[8]).strip()  # The ninth column contains the text

                # Simple method to match and label text segments
                if text_segment in json_data["company"]:
                    labels.append("company")
                elif text_segment in json_data["date"]:
                    labels.append("date")
                elif text_segment in json_data["address"]:
                    labels.append("address")
                elif text_segment in json_data["total"]:
                    labels.append("total")
                else:
                    labels.append("other")

                texts.append(text_segment)
            else:
                print(f"Row {index} in file {csv_file} does not have 9 columns.")


In [9]:
pip install pandas openpyxl




In [10]:
# ===============================
# SECTION 3: DATA CONVERSION
# ===============================
# Convert txt files to xls format

import os
import pandas as pd

# Source and destination directories
source_dir = "/Users/fathimazajel/Documents/GitHub/image_invoice_reader/data/box"
dest_dir = "/Users/fathimazajel/Documents/GitHub/image_invoice_reader/data/box2"

# Ensure destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# List all .txt files in source directory
txt_files = [f for f in os.listdir(source_dir) if f.endswith('.txt')]

for txt_file in txt_files:
    # Build the corresponding .xls filename
    xls_file = os.path.splitext(txt_file)[0] + ".xls"

    # Read each line from the txt file, split the data, and collect in a list
    data_list = []
    with open(os.path.join(source_dir, txt_file), 'r', encoding='utf-8') as infile:
        for line in infile:
            # Split by comma and extract columns and text
            columns = line.strip().split(",")[:8]
            text = ",".join(line.strip().split(",")[8:])
            data_list.append(columns + [text])

    # Convert the data list to a DataFrame and write to Excel
    df = pd.DataFrame(data_list, columns=["Column1", "Column2", "Column3", "Column4", "Column5", "Column6", "Column7", "Column8", "Text"])
    df.to_excel(os.path.join(dest_dir, xls_file), index=False, engine='openpyxl')

print(f"Converted {len(txt_files)} .txt files to .xls in {dest_dir}")



FileNotFoundError: ignored

In [None]:
# ===============================
# SECTION 4: DATA VALIDATION
# ===============================
# Validate the data dimensions in xls files

import pandas as pd

# Path to the .xls file
xls_file_path = "/Users/fathimazajel/Documents/GitHub/image_invoice_reader/data/box2/X00016469612.xls"  # replace 'filename.xls' with the name of the file you want to check

# Read the Excel file
df = pd.read_excel(xls_file_path, engine='openpyxl')

# Get the dimensions
num_rows, num_cols = df.shape

print(f"The file '{xls_file_path}' has {num_rows} rows and {num_cols} columns.")


The file '/Users/fathimazajel/Documents/GitHub/image_invoice_reader/data/box2/X00016469612.xls' has 44 rows and 9 columns.


In [1]:
# ===============================
# SECTION 5: DATA LABELLING
# ===============================
# Extract and label data from XLS


import pandas as pd
import json
import os
import glob

xls_folder = '/content/drive/MyDrive/image_invoice_processing_data/SROIE2019/box2'
json_folder = '/content/drive/MyDrive/image_invoice_processing_data/SROIE2019/key'

xls_files = glob.glob(os.path.join(xls_folder, '*.xls'))
json_files = glob.glob(os.path.join(json_folder, '*.json'))

json_file_dict = {os.path.splitext(os.path.basename(jf))[0]: jf for jf in json_files}

texts = []
labels = []

for xls_file in xls_files:
    xls_filename_without_extension = os.path.splitext(os.path.basename(xls_file))[0]

    json_file = json_file_dict.get(xls_filename_without_extension)
    if json_file:
        df_xls = pd.read_excel(xls_file, header=None)

        with open(json_file, 'r') as file:
            json_data = json.load(file)

        # Check if 'address' is missing in the JSON data and print the content
        if 'address' not in json_data:
            print(f"JSON file {json_file} does not contain the 'address' key.")
            print(json_data)
            print("-" * 50)  # Just for better visual separation

        # This assumes the last column in the xls file is the text
        for _, row in df_xls.iterrows():
            segment = str(row.iloc[-1])

            # Basic matching for labels. This is very rudimentary and might need further refinement.
            if 'company' in json_data and segment in json_data['company']:
                label = "company"
            elif 'date' in json_data and segment in json_data['date']:
                label = "date"
            elif 'address' in json_data and segment in json_data['address']:
                label = "address"
            elif 'total' in json_data and segment in json_data['total']:
                label = "total"
            else:
                label = "other"

            texts.append(segment)
            labels.append(label)

    else:
        print(f"No corresponding JSON file found for {xls_file}")

# At this point, texts and labels have your data labeled.


JSON file /content/drive/MyDrive/image_invoice_processing_data/SROIE2019/key/X51005663280.json does not contain the 'address' key.
{'company': 'T.A.S LEISURE SDN BHD', 'date': '30 DEC 17', 'total': '102.40'}
--------------------------------------------------


In [2]:


# 1. Check if the length of texts and labels lists are the same
if len(texts) == len(labels):
    print("All data segments have labels.")
else:
    print("Mismatch between number of data segments and labels.")

# 2. Check for "other" labels
other_count = labels.count("other")
if other_count > 0:
    print(f"There are {other_count} data segments labeled as 'other'.")

# 3. Verify the distribution of the labels
label_distribution = {label: labels.count(label) for label in set(labels)}
print("Label distribution:", label_distribution)


All data segments have labels.
There are 28421 data segments labeled as 'other'.
Label distribution: {'company': 731, 'total': 1578, 'other': 28421, 'date': 1678, 'address': 1844}


In [4]:
!pip install transformers datasets
!pip install transformers -U
!pip install torch
!pip install accelerate -U




In [5]:
!pip show transformers
!pip show torch
!pip show accelerate


Name: transformers
Version: 4.33.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Name: torch
Version: 2.0.1+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, jinja2, networkx, sympy, triton, typing-extensions
Required-by: accelerate, fastai, torchaudio, torchdata, torchtext, torchvision, triton
Name: accel

In [6]:
# ===============================
# SECTION 6: MODEL PREPARATION AND TRAINING
# ===============================
# Tokenization and dataset preparation



from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import tempfile

# Tokenize the texts
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(texts, truncation=True, padding=True)

# Convert labels from strings to integers
label_dict = {"company": 0, "date": 1, "address": 2, "total": 3, "other": 4}
labels = [label_dict[l] for l in labels]



In [7]:
from typing import List, Tuple
import numpy as np

def train_val_test_split(
    data: List,
    labels: List,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
) -> Tuple[List, List, List, List, List, List]:

    # Ensure data and labels have the same length
    assert len(data) == len(labels)

    # Calculate the indices for splitting
    train_end = int(train_ratio * len(data))
    val_end = train_end + int(val_ratio * len(data))

    # Shuffle data and labels together
    combined = list(zip(data, labels))
    np.random.shuffle(combined)
    data, labels = zip(*combined)

    # Split the data and labels based on indices
    train_data, train_labels = data[:train_end], labels[:train_end]
    val_data, val_labels = data[train_end:val_end], labels[train_end:val_end]
    test_data, test_labels = data[val_end:], labels[val_end:]

    return train_data, val_data, test_data, train_labels, val_labels, test_labels


In [8]:
# Split the data
train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = train_val_test_split(texts, labels, train_ratio=0.8, val_ratio=0.1)

# Create temporary files
train_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
val_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
test_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")

# Save your splits to these files
train_df = pd.DataFrame({'text': train_texts, 'labels': train_labels})
train_df.to_csv(train_temp_file.name, index=False)

val_df = pd.DataFrame({'text': val_texts, 'labels': val_labels})
val_df.to_csv(val_temp_file.name, index=False)

test_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})
test_df.to_csv(test_temp_file.name, index=False)



In [9]:
# Now, load them using `load_dataset`
train_dataset = load_dataset('csv', data_files={'train': train_temp_file.name})['train']
val_dataset = load_dataset('csv', data_files={'validation': val_temp_file.name})['validation']
test_dataset = load_dataset('csv', data_files={'test': test_temp_file.name})['test']

# Optionally, after you're done, you can close and remove the temporary files
train_temp_file.close()
val_temp_file.close()
test_temp_file.close()

# Initialize model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_dict))



Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Define training arguments and train
training_args = TrainingArguments(
    output_dir='./results',  # <-- Add this line to specify the output directory
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
)



In [12]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)


In [13]:
import torch

class InvoiceDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = InvoiceDataset(train_encodings, train_labels)
val_dataset = InvoiceDataset(val_encodings, val_labels)
test_dataset = InvoiceDataset(test_encodings, test_labels)


In [15]:
# 3. Initialize the Trainer and Train:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Assuming train_labels is a list of all your labels
num_labels = len(set(train_labels))

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#initial model evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()


In [26]:
#finetuning the model
#added load_best_model_at_end=True,metric_for_best_model="loss",weight_decay=0.01 to the arguments and early stop /callback to trainer()
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()


Step,Training Loss,Validation Loss
10,0.3043,0.401247
20,0.2024,0.399772
30,0.2878,0.371344
40,0.1863,0.40417


KeyboardInterrupt: ignored

In [25]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=5,
    num_train_epochs=4,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    weight_decay=0.0001,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()

Step,Training Loss,Validation Loss
10,0.5694,0.468573
20,0.3551,0.438028
30,0.1548,0.450771


TrainOutput(global_step=30, training_loss=0.35976630051930747, metrics={'train_runtime': 629.7453, 'train_samples_per_second': 174.045, 'train_steps_per_second': 34.814, 'total_flos': 970272112500.0, 'train_loss': 0.35976630051930747, 'epoch': 0.01})

In [27]:
# 4. Save the Model and Tokenizer:
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")


('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')