<a href="https://colab.research.google.com/github/cmbhatt1/FinancialDocsClassifier/blob/main/FinancialDocumentsClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio
!pip install pandas scikit-learn datasets transformers
!pip install transformers[torch]

In [None]:
from google.colab import drive
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
import chardet

# Mount Google Drive at /content/drive; the dataset root used below
# (/content/drive/MyDrive/data) lives under this mount point.
drive.mount('/content/drive')

def detect_encoding(file_path, default='utf-8'):
    """Guess the character encoding of a file using chardet.

    Args:
        file_path: Path of the file to inspect. The whole file is read in
            binary mode and handed to ``chardet.detect``.
        default: Encoding name returned when chardet cannot make a guess
            (it reports ``None``, e.g. for an empty file).

    Returns:
        The encoding name reported by chardet, or ``default`` when chardet
        has no answer.
    """
    with open(file_path, 'rb') as f:
        raw_bytes = f.read()
    # chardet returns {'encoding': None, ...} when detection fails; fall back
    # so callers always receive a usable codec name instead of None.
    return chardet.detect(raw_bytes)['encoding'] or default

def clean_text(text):
    """Strip list markers, digits and basic punctuation from a text fragment.

    Args:
        text: One text fragment (a single string).

    Returns:
        The cleaned, whitespace-stripped fragment, or ``''`` when the
        fragment contains the ``saved from url=`` marker.
    """
    # Remove enumeration markers such as "a)", "b)" and "(i)", "(ii)", "(1)".
    # The look-behind keeps the bare-letter form from matching the last
    # letter of an ordinary word, e.g. "(net loss)" must not become "(net los".
    text = re.sub(r'(?<![A-Za-z])[a-z]\)|\(\w+\)', '', text)
    # Remove digits, hyphens, dots and commas in a single pass.
    text = re.sub(r'[\d\-.,]', '', text).strip()
    # Fragments carrying the "saved from url=" marker are discarded entirely.
    if 'saved from url=' in text:
        return ''
    return text

def extract_text_from_html(html_file):
    """Parse an HTML file and return its cleaned, non-empty text fragments.

    Args:
        html_file: Path of the HTML file to read.

    Returns:
        A list of strings: one entry per string node returned by
        ``soup.find_all(string=True)`` whose cleaned form is non-empty,
        in document order.
    """
    # Decode with the detected encoding; undecodable bytes are dropped
    # (errors='ignore') rather than aborting the whole file.
    encoding = detect_encoding(html_file)
    with open(html_file, 'r', encoding=encoding, errors='ignore') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Strip and clean each node exactly once, keeping only non-empty results.
    fragments = []
    for node in soup.find_all(string=True):
        cleaned = clean_text(node.strip())
        if cleaned:
            fragments.append(cleaned)
    return fragments

# Dataset root on the mounted drive; each subfolder name doubles as the
# class label of the files it contains.
root_dir = '/content/drive/MyDrive/data'
subfolders = ['Balance Sheets', 'Cash Flow', 'Income Statement', 'Notes', 'Others']

data = []
file_id = 1

for label in subfolders:
    folder_path = os.path.join(root_dir, label)

    # os.listdir() returns entries in arbitrary order; sorting keeps the ids
    # and the later seeded train/test split reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Accept .html/.htm regardless of letter case.
        if not filename.lower().endswith(('.html', '.htm')):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            # Extract the cleaned text fragments from the HTML file
            text = extract_text_from_html(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error processing file {file_path}: {e}")
        else:
            data.append({'id': file_id, 'text': text, 'label': label})
            file_id += 1

# Create DataFrame; explicit columns keep the schema intact even when no
# file could be loaded.
df = pd.DataFrame(data, columns=['id', 'text', 'label'])

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Each document arrives as a list of fragments; collapse it into one
# space-separated string (values that are not lists pass through untouched).
df['text'] = df['text'].map(
    lambda fragments: ' '.join(fragments) if isinstance(fragments, list) else fragments
)

# Encode the label names as integer codes (0, 1, 2, ...) in order of first appearance
label_map = {name: code for code, name in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_map)

# Show how many documents fall into each class
print(df['label'].value_counts())

# Hold out 20% of the documents for testing, keeping class proportions equal
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Wrap both splits as Hugging Face Datasets with a fresh 0..n-1 index
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Load the tokenizer and model
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
# Size the classification head from the labels actually present in the data
# instead of hard-coding the class count.
num_labels = len(label_map)

# Create a configuration with the appropriate number of labels
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The checkpoint was loaded with a different label count; ignore_mismatched_sizes
# lets the classifier head be re-created with num_labels outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)

def preprocess_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    encoded = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128,  # Limit max_length for efficiency
    )
    return encoded

# Tokenize each split in batches, then drop the raw 'text' column,
# which is no longer needed once the token ids exist
train_dataset = train_dataset.map(preprocess_function, batched=True).remove_columns(['text'])
test_dataset = test_dataset.map(preprocess_function, batched=True).remove_columns(['text'])

# Have both datasets hand out PyTorch tensors
train_dataset.set_format('torch')
test_dataset.set_format('torch')

# Collator that pads the examples of a batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        p: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (true labels).

    Returns:
        A dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis
    y_pred = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

import inspect

# Newer transformers releases renamed several keyword arguments
# (evaluation_strategy -> eval_strategy, no_cuda -> use_cpu,
# Trainer(tokenizer=...) -> Trainer(processing_class=...)). The install above is
# unpinned, so pick whichever spelling the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
_cpu_only_key = 'use_cpu' if 'use_cpu' in _training_arg_names else 'no_cuda'
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

# Define the training arguments for CPU usage with reduced batch size
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=20,  # Reduced batch size for CPU
    per_device_eval_batch_size=20,   # Reduced batch size for CPU
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    **{
        _eval_strategy_key: 'epoch',  # Evaluate at the end of every epoch
        _cpu_only_key: True,          # Use only CPU
    },
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Train the model
trainer.train()

# Evaluate the model; print the metrics, because a bare expression in the
# middle of a notebook cell is not displayed.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Persist the fine-tuned weights and the tokenizer side by side so the
# inference cell can reload both from one directory.
trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

In [None]:
import gradio as gr
import torch


# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_model'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Wrap the fine-tuned model in a Trainer; prediction only goes through
# trainer.model. The tokenizer is deliberately not passed: it is applied
# directly in predict_label, and the `tokenizer=` keyword is deprecated in
# newer transformers releases.
trainer = Trainer(model=model)

# Map predicted class indices back to label names.
# NOTE(review): this must match the index order produced at training time
# (labels numbered by first appearance in the data) - update it if the
# dataset's folders or their order change.
label_map = {0: 'Balance Sheets', 1: 'Cash Flow', 2: 'Income Statement', 3: 'Notes', 4: 'Others'}

def predict_label(html_file):
    """Predict the financial-document category of an uploaded HTML file.

    Args:
        html_file: Path of the HTML file to classify.

    Returns:
        The label name from ``label_map`` for the highest-scoring class.
    """
    # extract_text_from_html returns a list of fragments. Join them into one
    # string exactly as done for training; passing the list itself would make
    # the tokenizer treat every fragment as a separate batch item, and the
    # single-value .item() below would then fail.
    fragments = extract_text_from_html(html_file)
    text = ' '.join(fragments)

    # Preprocess the text with the same settings used during training
    inputs = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=128)

    # Keep the inputs on the same device as the model (the Trainer may have
    # moved the model to an accelerator).
    device = trainer.model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Make a prediction without tracking gradients
    with torch.no_grad():
        logits = trainer.model(**inputs).logits
    predicted_index = torch.argmax(logits, dim=-1).item()

    return label_map[predicted_index]

# Gradio UI: a file upload (delivered to predict_label as a file path) in,
# the predicted category as text out
file_input = gr.File(label="Upload HTML File", type="filepath")
label_output = gr.Textbox(label="Predicted Label")

iface = gr.Interface(
    fn=predict_label,
    inputs=file_input,
    outputs=label_output,
    title="Financial Document Classifier",
    description="Upload an HTML file to predict its financial document category.",
)

# Start serving the interface
iface.launch()
