In [None]:
import os
import requests
from PIL import Image
from io import BytesIO
import pytesseract
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
import multiprocessing
# Report how many logical CPUs multiprocessing sees in this runtime
core_count = multiprocessing.cpu_count()
print(f"Number of available CPU cores (threads): {core_count}")

Number of available CPU cores (threads): 2


In [None]:
import os
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Load the training CSV. download_image() below reads the 'image_link' and
# 'group_id' columns of every row, so both must be present in this file.
file_path = '/content/train.csv'  # Replace with your actual file path
data = pd.read_csv(file_path)

# Create the directory that receives the downloaded images
# (exist_ok=True makes re-running this cell harmless).
image_dir = 'downloaded_images'
os.makedirs(image_dir, exist_ok=True)

# Function to download an image from a URL
def download_image(row):
    """Download the image referenced by one dataset row into ``image_dir``.

    Args:
        row: A pandas Series for one CSV row. It must provide ``image_link``
            and ``group_id``; ``row.name`` (the row's index label) is appended
            to the file name to keep it unique.

    Returns:
        tuple: ``(image_url, success)``. ``success`` is True only when the
        server answered with HTTP 200 and the body was written to disk;
        any other status code or any exception yields False.
    """
    image_url = row['image_link']
    image_name = f"{row['group_id']}_{row.name}.jpg"  # Unique image name using group_id and index
    save_path = os.path.join(image_dir, image_name)

    try:
        # The timeout stops one stalled server from blocking a worker thread
        # forever; the context manager releases the streamed connection.
        with requests.get(image_url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                # Previously this fell through and was reported as a success.
                return image_url, False
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        return image_url, True  # Return URL and success status
    except Exception:
        # Deliberately best-effort: the caller only needs to know the URL failed.
        return image_url, False  # Return URL and failure status

# Size of the download thread pool
num_threads = 80

# Submit one download task per CSV row and report failures as tasks finish
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [
        executor.submit(download_image, record)
        for _, record in data.iterrows()
    ]

    # The tqdm bar advances once per completed download
    for future in tqdm(as_completed(futures), total=len(futures)):
        url, success = future.result()
        if success:
            continue
        print(f"Failed to download {url}")

print("Image download complete.")

In [None]:
# Load the dataset used by the OCR pipeline. The column check further down
# requires 'image_link', 'entity_name' and 'entity_value' to be present.
# NOTE: this rebinds `data`, replacing the frame loaded from train.csv earlier.
csv_file = "/content/maximum_weight_recommendation_data.csv"  # Replace with the path to your CSV file
data = pd.read_csv(csv_file)

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO
import pytesseract
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from concurrent.futures import ThreadPoolExecutor, as_completed
import joblib
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm

In [None]:
import os
import re
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pytesseract

# Fail fast if the CSV lacks any column that process_image() reads
required_columns = ['image_link', 'entity_name', 'entity_value']
if any(col not in data.columns for col in required_columns):
    raise ValueError(f"CSV must contain columns: {', '.join(required_columns)}")

# Output folder for the downloaded images
image_dir = "images"
os.makedirs(image_dir, exist_ok=True)

# Retry policy shared by every request: up to 3 retries on the listed
# status codes, for the listed HTTP methods only
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    status_forcelist=[429, 500, 502, 503, 504],
)

# One adapter with an enlarged connection pool, reused for both URL schemes
adapter = HTTPAdapter(
    pool_connections=100,
    pool_maxsize=100,
    max_retries=retry_strategy,
)

# Session that routes http:// and https:// traffic through that adapter
http = requests.Session()
for scheme in ("https://", "http://"):
    http.mount(scheme, adapter)

# Compiled once: a number (optional decimal part) followed by a run of letters
pattern = re.compile(r"(\d+\.?\d*)\s*([a-zA-Z]+)")

def process_image(row):
    """Download one image, save it as JPEG, OCR it and extract number/unit pairs.

    Args:
        row: A pandas Series providing ``image_link``, ``entity_name`` and
            ``entity_value``; ``row.name`` is used in the saved file name.

    Returns:
        dict: On success, a record with the keys ``image_path``,
        ``entity_name``, ``entity_value``, ``extracted_text`` (raw OCR output)
        and ``parsed_text`` (all ``<number> <letters>`` matches joined by
        spaces, or ``""`` when there are none). On any failure, a dict with a
        single ``error`` key describing what went wrong; no exception escapes.
    """
    try:
        image_url = row['image_link']
        entity_name = row['entity_name']
        entity_value = row['entity_value']

        image_name = f"image_{row.name}.jpg"
        save_path = os.path.join(image_dir, image_name)

        # Download the image
        response = http.get(image_url, timeout=10)
        if response.status_code != 200:
            # Log failed download attempts
            return {'error': f"Failed to download {image_url}. Status code: {response.status_code}"}

        img = Image.open(BytesIO(response.content))

        # JPEG cannot store alpha or palette data, so saving e.g. an RGBA or P
        # mode image under a .jpg name raises inside Pillow. Convert only the
        # copy written to disk; OCR still runs on the image as downloaded.
        jpeg_modes = ("1", "L", "RGB", "CMYK", "YCbCr")
        to_save = img if img.mode in jpeg_modes else img.convert("RGB")
        to_save.save(save_path)

        # Perform OCR on the image
        text = pytesseract.image_to_string(img)

        # Parse text to get the numerical value and unit
        matches = pattern.findall(text)
        parsed_text = " ".join(" ".join(match) for match in matches)

        return {
            'image_path': save_path,
            'entity_name': entity_name,
            'entity_value': entity_value,
            'extracted_text': text,
            'parsed_text': parsed_text
        }

    except Exception as e:
        # Report the failure as a record instead of raising, so one bad row
        # does not abort the whole thread pool run.
        return {'error': f"Error processing row {row.name}: {str(e)}"}

# Use ThreadPoolExecutor for parallel processing.
# os.cpu_count() may return None, so fall back to a single core in that case.
ocr_results = []
with ThreadPoolExecutor(max_workers=min(20, (os.cpu_count() or 1) * 2)) as executor:
    futures = {executor.submit(process_image, row): row.name for _, row in data.iterrows()}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Images"):
        result = future.result()
        if result:
            ocr_results.append(result)

# Convert to DataFrame
ocr_df = pd.DataFrame(ocr_results)

# Filter out errors from the results if any. The 'error' column only exists
# when at least one row failed, so indexing it unconditionally raised KeyError
# on a fully successful run.
if 'error' in ocr_df.columns:
    ocr_df_filtered = ocr_df[ocr_df['error'].isna()]
else:
    ocr_df_filtered = ocr_df

# Save OCR results to CSV for later use
ocr_df_filtered.to_csv('ocr_results.csv', index=False)

print(f"Processed {len(ocr_df_filtered)} images out of {len(data)} total images successfully.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
import spacy

In [None]:
# Preview the first rows of the OCR results frame
ocr_df.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# `_df_0` is never defined anywhere in this notebook, so the cell raised
# NameError on a fresh kernel. It presumably referred to the frame displayed
# by the preceding `ocr_df.head()` cell (TODO confirm), so plot that instead.
ocr_df.head().groupby('image_path').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: read the OCR results saved by the extraction cell
ocr_df = pd.read_csv('ocr_results.csv')

# Step 2: keep only rows that have both parsed text and a target value
ocr_df = ocr_df.dropna(subset=['parsed_text', 'entity_value'])

# Give every distinct entity_value an integer class id
entity_value_categories = ocr_df['entity_value'].astype('category')
ocr_df['entity_value_encoded'] = entity_value_categories.cat.codes

# Model inputs (texts) and targets (class ids) as plain Python lists
texts = ocr_df['parsed_text'].to_list()
labels = ocr_df['entity_value_encoded'].to_list()

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

# Define a custom Dataset class
class EntityDataset(Dataset):
    """Torch dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, parallel to ``texts``.
        tokenizer: Object exposing ``encode_plus`` (called with the keyword
            arguments shown in ``__getitem__``).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for sample ``idx``."""
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # flatten() turns each returned tensor into a 1-D tensor per sample
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Step 3: tokenizer that matches the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Batching parameters
max_len = 128  # every sequence is padded/truncated to this many tokens
batch_size = 16

# Wrap both splits in datasets, then in loaders (only training data is shuffled)
train_dataset = EntityDataset(X_train, y_train, tokenizer, max_len)
test_dataset = EntityDataset(X_test, y_test, tokenizer, max_len)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Step 4: BERT classifier with one output per distinct encoded label
num_classes = ocr_df['entity_value_encoded'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop: run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

for epoch in range(5):  # Adjust the number of epochs as needed
    model.train()
    epoch_loss, num_batches = 0.0, 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass (passing labels makes the model return the loss)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean over all batches rather than only the last batch's loss,
    # which is noisy; max(..., 1) also avoids a crash on an empty loader.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f'Epoch {epoch + 1} completed. Mean loss: {mean_loss:.4f}')

# Step 5: score the trained model on the held-out split
model.eval()
predictions, true_labels = [], []

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row
        predicted_classes = outputs.logits.argmax(dim=1)
        predictions.extend(predicted_classes.cpu().numpy())
        true_labels.extend(batch['labels'].numpy())

# Overall accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

# Per-class precision / recall / F1
print(classification_report(true_labels, predictions))

In [None]:
import pandas as pd
import re

def clean_text(text):
    """Normalise OCR text: collapse whitespace and drop control characters.

    Args:
        text: Value to clean. Missing values (None/NaN) are returned unchanged;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string with whitespace runs collapsed to single spaces,
        control characters removed and both ends stripped, or the original
        missing value.
    """
    if pd.isna(text):
        return text
    text = str(text)  # e.g. a purely numeric cell read back from CSV
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with a single space
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)  # Remove non-printable characters
    # Deleting a control character that sat between two spaces leaves a
    # double space behind, so collapse once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Load the OCR results CSV file. The previous value was an unfilled
# placeholder path that does not exist; 'ocr_results.csv' is the file the OCR
# extraction cell writes and the BERT cell reads.
ocr_results_path = 'ocr_results.csv'
ocr_results_df = pd.read_csv(ocr_results_path)

# Clean the extracted_text and parsed_text columns
ocr_results_df['clean_extracted_text'] = ocr_results_df['extracted_text'].apply(clean_text)
ocr_results_df['clean_parsed_text'] = ocr_results_df['parsed_text'].apply(clean_text)

# Display the cleaned data
print(ocr_results_df[['image_path', 'entity_name', 'entity_value', 'clean_extracted_text', 'clean_parsed_text']].head())


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Re-run inference on the held-out split, this time adding a confusion matrix
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        predicted_classes = outputs.logits.argmax(dim=1)
        predictions.extend(predicted_classes.cpu().numpy())
        true_labels.extend(batch['labels'].numpy())

# Overall accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

# Per-class precision / recall / F1
print(classification_report(true_labels, predictions))

# Heatmap of true (rows) versus predicted (columns) class counts
conf_matrix = confusion_matrix(true_labels, predictions)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

In [None]:
import pandas as pd
import re

def clean_text(text):
    """Normalise OCR text: collapse whitespace and drop control characters.

    Args:
        text: Value to clean. Missing values (None/NaN) are returned unchanged;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string with whitespace runs collapsed to single spaces,
        control characters removed and both ends stripped, or the original
        missing value.
    """
    if pd.isna(text):
        return text
    text = str(text)  # e.g. a purely numeric cell read back from CSV
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with a single space
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)  # Remove non-printable characters
    # Deleting a control character that sat between two spaces leaves a
    # double space behind, so collapse once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Read the OCR results back from disk
ocr_results_path = '/content/ocr_results.csv'
ocr_results_df = pd.read_csv(ocr_results_path)

# Add a cleaned copy of each raw text column, named clean_<column>
for raw_column in ('extracted_text', 'parsed_text'):
    ocr_results_df[f'clean_{raw_column}'] = ocr_results_df[raw_column].apply(clean_text)

# Show the first rows of the identifying and cleaned columns
preview_columns = [
    'image_path',
    'entity_name',
    'entity_value',
    'clean_extracted_text',
    'clean_parsed_text',
]
print(ocr_results_df[preview_columns].head())


In [None]:
import spacy
from spacy.tokens import DocBin

# Initialize a blank English spaCy pipeline. It is used below to tokenize the
# training texts (nlp.make_doc) and later receives the "ner" component.
nlp = spacy.blank("en")

# Function to create training data in the format required by spaCy
def create_training_data(df):
    """Build spaCy-style ``(text, {"entities": [...]})`` pairs from OCR rows.

    For each row, the first occurrence of ``entity_value`` inside
    ``clean_extracted_text`` is annotated with the row's ``entity_name``.
    Rows whose value does not occur in the text are kept with no entities.

    Args:
        df: DataFrame with the columns ``clean_extracted_text``,
            ``entity_value`` and ``entity_name``.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per row that has text. Rows with missing text are skipped.
    """
    training_data = []
    for _, row in df.iterrows():
        text = row['clean_extracted_text']
        # A missing value here is NaN (a float), which has no .find();
        # such rows previously crashed the whole conversion.
        if pd.isna(text):
            continue
        text = str(text)

        entities = []
        if pd.notna(row['entity_value']):
            entity_value = str(row['entity_value'])
            start_index = text.find(entity_value)
            # The emptiness check guards against a zero-length span at offset 0.
            if entity_value and start_index != -1:
                end_index = start_index + len(entity_value)
                entities.append((start_index, end_index, row['entity_name']))
        training_data.append((text, {"entities": entities}))
    return training_data

training_data = create_training_data(ocr_results_df)

# Pack every annotated text into a DocBin so it can be written to disk
doc_bin = DocBin()
for text, annotations in training_data:
    doc = nlp.make_doc(text)
    candidate_spans = (
        doc.char_span(start, end, label=label)
        for start, end, label in annotations.get('entities')
    )
    # Offsets for which char_span returns None are dropped
    doc.ents = [span for span in candidate_spans if span is not None]
    doc_bin.add(doc)

# Persist the annotated docs for the training cells
doc_bin.to_disk("training_data.spacy")


In [None]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

# Reload the annotated docs written by the DocBin cell
doc_bin = DocBin().from_disk("training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

# Reuse the pipeline's NER component, adding it at the end when it is missing
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

# Register every entity label that occurs in the annotations
for _, annotations in training_data:
    for _start, _end, label in annotations.get('entities'):
        ner.add_label(label)

# One Example per (doc, annotations) pair
examples = [
    Example.from_dict(doc, annotations)
    for doc, (text, annotations) in zip(docs, training_data)
]

# Disable other pipes during training to train only NER.
# select_pipes/initialize are the spaCy v3 names for the deprecated
# disable_pipes/begin_training used before.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    for itn in range(100):  # Number of training iterations
        random.shuffle(examples)
        losses = {}  # filled in by nlp.update for this iteration
        batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Pass the optimizer explicitly so it is clear which one is stepping
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print("Losses", losses)

# Save the trained model
nlp.to_disk("ner_model")


In [None]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
from sklearn.metrics import classification_report

# Reload the annotated docs written by the DocBin cell
doc_bin = DocBin().from_disk("training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

# Reuse the pipeline's NER component, adding it at the end when it is missing
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

# Register every entity label that occurs in the annotations
for _, annotations in training_data:
    for _start, _end, label in annotations.get('entities'):
        ner.add_label(label)

# One Example per (doc, annotations) pair
examples = [
    Example.from_dict(doc, annotations)
    for doc, (text, annotations) in zip(docs, training_data)
]

# First 80% of the examples train the model, the remainder validates it
split = int(len(examples) * 0.8)
train_examples, val_examples = examples[:split], examples[split:]

# Function to evaluate the model
def evaluate_model(model, examples):
    """Score predicted entities against the gold annotations, per label.

    A predicted entity counts as correct only when its character span and its
    label both match a gold entity of the same example. The earlier version
    handed two tuple lists of possibly different lengths to
    ``classification_report``, which cannot compare them.

    Args:
        model: Callable mapping a text to an object exposing ``.ents``; each
            entity must provide ``start_char``, ``end_char`` and ``label_``.
        examples: Iterable of objects exposing ``.text`` and ``.reference.ents``.

    Returns:
        dict: ``label -> {'precision', 'recall', 'f1-score', 'support'}`` for
        every label seen in predictions or gold data, plus a ``'micro avg'``
        entry aggregated over all labels. ``support`` is the number of gold
        entities. Ratios with a zero denominator are reported as 0.0.
    """
    def _as_keys(ents):
        # Hashable identity of an entity: where it is and what it is called
        return {(ent.start_char, ent.end_char, ent.label_) for ent in ents}

    def _scores(tp, fp, fn):
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        return {'precision': precision, 'recall': recall, 'f1-score': f1, 'support': tp + fn}

    counts = {}  # label -> [true positives, false positives, false negatives]
    for example in examples:
        predicted = _as_keys(model(example.text).ents)
        gold = _as_keys(example.reference.ents)
        buckets = ((0, predicted & gold), (1, predicted - gold), (2, gold - predicted))
        for slot, keys in buckets:
            for _, _, label in keys:
                counts.setdefault(label, [0, 0, 0])[slot] += 1

    report = {label: _scores(*counts[label]) for label in sorted(counts)}
    totals = [sum(per_label[i] for per_label in counts.values()) for i in range(3)]
    report['micro avg'] = _scores(*totals)
    return report

# Disable other pipes during training to train only NER.
# select_pipes/initialize are the spaCy v3 names for the deprecated
# disable_pipes/begin_training used before.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    for itn in range(100):  # Number of training iterations
        random.shuffle(train_examples)
        losses = {}  # filled in by nlp.update for this iteration
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Pass the optimizer explicitly so it is clear which one is stepping
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {itn+1}: Losses", losses)

        # Evaluate the model every 10 iterations on the held-out examples
        if (itn + 1) % 10 == 0:
            metrics = evaluate_model(nlp, val_examples)
            print(f"Iteration {itn+1}: Evaluation Metrics")
            print(metrics)

# Save the trained model
nlp.to_disk("ner_model")


In [None]:
from sklearn.metrics import classification_report

# Load the trained model
nlp = spacy.load("ner_model")

# Evaluate the final model on the validation set
metrics = evaluate_model(nlp, val_examples)
print("Final Evaluation Metrics")
print(metrics)

# Detailed report: one line per entry of the metrics dict. The earlier code
# built two identical lists from the dict's keys and compared them with each
# other, which always produced a perfect but meaningless report.
report = "\n".join(f"{name}: {scores}" for name, scores in metrics.items())
print(report)

# Test the model on new text
test_text = "The product weight is 12.0 ounce."
doc = nlp(test_text)

# Print every entity the model found, with its label
for ent in doc.ents:
    print(ent.text, ent.label_)
