<a href="https://colab.research.google.com/github/deepak4728/Amazon-ML-Challenge-2024/blob/main/amazon_ml_challange_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Libraries Import**

In [None]:
# Mount Google Drive at /content/drive; the checkpoint and manually saved model
# files used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import random
import cv2
# import pytesseract
import requests
from io import BytesIO
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint


# Module-level LabelEncoder shared by the cells below; it is fitted on the
# 'entity_value_unit' column once the training CSV has been loaded.
label_encoder=LabelEncoder()

In [None]:
# Destination file for the ModelCheckpoint callback (native .keras format),
# located on the mounted Google Drive.
model_save_path = '/content/drive/MyDrive/amazon ML model/model_checkpoint.keras'

In [None]:
# Checkpoint callback: stores the complete model (not only its weights) at
# model_save_path, and only on epochs where the monitored validation loss
# reaches a new minimum.
# val_loss is only reported when model.fit is given validation data.
checkpoint = ModelCheckpoint(
    model_save_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

### **Model definition**

In [None]:
class myModel(tf.keras.Model):
    """Three-input, two-output network for predicting a value and its unit.

    ``call`` expects a 3-element sequence, in this order:

    * image       -- reshaped to ``(-1, 224, 224, 1)``, so a flattened
                     224*224 grayscale image per sample is accepted.
    * group_id    -- reshaped to ``(-1, 1)``: one scalar per sample.
    * entity_name -- reshaped to ``(-1, 1)``: one scalar code per sample.

    It returns ``(numerical_value, unit_probabilities)`` with shapes
    ``(batch, 1)`` and ``(batch, num_units)``.

    Args:
        num_units: Number of unit classes produced by the softmax head.
            Defaults to 38, the value that was previously hard-coded.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, num_units=38, **kwargs):
        super(myModel, self).__init__(**kwargs)
        self.num_units = num_units

        # Image branch: two conv/pool stages followed by a dense projection.
        self.conv1 = layers.Conv2D(32, (3, 3), activation='relu', name="conv1")
        self.pool1 = layers.MaxPooling2D((2, 2), name="maxpooling1")
        self.conv2 = layers.Conv2D(64, (3, 3), activation='relu', name="conv2")
        self.pool2 = layers.MaxPooling2D((2, 2), name="maxpooling2")
        self.flatten = layers.Flatten(name="cnn_flatten")
        self.image_dense = layers.Dense(128, activation='relu', name="cnn_dense")

        # Group ID branch: a single scalar fed through a small dense layer.
        self.group_id_dense = layers.Dense(16, activation='relu', name="group_dense")

        # Entity-name branch: the integer code is treated as one numeric
        # feature and fed through a dense layer (no embedding is used).
        # TODO(review): an Embedding layer is the usual choice for categorical ids.
        self.entity_name_dense = layers.Dense(16, activation='relu', name="entity_dense")

        # Shared dense layer applied after the three branches are merged.
        self.dense1 = layers.Dense(128, activation='relu', name="shared_dense")

        # Output heads: one linear regression value and a softmax over units.
        self.numerical_output = layers.Dense(1, activation='linear', name="numerical_output")
        self.unit_output = layers.Dense(num_units, activation='softmax', name="unit_output")

    def get_config(self):
        """Return the constructor arguments so Keras can re-create the model."""
        config = super(myModel, self).get_config()
        config.update({"num_units": self.num_units})
        return config

    def call(self, inputs):
        """Run the forward pass; see the class docstring for input/output layout."""
        image_input, group_id_input, entity_name_input = inputs

        # Integer ids arrive as int arrays; cast everything to the model's
        # compute dtype explicitly instead of relying on implicit layer casts.
        dtype = self.compute_dtype
        image_input = tf.cast(image_input, dtype)
        group_id_input = tf.cast(group_id_input, dtype)
        entity_name_input = tf.cast(entity_name_input, dtype)

        # Restore the 4-D (batch, height, width, channels) layout for Conv2D.
        x = tf.reshape(image_input, [-1, 224, 224, 1])
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.image_dense(x)

        # One scalar per sample -> (batch, 1) -> dense features.
        group_id_processed = self.group_id_dense(tf.reshape(group_id_input, (-1, 1)))
        entity_name_processed = self.entity_name_dense(tf.reshape(entity_name_input, (-1, 1)))

        # Merge the three feature vectors along the feature axis. tf.concat
        # avoids constructing a new Concatenate layer on every call.
        combined = tf.concat([x, group_id_processed, entity_name_processed], axis=-1)

        z = self.dense1(combined)

        numerical_value_output = self.numerical_output(z)  # (batch, 1), linear
        unit_output = self.unit_output(z)  # (batch, num_units), softmax

        return numerical_value_output, unit_output


### ***`for numerical value and unit extraction from entity column`***

In [None]:
# import pandas as pd

# # Load the Excel file
# file_path = '/content/train_splitted.xlsx'
# df = pd.read_excel(file_path)

# def unit_extract(lst):
#   lst=lst.split(" ")
#   try:
#       # Try to convert the element at index -2 to an integer
#       float(lst[-2])

#       # If successful, return the element at index -1
#       return lst[-1]
#   except (ValueError, IndexError):
#       try:
#           if(lst[-2][-1]=="]"):
#               return lst[-1]
#               # Try to check if element at -2 can be converted to a list
#           else:
#               # Otherwise, concatenate the elements at -2 and -1
#               return lst[-2] + " " + lst[-1]
#       except:
#           # Fallback case for other unexpected issues

#           return lst[-2] + " " + lst[-1]

# # Apply the unit extraction function to each row
# df['extracted_unit'] = df['entity_value'].apply(unit_extract)

# # Save the updated dataframe to a new Excel file
# output_file_path = '/content/train_cleaned.xlsx'
# df.to_excel(output_file_path, index=False)

# # Return the path of the updated file
# output_file_path

Function for entity_name to integer mapping

### **Image download and preprocess function**

In [None]:
def download_and_preprocess_image(url, target_size=(224, 224), timeout=10):
    """Download an image and return it as a normalised grayscale array.

    Args:
        url: Address of the image to fetch.
        target_size: ``(width, height)`` passed to ``cv2.resize``.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block a whole batch.

    Returns:
        A 2-D float array with values in ``[0, 1]``, or ``None`` when the
        download or the decoding fails (a message is printed in that case).

    Note:
        Grayscale conversion is done by PIL (``convert("L")``), which works
        for every image mode (RGB, RGBA, palette, already-gray). The previous
        code applied ``cv2.COLOR_BGR2GRAY`` to PIL's RGB array, which swapped
        the red/blue weights and raised on non-3-channel images, so pixel
        values for colour images differ slightly from older runs.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download image from {url}")
            return None

        # Decode and collapse to a single luminance channel.
        with Image.open(BytesIO(response.content)) as pil_image:
            gray = np.array(pil_image.convert("L"))

        # Bring every image to the fixed size expected by the model.
        gray = cv2.resize(gray, target_size)

        # Scale 8-bit intensities to [0, 1].
        return gray / 255.0
    except Exception as e:
        # Best effort: a bad URL or corrupt image must not abort the batch.
        print(f"Error processing image from {url}: {e}")
        return None

### **Batch process function that returns processed data as lists for model fit**

In [None]:
def batch_process(batch_df, b_num, unit_encoder=None, entity_encoder=None):
    """Download the images of one DataFrame chunk and build model inputs/targets.

    Args:
        batch_df: Rows to process. The columns ``image_link``, ``entity_name``,
            ``group_id``, ``entity_value_numeric`` and ``entity_value_unit``
            are read.
        b_num: Zero-based chunk number, used only for the progress message.
        unit_encoder: Fitted encoder used to turn unit strings into class ids.
            Defaults to the module-level ``label_encoder``. It is only
            *applied* here, never refitted, so unit ids are identical in every
            batch and ``inverse_transform`` stays valid after training.
        entity_encoder: Optional fitted encoder for entity names. When omitted
            a fresh encoder is fitted on this batch alone.
            NOTE(review): per-batch fitting only yields consistent codes if
            every batch contains the same set of entity names; pass an encoder
            fitted on the full column to guarantee it.

    Returns:
        ``([images, group_ids, encoded_entity_names],
        [numeric_values, encoded_units])`` where ``images`` has shape
        ``(n, 50176)`` for the default 224x224 image size and the other
        entries are 1-D arrays of length ``n``. Rows whose image cannot be
        downloaded are skipped; all arrays are empty when no row survives.
    """
    # Resolve the default at call time so the encoder fitted after this
    # function was defined is the one that gets used.
    if unit_encoder is None:
        unit_encoder = label_encoder

    # Collect plain Python lists and convert once at the end; growing numpy
    # arrays with np.append inside the loop copies them on every row.
    images, group_ids, entity_names, numeric_values, units = [], [], [], [], []

    for index, row in batch_df.iterrows():
        image = download_and_preprocess_image(row['image_link'])
        if image is None:
            print(f"Image at index {index} could not be processed.")
            continue

        images.append(image.reshape(-1))  # flatten to one vector per image
        group_ids.append(row["group_id"])
        entity_names.append(row["entity_name"])
        numeric_values.append(row["entity_value_numeric"])
        units.append(row["entity_value_unit"])

    print(f"Processed batch: {b_num + 1}")

    if not images:
        # Nothing usable in this chunk: return correctly shaped empty arrays
        # (50176 = 224 * 224) instead of failing on undefined variables.
        return (
            [np.empty((0, 50176)), np.array([], dtype=int), np.array([], dtype=int)],
            [np.array([], dtype=float), np.array([], dtype=int)],
        )

    image_list = np.stack(images)
    group_id_list = np.array(group_ids)
    entity_value_num_list = np.array(numeric_values, dtype=float)

    entity_name_list = np.array(entity_names, dtype=str)
    if entity_encoder is None:
        encoded_entity_names = LabelEncoder().fit_transform(entity_name_list)
    else:
        encoded_entity_names = entity_encoder.transform(entity_name_list)

    # Keep the raw values (object dtype) so they match what the encoder saw
    # when it was fitted on the DataFrame column.
    encoded_entity_unit = unit_encoder.transform(np.array(units, dtype=object))

    return [image_list, group_id_list, encoded_entity_names], [entity_value_num_list, encoded_entity_unit]



### **Training data load and label encoder fit on entity units**

In [None]:
# Read the prepared training table, then fit the shared label encoder on the
# complete 'entity_value_unit' column.
TRAIN_CSV = "train_modified_csv.csv"
df = pd.read_csv(TRAIN_CSV)
label_encoder.fit(df['entity_value_unit'])

### **Model object creation and compilation**

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import custom_object_scope  # Import custom_object_scope

# Restore the partially trained model.
#
# load_model() cannot rebuild this subclassed model from the legacy .h5 file:
# the freshly constructed model has no variables yet, which produces
# "Layer count mismatch when loading weights from file. Model expected 0
# layers, found 8 saved layers." Instead, create the model, run one dummy
# forward pass so every layer creates its weights, and then load the saved
# weights into it.
pretrained_weights_path = '/content/drive/MyDrive/amazon ML model/manual save/0_10_model.h5'

model = myModel()

# Dummy batch of one sample: flattened 224x224 image, group id, entity code.
model((
    tf.zeros((1, 224 * 224)),
    tf.zeros((1,)),
    tf.zeros((1,)),
))
model.load_weights(pretrained_weights_path)

# Compile the model. Losses and metrics are listed per output, in the order
# the model returns them: (numerical value, unit class).
model.compile(
    optimizer='adam',
    loss=['mean_absolute_error', 'sparse_categorical_crossentropy'],
    metrics=[['mean_absolute_error'], ['accuracy']]
)

ValueError: Layer count mismatch when loading weights from file. Model expected 0 layers, found 8 saved layers.

# **Batches with model fit**

In [None]:

batch_size = 500  # Rows of df downloaded and fitted per chunk

# Calculate the number of chunks needed to cover the whole DataFrame
num_batches = int(np.ceil(len(df) / batch_size))

# Train on the first five chunks only (fewer if the DataFrame is shorter).
for batch_num in range(min(5, num_batches)):
    print(f"Processing batch {batch_num + 1}/{num_batches}...")

    # Row range of the current chunk
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(df))

    batch_df = df.iloc[start_idx:end_idx]
    xa, yb = batch_process(batch_df, batch_num)

    # At least two samples are needed to hold one out for validation.
    if len(xa[0]) < 2:
        print(f"Batch {batch_num + 1} skipped: too few usable images.")
        continue

    # The checkpoint callback monitors 'val_loss', which Keras only reports
    # when validation data exists; hold out the last 10% of each chunk so the
    # best model is actually saved.
    model.fit(
        x=xa,
        y=yb,
        batch_size=32,
        epochs=20,
        verbose=1,
        validation_split=0.1,
        callbacks=[checkpoint],
    )
    print(f"Batch {batch_num + 1} fitted.")

In [None]:

batch_size = 500  # Rows of df downloaded and fitted per chunk

# Calculate the number of chunks needed to cover the whole DataFrame
num_batches = int(np.ceil(len(df) / batch_size))

# Continue training with chunks 10..19 (zero-based), clamped to the data size.
first_batch, last_batch = 10, 20

for batch_num in range(first_batch, min(last_batch, num_batches)):
    print(f"Processing batch {batch_num + 1}/{last_batch}...")

    # Row range of the current chunk
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(df))

    batch_df = df.iloc[start_idx:end_idx]
    xa, yb = batch_process(batch_df, batch_num)

    # At least two samples are needed to hold one out for validation.
    if len(xa[0]) < 2:
        print(f"Batch {batch_num + 1} skipped: too few usable images.")
        continue

    # The checkpoint callback monitors 'val_loss', which Keras only reports
    # when validation data exists; hold out the last 10% of each chunk so the
    # best model is actually saved.
    model.fit(
        x=xa,
        y=yb,
        batch_size=32,
        epochs=20,
        verbose=1,
        validation_split=0.1,
        callbacks=[checkpoint],
    )
    print(f"Batch {batch_num + 1} fitted.")

### **Model Test and prediction**

In [None]:
# Manual snapshot of the current model on Google Drive (HDF5 file; the
# TensorFlow SavedModel format would be an alternative).
manual_save_path = '/content/drive/MyDrive/amazon ML model/manual save/my_model.h5'
model.save(manual_save_path)




In [None]:
# batch_df = df.iloc[1160:1171]
# xa, yb = batch_process(batch_df, 0)
# a, b=model.predict(xa)
# print(a)
# predicted_units_int = np.argmax(b, axis=1)
# b=label_encoder.inverse_transform(predicted_units_int)
# print(b)

In [None]:
# print(x[0].shape)
# print(xa[2].ndim)
# print(x[2].shape)
# print(y[0].shape)
# print(y[1].shape)

### **Test data load and model Testing on Test data using batches**

In [None]:
# Load the dataset; this rebinds the global `df` that the training cells used.
# NOTE(review): the section heading speaks of test data, but the file read here
# is "train.csv" -- confirm which file is intended.
df = pd.read_csv("train.csv")

# Model Evaluation

In [None]:
# # Evaluate the model using the F1 score
# def evaluate_f1(y_true, y_pred):
#     true_positives = np.sum((y_true != "") & (y_pred != "") & (y_true == y_pred))
#     false_positives = np.sum((y_true != "") & (y_pred != "") & (y_true != y_pred))
#     false_negatives = np.sum((y_true != "") & (y_pred == ""))
#     true_negatives = np.sum((y_true == "") & (y_pred == ""))

#     precision = true_positives / (true_positives + false_positives)
#     recall = true_positives / (true_positives + false_negatives)

#     f1 = 2 * precision * recall / (precision + recall)
#     return f1

# y_true = test_df['entity_value']
# y_pred = output_predictions
# f1 = evaluate_f1(y_true, y_pred)
# print(f"F1 score: {f1:.4f}")

In [None]:
# Render a diagram of the model. Requires `model` to exist in the kernel, i.e.
# the cell that creates and compiles it must have run successfully first.
from tensorflow.keras.utils import plot_model
plot_model(model)

NameError: name 'model' is not defined