<a href="https://colab.research.google.com/github/gatienc/multimodal_product_data_classification/blob/main/notebooks/gatien_fusion_model_clip0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Idea
### CLIP1:
In this version I am trying to use only image and designation, I'm applying
Explicit Cross-modal Interaction from 
[Hate-CLIPper: Multimodal Hateful Meme Classification based on Cross-modal Interaction of CLIP Features](https://arxiv.org/pdf/2210.05916.pdf)

### CLIP2: 
diff: added a dropout layer to reduce overfitting


### CLIP3: 
diff: if description exists, we get the mean of the features of description and designation

# Hyperparameters

In [1]:
BATCH_SIZE=128#  8 for 13.8 gb usage, 6 for less than 12 gb usage
NUM_CLASSES=27

CLIP_FEATURE_SIZE=768

#testing
# train_percentage=0.0001
# valid_percentage=0.0001

train_percentage=0.9
valid_percentage=0.1
#(test_percentage takes the rest)

google_colab=False
force_cpu=False
train=False

# Imports


In [2]:
%pip install transformers pandas tqdm scikit-learn imageio matplotlib wget plotly dash

Note: you may need to restart the kernel to use updated packages.


In [3]:
from transformers import CLIPProcessor, CLIPModel,CLIPFeatureExtractor
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torchvision

import imageio

from tqdm.notebook import tqdm
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import f1_score

import zipfile
import os
import copy

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

from datetime import datetime


In [4]:
%matplotlib inline

In [5]:
if google_colab:
  # mount the drive where your dataset is availabledevice = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  from google.colab import drive
  drive.mount('/content/drive')
  filepath='/content/drive/MyDrive/datasets/multimodal_product_classification/' # add your own path. Where to save the dataset

  if not os.path.exists('datasets'):
    os.makedirs('datasets')
    with zipfile.ZipFile(filepath+'images.zip', 'r') as zip_ref:
        zip_ref.extractall('datasets')
            
  datasets_path="/content/datasets/"
  save_directory="/content/drive/MyDrive/Lessons/Models/multimodal_classification/"

else:
  import wget
  if not os.path.exists('datasets'):
    os.makedirs('datasets')
    output_directory="datasets"
    csv_zip = wget.download("https://nextcloud.its-tps.fr/s/BTpB4SC93NreZxg/download/csv_data.zip",out=output_directory)
    images_zip=wget.download("https://nextcloud.its-tps.fr/s/fgBxQczEAZ7ws8J/download/images.zip",out=output_directory)
    
    with zipfile.ZipFile(output_directory+'/csv_data.zip', 'r') as zip_ref:
      zip_ref.extractall('datasets')
    with zipfile.ZipFile(output_directory+'/images.zip', 'r') as zip_ref:
        zip_ref.extractall('datasets')
  filepath=os.getcwd()+'/datasets/'
  save_directory='../models/'
  datasets_path=filepath
  

In [6]:
torch.cuda.mem_get_info()

(15545729024, 15655829504)

In [7]:
if force_cpu:
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# Preprocessing

In [8]:
# Apply cleaning function to the 'designation' column
def clip_to_max_tokenize(text):
    text.split(" ",maxsplit=70)
    text=text[:70]
    text="".join(text)
    return(text)

plot a scatter plot of the representation in term of word as x number of word in designation and as y number of word in description.
if we see that sometimes x is low and y is big, we could fill designation with description


# Data loading

In [9]:
def image_to_tensor(image):
    img=torch.from_numpy(image).float()
    out=img.permute(2,0,1)
    return(out)

In [10]:
class ImageTextDataLoader(Dataset):
    """Title, Description and Image dataset."""

    def __init__(self, dataframe, image_dir):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            image_dir (string): Directory with all the images.
        """
        self.df = dataframe
        self.image_dir = image_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        selected_df=self.df.iloc[idx]

        image_name="image_"+str(selected_df["imageid"])+"_product_"+str(selected_df["productid"])+".jpg"
        filepath=os.path.join(self.image_dir,image_name)
        image_arr = imageio.v3.imread(filepath)

        designation=selected_df['designation']
        description=selected_df['description']
        label=torch.tensor(selected_df['labels'], dtype=torch.float,device=device)

        return [designation,description,image_arr,label]


In [11]:
Clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
Clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [12]:
torch.cuda.mem_get_info()

(13788315648, 15655829504)

In [13]:
def get_images_features(images):
    inputs = Clip_processor(images=images, return_tensors="pt").to(device)
    image_features = Clip_model.get_image_features(**inputs).to(device)
    del inputs
    return(image_features)

def get_text_features(texts):
    inputs = Clip_processor(text=texts, padding=True, return_tensors="pt").to(device)
    text_features = Clip_model.get_text_features(**inputs).to(device)
    del inputs
    return(text_features)

# Model Definition

way 0 : concat the layer
way 1 (future): cross modal concatenation

In [14]:
print(Clip_model.device)

cuda:0


In [15]:
class ClassificationHead(nn.Module):
   def __init__(self, input_dim, num_classes):
       super(ClassificationHead, self).__init__()

       self.head=nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(input_dim,128),
            nn.ReLU(),
            nn.Linear(128,num_classes),
       )
       #self.softmax = nn.Softmax(dim=-1)

   def forward(self, x):
       x = self.head(x)
       return (x)


## eval on test(/eval) dataset

In [16]:
# Load data
X_eval = pd.read_csv(filepath+'X_test.csv').fillna("")
X_eval=X_eval.drop(columns="Unnamed: 0")
X_eval['designation'] = X_eval['designation'].fillna('').apply(clip_to_max_tokenize)
X_eval['description'] = X_eval['description'].fillna('').apply(clip_to_max_tokenize)

X_eval["labels"]=np.zeros((len(X_eval["designation"]),1))

In [17]:
eval_dataset=ImageTextDataLoader(X_eval,datasets_path+"/images/image_test")
print(len(eval_dataset))
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=True)




13812


In [18]:
saved_state_dict = torch.load("/home/onyxia/work/multimodal_product_data_classification/models/clip3-2024-01-15-5.ckpt")
model=ClassificationHead(CLIP_FEATURE_SIZE**2,NUM_CLASSES).to(device)
model.load_state_dict(saved_state_dict)
model.to(device)
model.eval()
torch.no_grad()

<torch.autograd.grad_mode.no_grad at 0x7f36004d3280>

In [19]:
torch.cuda.mem_get_info()

(13184335872, 15655829504)

In [20]:
@torch.no_grad
def eval_model(model,eval_dataloader):

    loop_on_batch=tqdm(eval_dataloader,leave=False,position=0,ncols=800)
    preds=[]
    for batch in loop_on_batch:
        # print(batch)
        designation,description,image_arr,_=batch

        # Iterate over data.
        description_features=[]

        for i in range(len(description)):
            #get the description that are not null
            if len(description[i])>=1:
                # print(f'{description[i]=}')
                description_feature=get_text_features(description[i])
                description_features.append(description_feature[0])
            else:
                description_features.append([])
        images_features=get_images_features(image_arr).unsqueeze(2)
        designation_features=get_text_features(designation).unsqueeze(2)
            

        if len(description_features[0])>0:
            designation_features[0]=(designation_features[0]+description_features[0].unsqueeze(1))/2

        #HERE implement fusion model of designation, description and image_arr
        input_features=torch.matmul(images_features[0] , designation_features[0].T).flatten().to(device).unsqueeze(0)

        for i in range(1,images_features.size(dim=0)):
            # Perform the multiplication and append the result to the results array
            if len(description_features[i])>0:
                designation_features[i]=(designation_features[i]+description_features[i].unsqueeze(1))/2

            feature_interaction_matrix=torch.matmul(images_features[i], designation_features[i].T)
            input_feature=(feature_interaction_matrix).flatten().to(device)
            input_features = torch.cat((input_features, input_feature.unsqueeze(0)), dim=0)

        # forward
        pred=model(input_features)
        print(pred)
        preds.append(pred)


    return preds

In [21]:
preds=eval_model(model,eval_dataloader)

  0%|                                                                                                         …

tensor([[ 1.9758,  0.9648,  0.0364,  ..., -1.7364, -7.1709, -3.8716],
        [-0.5433,  0.2504, -1.8263,  ...,  3.8445, -3.8741, -5.3435],
        [-4.2845,  0.0284,  0.0875,  ...,  7.7207, -1.2421, -3.4965],
        ...,
        [-1.7529, -1.3386, -2.4720,  ...,  4.3807, -3.3234, -5.0707],
        [-1.3170, -1.6225, -1.0445,  ...,  4.0877, -4.2809, -4.4414],
        [-2.0568,  2.1997,  3.9746,  ...,  0.2675, -5.6746, -3.0493]],
       device='cuda:0')
tensor([[-2.7795e+00,  7.9472e-02, -1.6774e+00,  ...,  3.5545e+00,
         -4.9876e+00, -3.5763e+00],
        [ 1.8479e+00, -2.5458e+00, -3.5208e+00,  ..., -3.7875e-01,
         -8.3265e-01, -3.8154e+00],
        [-3.4938e+00,  3.8802e+00,  4.7654e+00,  ...,  4.6404e-01,
         -5.3954e+00, -4.6809e+00],
        ...,
        [-7.2587e-01, -1.0640e+00, -4.4272e+00,  ...,  6.6011e-01,
         -3.0410e+00, -4.4389e+00],
        [-1.7425e+00,  1.3858e+00,  1.5110e+00,  ..., -7.2690e-01,
         -3.2433e+00,  8.7309e-01],
        [-7.11

In [30]:
preds2=[]
for pred in preds:
    preds2.extend(torch.max(pred,1)[1].tolist())

[7, 22, 24, 21, 13, 0, 2, 19, 17, 8, 24, 20, 19, 5, 18, 15, 14, 13, 19, 8, 14, 25, 2, 12, 5, 5, 1, 18, 25, 21, 14, 19, 18, 4, 0, 18, 21, 21, 21, 23, 22, 16, 18, 21, 9, 18, 24, 21, 19, 13, 21, 7, 8, 16, 7, 25, 23, 16, 0, 5, 4, 5, 18, 24, 18, 16, 8, 9, 4, 1, 7, 16, 16, 1, 22, 23, 18, 7, 19, 20, 0, 0, 9, 18, 6, 1, 14, 7, 9, 13, 0, 22, 22, 23, 23, 19, 21, 25, 16, 7, 7, 16, 16, 21, 18, 23, 23, 0, 9, 18, 1, 13, 13, 20, 18, 0, 20, 23, 20, 19, 23, 24, 23, 4, 23, 13, 21, 9]
[7, 22, 24, 21, 13, 0, 2, 19, 17, 8, 24, 20, 19, 5, 18, 15, 14, 13, 19, 8, 14, 25, 2, 12, 5, 5, 1, 18, 25, 21, 14, 19, 18, 4, 0, 18, 21, 21, 21, 23, 22, 16, 18, 21, 9, 18, 24, 21, 19, 13, 21, 7, 8, 16, 7, 25, 23, 16, 0, 5, 4, 5, 18, 24, 18, 16, 8, 9, 4, 1, 7, 16, 16, 1, 22, 23, 18, 7, 19, 20, 0, 0, 9, 18, 6, 1, 14, 7, 9, 13, 0, 22, 22, 23, 23, 19, 21, 25, 16, 7, 7, 16, 16, 21, 18, 23, 23, 0, 9, 18, 1, 13, 13, 20, 18, 0, 20, 23, 20, 19, 23, 24, 23, 4, 23, 13, 21, 9, 7, 16, 20, 12, 14, 16, 23, 5, 21, 23, 1, 18, 2, 20, 5, 3, 9,

In [22]:
df_preds=pd.DataFrame(preds)
df_preds.to_csv("/home/onyxia/work/multimodal_product_data_classification/df_preds.csv")


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.