## Environment setup (Google Colab only)

In [1]:
# Mount Google Drive at /content/drive. `google.colab` is only importable
# inside a Colab runtime, so this cell cannot run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# For emptying trash after each run
# Colab-only cell: authenticates the current user and builds a Drive v3 client.
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
# `drive_service` is kept as a global; the only other reference in this notebook
# is inside the disabled sweep-training code further down.
drive_service = build('drive', 'v3')
# Empties the Drive trash of the authenticated user.
drive_service.files().emptyTrash().execute()
# Shell magics: print the working directory and the GPU status of the runtime.
!pwd
!nvidia-smi

/content
Mon Jan 22 13:18:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                           

## Imports

In [1]:
# Project root directory. Data, image and summary paths below are built by plain
# string concatenation onto this value, so the trailing "/" is required.
# The commented-out value at the end of the line is the Google Drive location.
folder_name = "/home/nlp-lab-ws23/nlp_praktikum/persuasion_technique_detection/" #"/content/drive/MyDrive/persuasion_technique_detection/"

In [2]:
#!pip install transformers datasets wandb evaluate accelerate -qU sklearn_hierarchical_classification sentencepiece

In [3]:
import gc
import re
import json
import numpy as np
import pandas as pd
import random
import torch
import subprocess
import json
import warnings
import shutil
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, ViTFeatureExtractor
from sklearn.metrics import f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset,load_dataset,DatasetDict,concatenate_datasets
import datasets
import os
from torch.nn.functional import sigmoid
from sklearn.preprocessing import LabelBinarizer
from datasets import concatenate_datasets
from sklearn.ensemble import RandomForestClassifier
from transformers import Trainer
from PIL import Image
import pickle
import torch.nn.functional as F
from transformers import AutoModel, AutoConfig,ViTForImageClassification, AutoModelForSequenceClassification, \
AutoImageProcessor,AutoTokenizer,AutoFeatureExtractor,ViTImageProcessor,ViTConfig, BertConfig, VisionTextDualEncoderConfig, VisionTextDualEncoderModel,CLIPImageProcessor

In [4]:
import torch

# Select the compute device and record how many GPUs are visible.
if not torch.cuda.is_available():
    AVAIL_GPUS = 0
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    AVAIL_GPUS = torch.cuda.device_count()
    print(f'There are {AVAIL_GPUS} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3080 Ti


## Login WandB

In [5]:
import os
import wandb

# Log in to Weights & Biases (use wandb.login(relogin=True) to force a new login).
wandb.login()

# W&B settings supplied through environment variables.
os.environ.update({
    "WANDB_PROJECT": "subtask2b",
    "WANDB_ENTITY": "tumnlp",
    "WANDB_LOG_MODEL": "end",
})

[34m[1mwandb[0m: Currently logged in as: [33mmahmudfami[0m ([33mtumnlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Hugging Face checkpoint ids for the text and image base models. They are passed
# to `from_pretrained` below and also embedded in the summary directory name.
#text_checkpoint = "vinai/bertweet-large"
text_checkpoint="microsoft/deberta-v3-large"
img_checkpoint= "google/vit-base-patch32-224-in21k"

In [7]:
# Output directory for this run's prediction files. Both checkpoint ids are part of
# the name, with "/" replaced by "_" so each id stays a single path component.
# The trailing "/" matters: write_json() appends the file name by concatenation.
summary_dir_path = folder_name + "subtask2b/new_summaries/summary_" + text_checkpoint.replace("/","_")+"_"+img_checkpoint.replace("/","_")+"_STACKING_ValForTrainingMeta_RF_Labels/"

## Load data splits and encode labels

In [8]:
# Split files, relative to `folder_name`.
val_path="data/subtask2b/val.json"
train_path="data/subtask2b/train.json"
test_path="data/subtask2b/dev_unlabeled.json"

label2num={"non_propagandistic":0,"propagandistic":1}
num2label={0:"non_propagandistic",1:"propagandistic"}


def _read_split(relative_path):
    """Load the JSON file at ``folder_name + relative_path`` into a DataFrame.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default encoding.
    """
    with open(folder_name + relative_path, encoding="utf-8") as f:
        return pd.DataFrame.from_dict(json.load(f))


def _encode_labels(frame):
    """Return a new frame whose ``label`` column is 1 for "propagandistic", else 0.

    The string column is dropped and the integer column is appended last, which
    keeps the column order the notebook produced before. ``frame`` itself is not
    modified, so re-running the cell is safe.
    """
    encoded = [int(el == "propagandistic") for el in frame["label"]]
    return frame.drop(columns=["label"]).assign(label=encoded)


val_set = _encode_labels(_read_split(val_path))
train_set = _encode_labels(_read_split(train_path))
#mask = train_set['image'] == "prop_meme_24871.png"
#train_set = train_set[~mask]

# The dev split ships without labels, so it is loaded as-is.
dev_unlabeled_set = _read_split(test_path)

print(len(train_set),len(val_set),len(dev_unlabeled_set))


1200 150 300


In [9]:
class CustomDataset(torch.utils.data.Dataset):
    """Meme dataset yielding the id, raw text, processed image and (if labelled) the label.

    Args:
        dataset: table with "id", "text" and "image" columns, plus a "label"
            column when ``dataset_type`` is "train" or "val".
        dataset_type: "train" or "val" for labelled splits; any other value is
            treated as the unlabelled split whose images live in the "dev" folder.
        image_processor: callable invoked as
            ``image_processor(images=<RGB PIL image>, return_tensors="pt")``.
    """

    # Image sub-folder per split; every other dataset_type falls back to "dev".
    _IMAGE_SUBDIRS = {"train": "train", "val": "val"}

    def __init__(self, dataset, dataset_type,image_processor):
        super().__init__()
        self.ids = list(dataset["id"])
        self.texts = list(dataset["text"])
        self.image_paths = list(dataset["image"])
        self.has_labels = dataset_type in ("train", "val")
        if self.has_labels:
            self.labels = dataset["label"].astype(int).tolist()
        self.image_processor = image_processor
        self.dataset_type = dataset_type

    def __len__(self):
        """Number of examples in the split."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(id, text, image_input, label)``, or without ``label`` for unlabelled splits."""
        subdir = self._IMAGE_SUBDIRS.get(self.dataset_type, "dev")
        image_path = folder_name + "data/subtask2b/subtask2b_images/" + subdir + "/" + self.image_paths[idx]

        # convert() returns a loaded copy, so the file handle can be released
        # deterministically instead of being left to garbage collection.
        with Image.open(image_path) as raw_image:
            rgb_image = raw_image.convert("RGB")
        image_input = self.image_processor(images=rgb_image, return_tensors="pt")

        if self.has_labels:
            label = torch.tensor(self.labels[idx], dtype=torch.float32)
            return self.ids[idx], self.texts[idx], image_input, label
        return self.ids[idx], self.texts[idx], image_input

In [10]:
# Text branch: encoder and tokenizer come from the same checkpoint.
text_model = AutoModel.from_pretrained(text_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(text_checkpoint)

# Image branch. The processor class has to match the image checkpoint family:
#   ViTImageProcessor  -> 'google/vit...'
#   CLIPImageProcessor -> "openai/clip..."
#   AutoImageProcessor -> generic fallback
image_model = AutoModel.from_pretrained(img_checkpoint)
image_processor = ViTImageProcessor.from_pretrained(img_checkpoint)
#image_model.config, text_model.config



In [11]:
#image_model.config.hidden_sizes

In [12]:
class TextImageBinaryClassifier(nn.Module):
    """Late-fusion binary classifier over a text encoder and an image encoder.

    Both encoders must return an object with ``last_hidden_state``. Each sequence
    is mean-pooled over dim 1, the two pooled vectors are concatenated and passed
    through a two-layer MLP that emits a single logit per example.

    Args:
        text_model: text encoder exposing ``config.hidden_size``.
        image_model: image encoder exposing ``config.hidden_size``.
    """

    def __init__(self, text_model, image_model):
        super().__init__()
        self.text_model = text_model
        self.image_model = image_model

        # Width of the pooled image embedding. For other backbones use instead:
        #   image_model.config.projection_dim    (openai/clip)
        #   image_model.config.hidden_sizes[-1]  (resnet)
        #   image_model.config.hidden_dim        (efficientnet)
        image_hidden_size = image_model.config.hidden_size  # google/vit

        self.lin1 = nn.Linear(text_model.config.hidden_size + image_hidden_size, 512)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(512, 1)

    def forward(self, text_input, image_input):
        """Return one logit per example.

        Args:
            text_input: mapping of keyword arguments for ``text_model``.
            image_input: mapping of keyword arguments for ``image_model``.

        Raises:
            Whatever either encoder raises. Encoder failures are no longer caught
            and printed: doing so left ``image_embedding`` unbound and replaced
            the real error with an unrelated ``UnboundLocalError``.
        """
        # Text encoding.
        # NOTE: the mean runs over every position of dim 1 with no attention-mask
        # weighting, so padded positions contribute to the average.
        text_outputs = self.text_model(**text_input)
        text_embedding = text_outputs.last_hidden_state.mean(dim=1)

        # Image encoding ('google/vit'). Alternatives for other backbones:
        #   CLIP:   image_embedding = self.image_model.get_image_features(**image_input)
        #   resnet/efficientnet:
        #           feats = self.image_model(**image_input).last_hidden_state
        #           image_embedding = F.adaptive_avg_pool2d(feats, (1, 1)).view(feats.size(0), feats.size(1))
        image_outputs = self.image_model(**image_input)
        image_embedding = image_outputs.last_hidden_state.mean(dim=1)

        # Concatenate text and image embeddings, then classify.
        combined_embedding = torch.cat((text_embedding, image_embedding), dim=1)
        hidden = self.relu(self.lin1(combined_embedding))
        logits = self.lin2(hidden)
        return logits

In [13]:
# Create a custom dataset
train_dataset = CustomDataset(train_set,"train", image_processor)
val_dataset = CustomDataset(val_set,"val", image_processor)
test_dataset = CustomDataset(dev_unlabeled_set,"test", image_processor)

batch_size=2
num_workers=2
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,num_workers=num_workers,pin_memory=True)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,num_workers=num_workers,pin_memory=True,drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,num_workers=num_workers,pin_memory=True)

In [14]:
# Sanity check: number of examples in each split.
print("train size:",len(train_set),"val size:",len(val_set),"test size:",len(dev_unlabeled_set))

train size: 1200 val size: 150 test size: 300


## Training - Meta model

In [15]:
def write_json(path,data,file_name="summary.json"):
  """Serialize ``data`` as pretty-printed UTF-8 JSON to ``<path>/<file_name>``.

  Args:
    path: target directory, with or without a trailing separator. It is created
      (including parents) if missing; an empty string means the current directory.
    data: a dict or list written as-is; any other object must provide
      ``to_dict("records")`` (e.g. a pandas DataFrame) and is written as a list
      of row records.
    file_name: name of the file inside ``path``.
  """
  if not isinstance(data, (dict, list)):
    data = data.to_dict("records")
  if path:
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)
  # os.path.join keeps the file inside `path` even without a trailing slash;
  # plain concatenation would write "<dir><file_name>" next to the directory.
  target = os.path.join(path, file_name)
  # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
  # rather than depending on the platform default.
  with open(target, "w", encoding="utf-8") as output_file:
    json.dump(data, output_file, indent=2, ensure_ascii=False)

In [17]:
# W&B artifact references for the fine-tuned base models. The part before ":" is
# also used as the checkpoint file name when the artifacts are downloaded below.
best_text_model = "best_model_microsoft_deberta-v3-large-TEXTONLY-subtask2b-memes_0.0005learningRate:v0"
#best_text_model= "best_model_vinai_bertweet-large-TEXTONLY-subtask2b-memes_1e-06learningRate:v1"
best_image_model = "best_model_google_vit-base-patch32-224-in21k-subtask2b-memes-IMAGEONLY_3e-06learningRate:v0"

# Bookkeeping record of the stack; "best_meta" stays None until a meta-model is chosen.
best_meta_model = dict(
    best_image_model=best_image_model,
    best_text_model=best_text_model,
    best_meta=None,
)

In [18]:
api = wandb.Api()

# Download the fine-tuned image checkpoint. map_location lets the weights load on
# whatever `device` was selected above, including CPU-only machines.
artifact=api.artifact(best_image_model)
best_image_model_dir_=artifact.download()
best_image_model_dir = os.path.join(best_image_model_dir_, best_image_model.split(":")[0]+".pth")
image_model_state_dict = torch.load(best_image_model_dir, map_location=device)

# Same for the fine-tuned text checkpoint.
artifact=api.artifact(best_text_model)
best_text_model_dir_=artifact.download()
best_text_model_dir = os.path.join(best_text_model_dir_, best_text_model.split(":")[0]+".pth" )
text_model_state_dict = torch.load(best_text_model_dir, map_location=device)

# Rebuild both classifiers and overwrite their freshly initialised heads with the
# fine-tuned weights. These replace the encoder-only `text_model` / `image_model`
# objects created earlier in the notebook.
text_model = AutoModelForSequenceClassification.from_pretrained(text_checkpoint, num_labels=2 ,id2label=num2label,label2id=label2num,ignore_mismatched_sizes=True)
text_model.load_state_dict(text_model_state_dict)
text_model.to(device)
text_model.eval()
image_model = ViTForImageClassification.from_pretrained(img_checkpoint)
image_model.load_state_dict(image_model_state_dict)
image_model.to(device)
image_model.eval()


def _collect_base_votes(dataloader, has_labels):
    """Run both base models over ``dataloader`` and return their hard votes.

    Each logit goes through a sigmoid and is thresholded at 0.5, so every model
    contributes one 0/1 value per output logit for each example.

    Args:
        dataloader: yields ``(id, texts, image_input, label)`` batches when
            ``has_labels`` is true, otherwise ``(id, texts, image_input)``.
        has_labels: whether batches carry a label tensor.

    Returns:
        ``(image_votes, text_votes, targets)`` as Python lists; ``targets`` is
        empty when ``has_labels`` is false.
    """
    image_votes, text_votes, targets = [], [], []
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch in dataloader:
            if has_labels:
                _ids, texts, image_input, label = batch
                targets += label.detach().cpu().tolist()
            else:
                _ids, texts, image_input = batch

            text_input = tokenizer(texts, return_tensors="pt",truncation=True, padding=True,max_length=512)
            text_input = text_input.to(device)
            # The processor adds its own leading batch axis per image; drop it
            # after collation.
            image_input['pixel_values'] = image_input['pixel_values'].squeeze(1)
            image_input = image_input.to(device)

            text_probs = torch.sigmoid(text_model(**text_input).logits)
            image_probs = torch.sigmoid(image_model(**image_input).logits)

            text_votes.extend((text_probs > 0.5).int().cpu().tolist())
            image_votes.extend((image_probs > 0.5).int().cpu().tolist())
    return image_votes, text_votes, targets


# Train-split features are intentionally not computed: the meta-model is fit on
# the validation split only. The empty lists are kept so the names stay defined.
image_preds_train=[]
text_preds_train=[]
y_train=[]

image_preds_val, text_preds_val, y_val = _collect_base_votes(val_dataloader, has_labels=True)
image_preds_test, text_preds_test, _no_targets = _collect_base_votes(test_dataloader, has_labels=False)

# Meta-features: image-model votes first, then text-model votes.
val_x=np.column_stack((image_preds_val,text_preds_val))
test_x=np.column_stack((image_preds_test,text_preds_test))
lb = LabelBinarizer()
y_val=lb.fit_transform(y_val)

[34m[1mwandb[0m: Downloading large artifact best_model_google_vit-base-patch32-224-in21k-subtask2b-memes-IMAGEONLY_3e-06learningRate:v0, 333.73MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.9
[34m[1mwandb[0m: Downloading large artifact best_model_microsoft_deberta-v3-large-TEXTONLY-subtask2b-memes_0.0005learningRate:v0, 1659.85MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:2.0
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch32-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should 

In [None]:
"""def train():
  try:
    # Initialize a new wandb run
    wandb.init()

    # sweep agent inputs config with hyperparameters
    config = wandb.config

    #learning_rate = config.learning_rate
    run_name = config.run_name #+f"_{str(learning_rate)}learningRate"
    wandb.run.name = run_name

    #num_epochs = 10

    clf = RandomForestClassifier()

    #clf.fit(train_x,y_train.ravel())
    clf.fit(val_x,y_val.ravel())
    #preds_val=clf.predict(val_x)
    preds_test=clf.predict(test_x)

    val_f1_macro = f1_score(y_val, preds_val,average="macro")
    val_f1_micro = f1_score(y_val, preds_val,average="micro")
    val_accuracy = accuracy_score(y_val, preds_val)
    print({"val_f1_macro": val_f1_macro,"val_f1_micro":val_f1_micro, "val_accuracy": val_accuracy})
    val_pred_labels=[num2label[el.item()] for el in preds_val]
    val_dataset_=val_set.drop(columns=["label","text","image"])
    val_dataset_["label"]=val_pred_labels
    write_json(summary_dir_path,val_dataset_,"val_preds.json")


    test_pred_labels=[num2label[el.item()] for el in preds_test]
    dev_unlabeled_set_=dev_unlabeled_set.drop(columns=["text","image"])
    dev_unlabeled_set_["label"]=test_pred_labels
    write_json(summary_dir_path,dev_unlabeled_set_,"dev_preds.json")

    # Log metrics to W&B
    wandb.log({"val_f1_macro": val_f1_macro,"val_f1_micro":val_f1_micro, "val_accuracy": val_accuracy})
    summary = {
    "text_checkpoint" : best_text_model,
    "img_checkpoint":best_image_model,
    "best_meta_model" : best_meta_model["best_meta"],
    "train_path" : train_path,
    "val_path":val_path,
    "test_path":test_path,
    "val_results":{"val_f1_macro": val_f1_macro,"val_f1_micro":val_f1_micro, "val_accuracy": val_accuracy}
    }
    write_json(summary_dir_path,summary)
    artifact = wandb.Artifact(f"best_model_{run_name}".replace("/","_"), type="model")
    artifact.add_file(folder_name+f"best_model_{run_name}.pth".replace("/","_"), pickle.dump(clf, open(folder_name+f"best_model_{run_name}.pth".replace("/","_"), 'wb')))
    wandb.run.log_artifact(artifact)
    os.remove(folder_name+f"best_model_{run_name}.pth".replace("/","_"))
    drive_service.files().emptyTrash().execute()
    wandb.finish()
  except Exception as e:
    print(f"Error in training: {str(e)}")"""

In [None]:
"""# Set hyperparams in sweep configurations
run_name=f'{text_checkpoint}-{img_checkpoint}-subtask2b-STACKING'.replace("/","_")
sweep_name=f'sweep_{run_name}'
sweep_config = {
    'method': 'grid',  # can be grid, random, or bayes
    'name' : sweep_name,
    'metric': {
      'name': 'eval/f1_macro',
      'goal': 'maximize'
    },
    'parameters': {
        'run_name': {
            'value' : run_name
        }
    }
}

# Start sweeps with specific configuration
sweep_id = wandb.sweep(sweep_config, project="subtask2b")
wandb.agent(sweep_id, train)
# Get best model of sweep
api = wandb.Api()
sweep = api.sweep(f"subtask2b/{sweep_id}")
best_run = sweep.best_run()

artifacts = best_run.logged_artifacts()

model_artifact = None
for artifact in artifacts:
    if 'model' in artifact.type:  # Adjust the condition based on your setup
        model_artifact = artifact
        break

if model_artifact != None:
  model_artifact_name = model_artifact.name
  print(f"Best Model: {model_artifact_name}")
else:
  warnings.warn(f"No models was found")

# save best model of this node
best_meta_model["best_meta"] = model_artifact_name"""


In [None]:
# Finish the current W&B run, if one is still open.
wandb.finish()

In [19]:
# Meta-model: random forest over the base models' hard votes.
# random_state is pinned so that re-running the notebook reproduces the same
# dev_preds.json; an unseeded forest may differ between runs.
clf = RandomForestClassifier(random_state=42)

# Fit on the validation split (train-split features are not computed above).
# Because the forest is trained on this split, scoring it on val_x would not be
# a held-out estimate.
clf.fit(val_x,y_val.ravel())
preds_test=clf.predict(test_x)

# Map numeric predictions back to label strings and write the submission file.
test_pred_labels=[num2label[el.item()] for el in preds_test]
dev_unlabeled_set_=dev_unlabeled_set.drop(columns=["text","image"])
dev_unlabeled_set_["label"]=test_pred_labels
write_json(summary_dir_path,dev_unlabeled_set_,"dev_preds.json")

In [28]:
from sklearn.linear_model import LogisticRegression

# Alternative meta-model: logistic regression fitted on the validation-split
# votes, then applied to the unlabelled dev split. Note that `clf` and
# `preds_test` from the random-forest cell are overwritten here.
clf = LogisticRegression().fit(val_x, y_val.ravel())
preds_test = clf.predict(test_x)

# Translate numeric predictions into label strings and store them alongside the
# remaining columns in a separate output file.
test_pred_labels = list(map(lambda el: num2label[el.item()], preds_test))
dev_unlabeled_set_ = dev_unlabeled_set.drop(columns=["text","image"]).assign(label=test_pred_labels)
write_json(summary_dir_path, dev_unlabeled_set_, "dev_preds_lr.json")