<a href="https://colab.research.google.com/github/faezesarlakifar/emotion-recognition/blob/main/ER_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn as nn

from transformers import AutoTokenizer , get_linear_schedule_with_warmup , AutoModel , AutoModelForSequenceClassification

# from preprocessing import Preprocessing
from transformers import AutoTokenizer , AutoModelForSequenceClassification


In [None]:
def pe_predict(test_file, model, tokenizer, max_length=128, threshold=0.5):
    """Predict the primary emotion of every tweet and save the result as CSV.

    Each tweet is tokenized on its own (padded/truncated to ``max_length``) and
    passed through ``model`` on the CPU. The label whose logit is highest is
    reported as the tweet's primary emotion.

    Args:
        test_file: DataFrame with a ``tweet`` column and a ``local_id`` column.
        model: Classifier whose call ``model(input_ids, attention_mask)`` returns
            an object exposing ``.logits`` with one row per example and one
            column per entry of the label map below.
        tokenizer: Hugging Face tokenizer matching ``model``.
        max_length: Token length every tweet is padded or truncated to.
        threshold: Unused; kept only so existing callers keep working.

    Returns:
        DataFrame with the columns ``local_id``, ``tweet`` and
        ``primary_emotion`` (one row per input tweet, in input order).

    Side effects:
        Puts ``model`` in eval mode, moves it to the CPU, and writes the
        returned table to ``predictions_pe.csv`` in the working directory.
    """
    label_dict = {0: 'sadness', 1: 'happiness', 2: 'surprise', 3: 'fear', 4: 'disgust', 5: 'anger', 6: 'other'}

    texts = test_file['tweet'].tolist()
    local_ids = test_file['local_id'].tolist()

    device = torch.device('cpu')
    model.eval()
    model = model.to(device)

    # Tokenize first so the second progress bar measures pure inference time.
    inputs = []
    for text in tqdm(texts):
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' already yields a leading batch dimension of 1;
        # keep it instead of squeezing and re-adding it, which would collapse
        # the sequence dimension as well when max_length == 1.
        inputs.append((encoding['input_ids'].to(device), encoding['attention_mask'].to(device)))

    # Make predictions
    predictions = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(inputs):
            logits = model(input_ids, attention_mask).logits
            predictions.append(label_dict[torch.argmax(logits, dim=1).item()])

    # Create a DataFrame from the collected columns
    columns = ["local_id", "tweet", "primary_emotion"]
    df = pd.DataFrame(
        {"local_id": local_ids, "tweet": texts, "primary_emotion": predictions},
        columns=columns,
    )
    df.to_csv('predictions_pe.csv', index=False)
    return df


In [None]:
def ae_predict(test_file, model, tokenizer, threshold=0.15):
    """Predict a 0/1 flag for each of six emotions for every tweet.

    Every tweet is tokenized and scored on its own on the CPU. The model
    output is passed through a sigmoid; an emotion is flagged (1) when its
    probability is strictly greater than ``threshold``.

    Args:
        test_file: DataFrame with a ``tweet`` column and a ``local_id`` column.
        model: Module whose call ``model(**tokenizer_output)`` returns a logits
            tensor with one row and one column per emotion, in the order
            Anger, Fear, Happiness, Hatred, Sadness, Wonder.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        threshold: Probability above which an emotion is flagged. Defaults to
            the previously hard-coded value of 0.15.

    Returns:
        DataFrame with the columns ``local_id``, ``tweet``, ``primary_emotion``
        (the capitalised target name with the highest probability) followed by
        the integer flag columns ``anger``, ``fear``, ``happiness``,
        ``disgust``, ``sadness`` and ``surprise``. The columns are present even
        when ``test_file`` has no rows.

    Side effects:
        Puts ``model`` in eval mode and moves it to the CPU.
    """
    device = torch.device('cpu')
    targets = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]

    model.eval()
    model = model.to(device)

    results = []
    for local_id, text in zip(test_file["local_id"].tolist(), test_file["tweet"].tolist()):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        # Keep the inputs on the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            probabilities = torch.sigmoid(model(**inputs))

        # Find the emotion with the highest predicted value
        primary_emotion = targets[torch.argmax(probabilities).item()]

        # Apply the threshold; [0] drops the batch dimension of size one
        flags = (probabilities > threshold).cpu().numpy().tolist()[0]

        result = {
            "local_id": local_id,
            "tweet": text,
            "primary_emotion": primary_emotion,
        }
        for index, emotion in enumerate(targets):
            # Convert True/False to 1/0
            result[emotion] = int(flags[index])

        results.append(result)

    # Naming the columns explicitly keeps the schema stable for empty input.
    result_df = pd.DataFrame(results, columns=["local_id", "tweet", "primary_emotion", *targets])
    return result_df.rename(columns={"Anger":'anger', "Sadness":'sadness', "Wonder":'surprise', "Happiness":'happiness', "Fear":'fear', "Hatred":'disgust'})


In [None]:
# @markdown load data
from tqdm.notebook import tqdm
import pandas as pd
import os

progress = tqdm(total=1)

!wget https://raw.githubusercontent.com/faezesarlakifar/emotion-recognition/main/final_test.csv -q

progress.set_description(f"Downloading file")
progress.update(1)

progress.close()

test_data=pd.read_csv('final_test.csv')
test_data = test_data.drop('Unnamed: 0', axis=1)


test_data.head()

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,local_id,tweet
0,1660281900949291009,گزارش اردی‌بهشت‌ماه در حال آماده‌سازی‌ست و فرد...
1,1635650914840195072,به بهانه خشونت اینستاگرام حتی اجازه انتشار عکس...
2,1467190780808904711,زمانیکه بازخورد و انعکاسی نمی‌گیرید و در نتیجه...
3,1637574629991563264,در هشتمین نوروز در زندان اگر آزادی می‌خواهیم م...
4,1505080291005984768,آقا و خانمی که تو مجازی داری عمرتو هدر میدی هی...


In [None]:
# Load the xlm-roberta-large tokenizer. The same instance is passed to both
# pe_predict and ae_predict in the prediction cells.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
class XLMRobertaGRU(nn.Module):
    """xlm-roberta-large encoder followed by a single-layer GRU and a linear head.

    The encoder's per-token hidden states are run through the GRU; the GRU
    output at the last sequence position goes through dropout and a linear
    layer that produces ``num_classes`` raw scores per example.

    Note: the last position is taken as-is. ``attention_mask`` is only given to
    the encoder, so for padded inputs the selected position is a padding slot.
    """

    def __init__(self, num_classes):
        """Create the encoder, GRU, linear head and dropout (p=0.2)."""
        super().__init__()
        self.num_classes = num_classes
        self.xlmroberta = AutoModel.from_pretrained('xlm-roberta-large')

        # GRU and head both work at the encoder's hidden width.
        width = self.xlmroberta.config.hidden_size
        self.gru = nn.GRU(width, hidden_size=width, num_layers=1, batch_first=True)
        self.linear = nn.Linear(width, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the GRU state at the last position."""
        encoder_out = self.xlmroberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        token_states = encoder_out.last_hidden_state

        # batch_first=True, so axis 1 of the GRU output is the sequence axis.
        sequence_states, _ = self.gru(token_states)
        final_state = sequence_states[:, -1, :]

        return self.linear(self.dropout(final_state))

In [None]:
def XLMRobertaLargeForClassification(num_labels=7):
    """Return ``xlm-roberta-large`` loaded with a sequence-classification head of ``num_labels`` outputs."""
    return AutoModelForSequenceClassification.from_pretrained(
        'xlm-roberta-large',
        num_labels=num_labels,
    )

In [None]:
# Build the 7-label primary-emotion classifier. Its weights are replaced by the
# checkpoint loaded from PE_PATH in a later cell.
model_pe = XLMRobertaLargeForClassification(num_labels=7)

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the 6-output GRU model. Its weights are replaced by the checkpoint
# loaded from AE_PATH in a later cell.
model_ae = XLMRobertaGRU(num_classes=6)

In [None]:
!pip install gdown



In [None]:
import gdown

# Google Drive share link of the checkpoint that is later loaded into model_pe
shared_link = 'https://drive.google.com/file/d/1zzm9voS2ILDeux8q32xNMv03pEqN2hkd/view?usp=drivesdk'

# Extract the file ID from the shared link (the path segment right before '/view...')
file_id = shared_link.split('/')[-2]

# Construct the direct download link
download_link = f'https://drive.google.com/uc?id={file_id}'

# Local file the checkpoint is saved to
destination_path = 'model_pe.pt'  # must match PE_PATH, which the loading cell reads

# Download the model file
gdown.download(download_link, destination_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1zzm9voS2ILDeux8q32xNMv03pEqN2hkd
To: /content/model_pe.pt
100%|██████████| 2.24G/2.24G [00:24<00:00, 91.9MB/s]


'model_pe.pt'

In [None]:
import gdown

# Google Drive share link of the checkpoint that is later loaded into model_ae
shared_link = 'https://drive.google.com/file/d/1mnB7fNxlu-PD1MgCTSCCT_ki7fcX6X9v/view?usp=sharing'

# Extract the file ID from the shared link (the path segment right before '/view...')
file_id = shared_link.split('/')[-2]

# Construct the direct download link
download_link = f'https://drive.google.com/uc?id={file_id}'

# Local file the checkpoint is saved to
destination_path = 'model_ae.pth'  # must match AE_PATH, which the loading cell reads

# Download the model file
gdown.download(download_link, destination_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1mnB7fNxlu-PD1MgCTSCCT_ki7fcX6X9v
To: /content/model_ae.pth
100%|██████████| 2.26G/2.26G [00:19<00:00, 115MB/s]


'model_ae.pth'

In [None]:
# Local checkpoint files written by the two gdown download cells.
AE_PATH = 'model_ae.pth'
PE_PATH = 'model_pe.pt'

In [None]:
# Read the downloaded weights onto the CPU and copy them into model_ae.
# NOTE(review): torch.load unpickles the file, and this one comes from a Drive
# download; consider weights_only=True if the installed torch supports it.
checkpoint_ae = torch.load(AE_PATH, map_location=torch.device('cpu'))
model_ae.load_state_dict(checkpoint_ae)

<All keys matched successfully>

In [None]:
# Read the downloaded weights onto the CPU and copy them into model_pe.
# NOTE(review): torch.load unpickles the file, and this one comes from a Drive
# download; consider weights_only=True if the installed torch supports it.
checkpoint_pe = torch.load(PE_PATH, map_location=torch.device('cpu'))
model_pe.load_state_dict(checkpoint_pe)

<All keys matched successfully>

In [None]:
# Alias of the loaded test frame; this is the input handed to both prediction functions.
test_file = test_data

In [None]:
# NOTE(review): the next cell makes this same call and keeps the result; here the
# returned frame is only displayed, so the full inference pass runs twice.
pe_predict(test_file,model_pe,tokenizer)

In [None]:
# One primary-emotion label per tweet; pe_predict also writes predictions_pe.csv.
pe_prediction = pe_predict(test_file,model_pe,tokenizer)

In [None]:
# Preview the first rows of the primary-emotion predictions.
pe_prediction.head()

Unnamed: 0,local_id,tweet,primary_emotion
0,1660281900949291009,گزارش اردی‌بهشت‌ماه در حال آماده‌سازی‌ست و فرد...,other
1,1635650914840195072,به بهانه خشونت اینستاگرام حتی اجازه انتشار عکس...,other
2,1467190780808904711,زمانیکه بازخورد و انعکاسی نمی‌گیرید و در نتیجه...,sadness
3,1637574629991563264,در هشتمین نوروز در زندان اگر آزادی می‌خواهیم م...,other
4,1505080291005984768,آقا و خانمی که تو مجازی داری عمرتو هدر میدی هی...,anger


In [None]:
# Per-emotion 0/1 flags from the GRU model.
ae_prediction = ae_predict(test_file,model_ae,tokenizer)
# Replace ae_predict's own primary_emotion with the label from the 7-way classifier.
# The assignment aligns rows by index; both frames are built with a default index.
ae_prediction['primary_emotion'] = pe_prediction['primary_emotion']

In [None]:
# Select the output columns in their final order.
columns = ["local_id", "tweet", "primary_emotion", "anger", "disgust", "fear", "sadness", "happiness", "surprise"]
final_result = ae_prediction[columns]
# NOTE(review): unlike the CSV written by pe_predict, this call does not pass
# index=False, so the row index is saved as an extra unnamed first column —
# confirm that this is the intended file layout.
final_result.to_csv('final_result.csv')
final_result.head()

Unnamed: 0,local_id,tweet,primary_emotion,anger,disgust,fear,sadness,happiness,surprise
0,1660281900949291009,گزارش اردی‌بهشت‌ماه در حال آماده‌سازی‌ست و فرد...,other,0,0,1,1,1,0
1,1635650914840195072,به بهانه خشونت اینستاگرام حتی اجازه انتشار عکس...,other,1,1,1,1,0,1
2,1467190780808904711,زمانیکه بازخورد و انعکاسی نمی‌گیرید و در نتیجه...,sadness,1,0,0,1,0,0
3,1637574629991563264,در هشتمین نوروز در زندان اگر آزادی می‌خواهیم م...,other,1,1,0,1,0,0
4,1505080291005984768,آقا و خانمی که تو مجازی داری عمرتو هدر میدی هی...,anger,1,1,0,1,0,0
