In [None]:
import os
import time
import json
import logging
import warnings
import pandas as pd
import numpy as np
from glob import glob
from functools import reduce
from sklearn.metrics import f1_score
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, AutoTokenizer,AutoModelForSequenceClassification

%run Balance_Data.ipynb
%run Preprocessing_Functions.ipynb
%run TRAIN_AND_TEST.ipynb



# Functions

In [None]:
#  MPress task Train and Test Function Cross-Dataset

def train_test_binary_crossdataset(df_train, df_tests, model_name, directory, train_name,test_names,task, prompt,lexicon,undersampling=False):
    #-----info-----
    id2label = {0: "NO-MORAL", 1: "MORAL"}
    label2id = {"NO-MORAL": 0, "MORAL": 1}
    print('START TRAINING:\n')
    print('INFO:')
    print(f'Train dataset  {train_name}\n')
    print(f'Lexicon: {lexicon}\n')
    print(f'Prompt: {prompt}\n')
    print(f'Moral: {task}\n')
    print(f'Directory: {directory}\n')
    print(f'Undersampling: {undersampling}\n')


    #-----data preparation-----
    df_train = preprocess_data_binary(df_train, task)
    logging.info(f'Train examples of {train_name}: {df_train.label.value_counts()}')
    for _ in test_names:
        df_tests[_]= preprocess_data_binary(df_tests[_], task)

    if undersampling:
        df_train=undersampling_data(df_train)
        for _ in test_names:
            df_tests[_]= undersampling_data(df_tests[_])
    else:
        pass


    #-----split data-----
    train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)
    print(f'{train_df.shape},{train_name}')
    datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df)}
    for _ in test_names: 
        datasets[_] = Dataset.from_pandas(df_tests[_])
    datasets = DatasetDict(datasets)


    
    #----model-----
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, label2id=label2id)
    tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)
    tokenized_datasets = datasets.map(tokenize_function, batched=True)
    
 
  
    #-----train----
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics)

    
    print('Training...')
    #start_time_all_data = time.time()
    trainer.train()
    #end_time_all_data = time.time()
    #end_time_all_data = (end_time_all_data - start_time_all_data) 
    #print(f'Finished training for {train_name}. Time: {end_time_all_data} seg.\n')
    
    #-----inference-----
    print('Testing...')
    print(f'Datasets for test {test_names}')
    for _ in test_names:
        print(f'Test dataset: {_}')
        predictions = trainer.predict(tokenized_datasets[_])
        predicted_class_ids = predictions.predictions.argmax(axis=1)
        actual_labels = tokenized_datasets[_]["label"]
        results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)
        file_name = f"result_{train_name}_{_}_{lexicon}_moral{task}_{prompt}"
        
         #-----save metrics-----   
        results_directory = os.path.join(directory, file_name)
        os.makedirs(results_directory, exist_ok=True)
        print(f'File name: {file_name}')

        
        with open(os.path.join(results_directory, f'results_classification_{model_name}.json'), 'w') as f:
            json.dump(results, f)
            print(f'Finished test for {_}.\n')
    print(f'FINISH! TRAIN DATASET : {train_name}\n')
    print('-----------------------------------------------------------------------------------------------------------------')


In [None]:
# MultiPress task Train and Test Function Cross-Dataset
def train_test_multiclass_crossdataset(df_train, df_tests, model_name, directory, train_name,test_names, prompt,lexicon,undersampling=False):
    start_time = time.time()
    #-----prepare data-----
    print('START TRAINING:\n')
    print('INFO:')
    print(f'Train dataset  {train_name}\n')
    print(f'Lexicon: {lexicon}\n')
    print(f'Prompt: {prompt}\n')
    print(f'Directory: {directory}\n')
    print(f'Undersampling: {undersampling}\n')

    id2label,label2id = multiclass_task_5()
    print('labels :' )
    print(label2id)

    #pre-procesar datasets de train y test - según la moral
    df_train = label_multiclass5(df_train)
    for _ in test_names:
        df_tests[_]= label_multiclass5(df_tests[_])


    #balancear datos si undersampling=True (varía según el rasgo moral)
    if undersampling:
        df_train=undersampling_data(df_train)
        logging.info(f'Train examples: {df_train.label.value_counts()}')
        for _ in test_names:
            df_tests[_]= undersampling_data(df_tests[_])
            logging.info(f'Test examples of {_} {df_tests[_].label.value_counts()}')
    else:
        pass
        
    logging.info('')

    
        #guardar los datos en objeto Dataset 
    train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)
    print(f'{train_df.shape},{train_name}')
    datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df)}
    for _ in test_names: 
        datasets[_] = Dataset.from_pandas(df_tests[_])

    #tokenizar
    datasets = DatasetDict(datasets)
    tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)
    tokenized_datasets = datasets.map(tokenize_function, batched=True)


    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6, id2label=id2label, label2id=label2id)
    #-----train-----
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
        )
    logging.info('Training...')
    trainer.train()
    
    
    print('Testing...')
    #-----test-----
    print(test_names)
    for _ in test_names:
        print(f'Test dataset: {_}')
        logging.info(f'Test dataset: {_}')
        predictions = trainer.predict(tokenized_datasets[_])
        predicted_class_ids = predictions.predictions.argmax(axis=1)
        actual_labels = tokenized_datasets[_]["label"]
        results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)
        file_name = f"result_multiclass_{train_name}_{_}_{lexicon}_{prompt}"
        results_directory = os.path.join(directory, file_name)
        os.makedirs(results_directory, exist_ok=True)
        print(f'file: {file_name}')
        logging.info('')
        #-----guardar resultados-----
        
        with open(os.path.join(results_directory, f'results_classification_{model_name}.json'), 'w') as f:
            json.dump(results, f)
            print(f'Finished test for {_}.\n')
            logging.info(f'Finished test for {_}')
            logging.info('')
        
    print(f'FINISH! TRAIN DATASET : {train_name}\n')
    print('-----------------------------------------------------------------------------------------------------------------')


# Parameters

In [None]:
# Default output root and class-balancing switch read by the experiment cells.
directory = 'Results_Cross_Dataset'
undersampling = True

# Hyper-parameters shared by every Trainer built in this notebook.
# No checkpoints are saved (save_strategy='no'); evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir=directory,
    learning_rate=2e-5,
    num_train_epochs=15,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    push_to_hub=False,
    save_strategy='no',
)

# Select model: change the index to switch checkpoint.
models = ['bert-base-uncased', 'roberta-base']
model_name = models[1]

# Select prompt: 'noprompt' or a numeric prompt id.
prompts = ['noprompt', 1, 2, 3, 4]
prompt_number = prompts[2]

# Select lexicon.
lexicons = ['baseline', 'moralstrength', 'depechemood', 'moralstrength+depechemood']
lexicon = lexicons[3]

# Model tokenizer and short folder label for the chosen checkpoint.
# `folder` is only assigned for the two known checkpoints.
_MODEL_FOLDERS = {'bert-base-uncased': 'Bert', 'roberta-base': 'Roberta'}
if model_name in _MODEL_FOLDERS:
    folder = _MODEL_FOLDERS[model_name]
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)

# MPress Cross-Dataset

In [None]:
# Load data
folder_path = 'DATASETS'
# RoBERTa results go to a Balanced/Original sub-folder depending on
# `undersampling`; for any other model `directory` keeps its previous value.
# NOTE(review): the sub-folder is always "Roberta_Moralstrength", whatever
# lexicon is selected - confirm this is intended.
if model_name == 'roberta-base':
    if undersampling:
        directory = 'Results_Cross_Dataset/Balanced/Roberta_Moralstrength'
    else:
        directory = 'Results_Cross_Dataset/Original/Roberta_Moralstrength'

# os.listdir returns entries in arbitrary order; sort them so the sequence of
# train/test runs is the same on every machine.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('_dataset.csv'))
test_names = []

In [None]:
# Train and Test: each dataset file is used once for training and every other
# file is used as a test set.
for f in files:
    df_train = pd.read_csv(os.path.join(folder_path, f))
    train_name = f.replace('_dataset.csv', '')
    print(f'TRAIN {train_name}\n')
    print(f'Data example of the original dataset: {df_train.text.loc[0]}')

    # All remaining CSV files are test sets for this training set.
    files2 = [elemento for elemento in files if elemento != f]
    print(f'Test Data: {files2}\n')

    # Dictionary of test dataframes, keyed by dataset name.
    df_tests = {}
    test_names = []
    for f2 in files2:
        test_name = f2.replace('_dataset.csv', '')
        df_tests[test_name] = pd.read_csv(os.path.join(folder_path, f2))
        test_names.append(test_name)

    print(f'KEYS {df_tests.keys()}\n')
    print('-------------------------------')

    # Adapt lexicon and prompt
    df_train = prompt_template(df_train, prompt_number, lexicon)
    print(f'Data example of the dataset: {df_train.text.loc[0]}')
    print(test_names)
    for test_name in test_names:
        df_tests[test_name] = prompt_template(df_tests[test_name], prompt_number, lexicon)

    # Apply function: one binary model per task id (1 to 5).
    for task in range(1, 6):
        # dict.copy() is shallow, so copy every frame: each task must start
        # from the same untouched test data.
        task_tests = {name: frame.copy() for name, frame in df_tests.items()}
        # Pass the configured flag so the data balancing matches the
        # Balanced/Original result directory chosen from the same variable.
        train_test_binary_crossdataset(df_train.copy(), task_tests, model_name, directory, train_name, test_names, task, prompt_number, lexicon, undersampling=undersampling)


# MultiPress Cross-Dataset

In [None]:
# Load data
folder_path = 'DATASETS'
# RoBERTa results go to a Balanced/Original sub-folder depending on
# `undersampling`; for any other model `directory` keeps its previous value.
# NOTE(review): the sub-folder is always "Roberta_Moralstrength", whatever
# lexicon is selected - confirm this is intended.
if model_name == 'roberta-base':
    if undersampling:
        directory = 'Results_Cross_Dataset/Balanced/Roberta_Moralstrength'
    else:
        directory = 'Results_Cross_Dataset/Original/Roberta_Moralstrength'

# os.listdir returns entries in arbitrary order; sort them so the sequence of
# train/test runs is the same on every machine.
files = sorted(file for file in os.listdir(folder_path) if file.endswith('_dataset.csv'))
test_names = []

In [None]:
# Train and Test: each dataset file is used once for training and every other
# file is used as a test set.
for f in files:
    df_train = pd.read_csv(os.path.join(folder_path, f))
    train_name = f.replace('_dataset.csv', '')
    print(f'TRAIN {train_name}\n')
    print(f'Data example of the original dataset: {df_train.text.loc[0]}')

    # All remaining CSV files are test sets for this training set.
    files2 = [elemento for elemento in files if elemento != f]
    print(f'Datos de test: {files2}\n')

    # Dictionary of test dataframes, keyed by dataset name.
    df_tests = {}
    test_names = []
    for f2 in files2:
        test_name = f2.replace('_dataset.csv', '')
        df_tests[test_name] = pd.read_csv(os.path.join(folder_path, f2))
        test_names.append(test_name)

    print(f'KEYS {df_tests.keys()}\n')
    print('-------------------------------')

    # Adapt lexicon and prompt
    df_train = prompt_template(df_train, prompt_number, lexicon)
    print(f'Data example of the dataset: {df_train.text.loc[0]}')
    print(test_names)
    for test_name in test_names:
        df_tests[test_name] = prompt_template(df_tests[test_name], prompt_number, lexicon)

    # Train a single multi-class model for this training set.
    # dict.copy() is shallow, so copy every frame before handing them over.
    run_tests = {name: frame.copy() for name, frame in df_tests.items()}
    # Pass the configured flag so the data balancing matches the
    # Balanced/Original result directory chosen from the same variable.
    train_test_multiclass_crossdataset(df_train.copy(), run_tests, model_name, directory, train_name, test_names, prompt_number, lexicon, undersampling=undersampling)
