In [1]:
import time
import pandas as pd
import torch.cuda
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay
import os
import json
import matplotlib.pyplot as plt
import torch
import warnings
from multiprocessing import freeze_support
# Show whether PyTorch can see a CUDA device before any training cell runs.
print(torch.cuda.is_available())
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides warnings raised by pandas/sklearn in later cells.
warnings.filterwarnings("ignore")

True


# Train/Eval loop

In [2]:
def merge_and_relable(old_data, custom_data, users):
    """Stack the first ``users`` original authors with ``custom_data`` and relabel.

    Args:
        old_data: Frame with at least ``id`` and ``text`` columns. Only rows
            whose ``id`` lies in ``[0, users - 1]`` are kept.
        custom_data: Frame appended below the filtered rows. Its ids
            ``101 .. 100 + users`` are rewritten to ``users .. 2 * users - 1``
            so the merged labels form one contiguous range.
        users: Number of original authors to keep, which is also the number of
            custom ids that get relabelled.

    Returns:
        A new DataFrame (row index not reset) holding both sources.

    Note:
        For ``users == 5`` the mapping is exactly ``{101: 5, ..., 105: 9}``.
        Ids in ``custom_data`` above ``100 + users`` are left untouched.
    """
    # Select the two columns and the author range in one step.
    originals = old_data.loc[old_data['id'].between(0, users - 1), ["id", "text"]]
    merged = pd.concat([originals, custom_data], axis=0)
    # Derive the mapping from `users` instead of hard-coding the 5-author case,
    # so the custom labels always start right after the last original author.
    label_map = {101 + offset: users + offset for offset in range(users)}
    merged['id'] = merged['id'].replace(label_map)
    return merged

In [3]:
# Original-author reviews; later cells read its "id" and "text" columns.
imdb100 = pd.read_csv('data/imdb100.csv')

def train_predict(iteration, train, test):
    """Fine-tune a fresh ``bert-base-cased`` classifier and predict on ``test``.

    Args:
        iteration: Number of target classes; forwarded as ``num_labels``.
        train: Training data, passed unchanged to ``model.train_model``.
        test: Frame with a ``text`` column; its values are the prediction inputs.

    Returns:
        The ``(preds, model_output)`` pair returned by ``model.predict``.
    """
    model = ClassificationModel(
        "bert",
        "bert-base-cased",
        # Use the GPU when one is visible, otherwise fall back to CPU instead
        # of failing at model construction.
        use_cuda=torch.cuda.is_available(),
        num_labels=iteration,
        args={'overwrite_output_dir': True, 'num_train_epochs': 5},
    )

    model.train_model(train)
    preds, model_output = model.predict(test['text'].tolist())
    return preds, model_output

In [4]:
def iterate_authors_custom(data: pd.DataFrame, authors_considered: list, output_file: str, random_state=None):
    """Train and evaluate one classifier per entry of ``authors_considered``.

    For every entry the data is split 80/20 (stratified on ``id``), a model is
    trained through ``train_predict`` and accuracy / macro-F1 are computed on
    the held-out part.

    Args:
        data: Frame with ``text`` and ``id`` columns; ``id`` is the class label.
        authors_considered: Each element is passed to ``train_predict`` as its
            first argument (the number of labels) and is used in the confusion
            matrix title and file name.
        output_file: Stem for every file this function writes.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the split unseeded, as before.

    Side effects:
        Creates ``results/`` and ``scores/`` if needed and writes
        ``results/Confusion Matrix {output_file}_{n}.png`` per entry,
        ``scores/scores_{output_file}.csv`` and ``results/{output_file}.json``.
    """
    model_outputs = []
    predictions = []
    y_true = []
    scores = []
    os.makedirs("results", exist_ok=True)
    os.makedirs("scores", exist_ok=True)

    # Neither the frame nor its label count changes between iterations.
    selected_df = data[['text', 'id']]
    unique_ints_count = selected_df['id'].nunique()

    for iteration in authors_considered:
        print(50 * "-")
        print("Number of authors:", unique_ints_count)
        # Stratified split keeps the class distribution similar in both parts.
        train, test = train_test_split(
            selected_df,
            test_size=0.2,
            stratify=selected_df["id"],
            random_state=random_state,
        )
        y_true.append(test["id"].tolist())
        print(f"Shape train: {train.shape}, Shape test: {test.shape}")
        # value_counts() sorts by frequency, so position 0 is the most common
        # class. Using .loc[0] instead would report the share of label 0.
        majority = test["id"].value_counts(normalize=True).iloc[0]
        print(f"majority class baseline test accuracy: {round(majority*100,2)} % ")
        preds, model_output = train_predict(iteration, train, test)
        model_outputs.append(model_output.tolist())
        predictions.append(preds.tolist())
        # sklearn metrics take (y_true, y_pred) in that order.
        score = accuracy_score(test["id"], preds)
        f1 = f1_score(test["id"], preds, average="macro")
        scores.append({"num_authors": unique_ints_count, "accuracy": score, "f1": f1})
        print(f"Test accuracy: {score}")
        print(f"Test f1 score: {f1}")

        # Confusion matrix for this run, saved next to the JSON results.
        ConfusionMatrixDisplay.from_predictions(test["id"], preds)
        plt.grid(False)
        plt.title(f'Confusion Matrix {output_file}_{iteration}')
        plt.savefig(f'results/Confusion Matrix {output_file}_{iteration}.png')
        print(50 * "-")

    scores_df = pd.DataFrame(scores)
    scores_df.to_csv(f'scores/scores_{output_file}.csv', index=False)
    output_path = os.path.join("results", f"{output_file}.json")
    # Raw outputs are stored so a separate notebook can evaluate them later.
    output_data = {
        "model_outputs": model_outputs,
        "predictions": predictions,  # model predictions on the test split
        "y_true": y_true  # true labels of the test split
    }
    with open(output_path, "w") as json_file:
        json.dump(output_data, json_file)



# Baseline

## Per User

### Llama

In [5]:
# Llama baseline: the first `users` original authors plus the Llama review file.
# The classifier is asked for twice as many labels as there are original authors.
users = 5
authors_considered = [2 * users]
imdbLlama = pd.read_csv('data/baseline/reviews_Llama_baseline_new.csv')
llama_merged = merge_and_relable(old_data=imdb100, custom_data=imdbLlama, users=users)

In [None]:
# Train and evaluate on the merged Llama baseline frame built in the previous cell.
if __name__ == '__main__':
    freeze_support()
    print("#########START IMDBLlama#########")
    iterate_authors_custom(
        data=llama_merged,
        authors_considered=authors_considered,
        output_file="Llama_results_baseline",
    )

### Gemma

In [14]:
# Gemma baseline: the first `users` original authors plus the Gemma review file.
# The classifier is asked for twice as many labels as there are original authors.
users = 5
authors_considered = [2 * users]
imdbgemma = pd.read_csv('data/baseline/reviews_gemma_baseline_new.csv')
gemma_merged = merge_and_relable(old_data=imdb100, custom_data=imdbgemma, users=users)

In [None]:

# Train and evaluate on the merged Gemma baseline frame built in the previous cell.
if __name__ == '__main__':
    freeze_support()
    # The banner used to say "IMDBLlama" (copy-paste slip); it now names the
    # model this cell actually runs, so the logs are not misleading.
    print("#########START IMDBGemma#########")
    iterate_authors_custom(gemma_merged, authors_considered, "gemma_results_baseline")

### Zephyr

In [16]:
# Zephyr baseline: the first `users` original authors plus the Zephyr review file.
# The classifier is asked for twice as many labels as there are original authors.
users = 5
authors_considered = [2 * users]
imdbzephyr = pd.read_csv('data/baseline/reviews_zephyr_baseline_new.csv')
zephyr_merged = merge_and_relable(old_data=imdb100, custom_data=imdbzephyr, users=users)

In [None]:
# Train and evaluate on the merged Zephyr baseline frame built in the previous cell.
if __name__ == '__main__':
    freeze_support()
    print("#########START IMDBzephyr#########")
    iterate_authors_custom(
        data=zephyr_merged,
        authors_considered=authors_considered,
        output_file="zephyr_results_baseline",
    )

## All users

In [6]:
def custom_data(data, data_to_merge, users, label_dict):
    """Append the first ``users`` generated authors to ``data`` and relabel ids.

    Rows of ``data_to_merge`` whose ``id`` lies in ``[101, 100 + users]`` are
    stacked below ``data``; afterwards every ``id`` in the combined frame is
    passed through ``label_dict`` (ids without an entry stay as they are).

    NOTE: this notebook defines other functions with the same name further
    down; whichever definition cell ran last is the one that gets called.

    Returns:
        A new DataFrame; the row index is not reset.
    """
    last_id = 100 + users
    in_range = data_to_merge['id'].between(101, last_id)
    stacked = pd.concat([data, data_to_merge[in_range]], axis=0)
    # Map the source ids onto the label range expected by the classifier.
    return stacked.assign(id=stacked['id'].replace(label_dict))

In [7]:
# Reload every input of the "All users" baseline runs so this section does not
# depend on frames left over from the per-user cells above.
imdb100 = pd.read_csv('data/imdb100.csv')
# One baseline review file per generator model.
imdbLlama = pd.read_csv('data/baseline/reviews_Llama_baseline_new.csv')
imdbgemma = pd.read_csv('data/baseline/reviews_gemma_baseline_new.csv')
imdbzephyr = pd.read_csv('data/baseline/reviews_zephyr_baseline_new.csv')

In [None]:
users = [1,3,5]
imdb_p = imdb100[["id", "text"]]
# label_dicts[i][m] relabels the generated ids 101.. of model m (0 = Llama,
# 1 = Gemma, 2 = Zephyr) to the block of labels that follows the users[i]
# original authors and the models merged before it.
label_dicts = [
    [{101 + k: n * (m + 1) + k for k in range(n)} for m in range(3)]
    for n in users
]
for idx, user in enumerate(users):
    imdb_n = imdb_p[imdb_p['id'].between(0, user-1)]
    # Fold the three generated sets in one after another.
    merged = imdb_n
    for generated, mapping in zip((imdbLlama, imdbgemma, imdbzephyr), label_dicts[idx]):
        merged = custom_data(merged, generated, user, mapping)
    # Original authors plus three generated sets of the same size.
    authors_considered = [user*4]
    if __name__ == '__main__':
        freeze_support()
        print(f"#########START mixed {user} authors#########")
        iterate_authors_custom(
            data=merged,
            authors_considered=authors_considered,
            output_file=f"mixed_{user}_results_baseline",
        )


In [21]:
def custom_data(data, data_to_merge, users, label):
    """Append one generated author to ``data`` and give it the id ``label``.

    The generated author is selected by ``id == 100 + users`` (101 is the first
    user, 102 the second, ...). After stacking, every occurrence of that id in
    the combined frame is replaced by ``label``.

    NOTE: this redefines ``custom_data`` from an earlier cell with different
    semantics for the last argument (a single label instead of a dict).

    Returns:
        A new DataFrame; the row index is not reset.
    """
    source_id = 100 + users
    picked = data_to_merge.loc[data_to_merge['id'] == source_id]
    stacked = pd.concat([data, picked], axis=0)
    # Map the source id onto the label expected by the classifier.
    return stacked.assign(id=stacked['id'].replace({source_id: label}))

In [None]:
# One 4-way classification per user: the original author (label 0) against
# the Llama (1), Gemma (2) and Zephyr (3) reviews selected for that user.
# Expects the single-label custom_data(data, data_to_merge, users, label)
# variant to be the one currently defined.
users = [1,2,3,4,5]
imdb_p = imdb100[["id", "text"]]


for user in users:
    # .copy() gives an independent frame, so the relabelling below does not
    # assign into a filtered slice of imdb_p (pandas SettingWithCopy pattern).
    imdb_n = imdb_p[imdb_p['id']==(user-1)].copy()
    # The original author always becomes class 0.
    imdb_n['id'] = imdb_n['id'].replace({user-1: 0})
    merged = custom_data(imdb_n, imdbLlama, user, 1)
    merged = custom_data(merged, imdbgemma, user, 2)
    merged = custom_data(merged, imdbzephyr, user, 3)
    # 4 since original + 3 generated
    authors_considered = [4]
    if __name__ == '__main__':
        freeze_support()
        print(f"#########START user comparison {user}#########")
        iterate_authors_custom(merged,authors_considered, f"user_comparison_{user}_results_baseline")


# Best Params

## Per User

### Llama

In [9]:
# Llama, final generations: the first `users` original authors plus the Llama
# review file. The label count is twice the number of original authors.
users = 5
authors_considered = [2 * users]
imdbLlama = pd.read_csv('data/final_gens/reviews_Llama_new.csv')
llama_merged = merge_and_relable(old_data=imdb100, custom_data=imdbLlama, users=users)

In [None]:
# Train and evaluate on the merged Llama final-generation frame from the previous cell.
if __name__ == '__main__':
    freeze_support()
    print("#########START IMDBLlama#########")
    iterate_authors_custom(
        data=llama_merged,
        authors_considered=authors_considered,
        output_file="Llama_results_final_gens",
    )

### Gemma

In [11]:
# Gemma, final generations: the first `users` original authors plus the Gemma
# review file. The label count is twice the number of original authors.
users = 5
authors_considered = [2 * users]
imdbgemma = pd.read_csv('data/final_gens/reviews_gemma_new.csv')
gemma_merged = merge_and_relable(old_data=imdb100, custom_data=imdbgemma, users=users)

In [None]:

# Train and evaluate on the merged Gemma final-generation frame from the previous cell.
if __name__ == '__main__':
    freeze_support()
    print("#########START IMDBGemma#########")
    iterate_authors_custom(
        data=gemma_merged,
        authors_considered=authors_considered,
        output_file="gemma_results_final_gens",
    )

### Zephyr

In [None]:
# Zephyr, final generations: the first `users` original authors plus the Zephyr
# review file. The label count is twice the number of original authors.
users = 5
authors_considered = [2 * users]
imdbzephyr = pd.read_csv('data/final_gens/reviews_zephyr_new.csv')
zephyr_merged = merge_and_relable(old_data=imdb100, custom_data=imdbzephyr, users=users)

In [None]:
# Train and evaluate on the merged Zephyr final-generation frame from the previous cell.
if __name__ == '__main__':
    freeze_support()
    print("#########START IMDBzephyr#########")
    iterate_authors_custom(
        data=zephyr_merged,
        authors_considered=authors_considered,
        output_file="zephyr_results_final_gens",
    )

## All users

In [5]:
def custom_data(data, data_to_merge, users, label_dict):
    """Stack a range of generated authors under ``data`` and remap their ids.

    Only rows of ``data_to_merge`` with ``101 <= id <= 100 + users`` are taken.
    Once stacked, the whole ``id`` column is run through ``label_dict``; ids
    that are not keys of the dict are kept unchanged.

    NOTE: ``custom_data`` is defined more than once in this notebook, and this
    cell switches the name back to the dict-based variant.

    Returns:
        A new DataFrame; the row index is not reset.
    """
    upper_id = 100 + users
    wanted_rows = data_to_merge['id'].between(101, upper_id)
    combined = pd.concat([data, data_to_merge[wanted_rows]], axis=0)
    # Translate source ids into classifier labels.
    relabelled_ids = combined['id'].replace(label_dict)
    return combined.assign(id=relabelled_ids)

In [9]:
# Reload every input of the "All users" runs of this section.
imdb100 = pd.read_csv('data/imdb100.csv')
# NOTE(review): the per-user cells of this section read
# data/final_gens/reviews_*_new.csv, while the three paths below point at
# data/reviews_*.csv (and "lama" rather than "Llama"). Confirm these files
# hold the intended final generations.
imdbLlama = pd.read_csv('data/reviews_lama.csv')
imdbgemma = pd.read_csv('data/reviews_gemma.csv')
imdbzephyr = pd.read_csv('data/reviews_zephyr.csv')

In [None]:
users = [1,3,5]
imdb_p = imdb100[["id", "text"]]
# label_dicts[i][m] relabels the generated ids 101.. of model m (0 = Llama,
# 1 = Gemma, 2 = Zephyr) to the block of labels that follows the users[i]
# original authors and the models merged before it.
label_dicts = [
    [{101 + k: n * (m + 1) + k for k in range(n)} for m in range(3)]
    for n in users
]
for idx, user in enumerate(users):
    imdb_n = imdb_p[imdb_p['id'].between(0, user-1)]
    # Fold the three generated sets in one after another.
    merged = imdb_n
    for generated, mapping in zip((imdbLlama, imdbgemma, imdbzephyr), label_dicts[idx]):
        merged = custom_data(merged, generated, user, mapping)
    # Original authors plus three generated sets of the same size.
    authors_considered = [user*4]
    if __name__ == '__main__':
        freeze_support()
        print(f"#########START mixed {user} authors#########")
        iterate_authors_custom(
            data=merged,
            authors_considered=authors_considered,
            output_file=f"mixed_{user}_results_final_gens",
        )


In [None]:
def custom_data(data, data_to_merge, users, label):
    """Stack a single generated author under ``data`` and relabel it.

    The rows taken from ``data_to_merge`` are those with ``id == 100 + users``
    (101 for the first user, 102 for the second, ...). In the stacked frame
    that id is then replaced by ``label``.

    NOTE: ``custom_data`` is defined more than once in this notebook, and this
    cell switches the name to the single-label variant.

    Returns:
        A new DataFrame; the row index is not reset.
    """
    generated_id = 100 + users
    matches = data_to_merge['id'] == generated_id
    combined = pd.concat([data, data_to_merge[matches]], axis=0)
    # Translate the source id into the classifier label.
    relabelled_ids = combined['id'].replace({generated_id: label})
    return combined.assign(id=relabelled_ids)

In [None]:
# One 4-way classification per user: the original author (label 0) against
# the Llama (1), Gemma (2) and Zephyr (3) reviews selected for that user.
# Expects the single-label custom_data(data, data_to_merge, users, label)
# variant to be the one currently defined.
users = [1,2,3,4,5]
imdb_p = imdb100[["id", "text"]]


for user in users:
    # .copy() gives an independent frame, so the relabelling below does not
    # assign into a filtered slice of imdb_p (pandas SettingWithCopy pattern).
    imdb_n = imdb_p[imdb_p['id']==(user-1)].copy()
    # The original author always becomes class 0.
    imdb_n['id'] = imdb_n['id'].replace({user-1: 0})
    merged = custom_data(imdb_n, imdbLlama, user, 1)
    merged = custom_data(merged, imdbgemma, user, 2)
    merged = custom_data(merged, imdbzephyr, user, 3)
    # 4 since original + 3 generated
    authors_considered = [4]
    if __name__ == '__main__':
        freeze_support()
        print(f"#########START user comparison {user}#########")
        # NOTE(review): the other runs in this section end their output name
        # with "_final_gens"; this one does not. Left unchanged in case a
        # downstream notebook reads the current file name.
        iterate_authors_custom(merged,authors_considered, f"user_comparison_{user}_results")
