In [3]:
import pandas as pd
import unicodedata
import os
import numpy as np

from sklearn.metrics import f1_score
import numpy as np
from scipy import stats

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

In [4]:
test_df = pd.read_csv('../data/test_data.csv', encoding="utf8")

In [5]:
def reencode_and_normalize(text):
    transl_table = dict([(ord(x), ord(y)) for x,y in zip( u"‘’´“”–-",  u"'''\"\"--")])
    fixed_text = text.replace('鈥�', '"').replace('鉂�', '').encode('gb18030').decode('utf8')
    fixed_text = unicodedata.normalize("NFKD", fixed_text)
    fixed_text = fixed_text.translate(transl_table)
    return fixed_text

test_df['Text data'] = test_df['Text data'].apply(reencode_and_normalize)

In [6]:
test_df['Text data'] = test_df['Text data'].str.strip()

In [7]:
test_texts = list(test_df['Text data'].values)

In [8]:
models_path = 'kfold/output'
n_folds = 4

model_names = [d for d in os.listdir(models_path) if os.path.isdir(os.path.join(models_path, d))]

In [9]:
model_names

['deberta-mental-health-v3',
 'roberta-large-v3-maxlen',
 'regression-v1',
 'deberta-large-v3-maxlen',
 'regression-headtail-50',
 'roberta-mental-health-headtail-75',
 'roberta-mental-health-headtail-0',
 'roberta-mental-health-v3-maxlen',
 'regression',
 'roberta-mental-health-headtail-50',
 'roberta-mental-health-headtail-25']

In [10]:
def load_model(model_path):
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

#model, tokenizer = load_model('hypsearch/output/roberta-large-v3-maxlen/trial_4')
#model = model.to(device)

In [11]:
def predict(texts, model, tokenizer, batchsize=64, device=torch.device('cpu')):
    all_logits = []
    for i in range(0, len(texts), batchsize):
        #print(len(all_logits), len(texts)//batchsize)
        batch = texts[i:i+batchsize]
        inputs = tokenizer(batch, truncation=True, padding='max_length', return_tensors='pt', max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        all_logits.append(outputs.logits.detach().cpu())
        
        del inputs
        del outputs

    logits_output = torch.cat(all_logits)
    softmax_output = F.softmax(logits_output, dim=-1)
    preds = softmax_output.argmax(-1)
    
    return preds.numpy(), softmax_output.numpy(), logits_output.numpy()

In [10]:
device = torch.device('cuda:5')
outputs = []
for model_name in model_names:
    fold_outputs = []
    for fold in range(n_folds):
        model_path = os.path.join(models_path, model_name, 'fold_'+str(fold))
        print(model_path)

        model, tokenizer = load_model(model_path)
        model = model.to(device)
        model_outputs = predict(test_texts, model, tokenizer, batchsize=128, device=device)
        fold_outputs.append(model_outputs)
    outputs.append(fold_outputs)

kfold/output/deberta-mental-health-v3/fold_0




kfold/output/deberta-mental-health-v3/fold_1
kfold/output/deberta-mental-health-v3/fold_2
kfold/output/deberta-mental-health-v3/fold_3
kfold/output/roberta-large-v3-maxlen/fold_0
kfold/output/roberta-large-v3-maxlen/fold_1
kfold/output/roberta-large-v3-maxlen/fold_2
kfold/output/roberta-large-v3-maxlen/fold_3
kfold/output/deberta-large-v3-maxlen/fold_0
kfold/output/deberta-large-v3-maxlen/fold_1
kfold/output/deberta-large-v3-maxlen/fold_2
kfold/output/deberta-large-v3-maxlen/fold_3
kfold/output/roberta-mental-health-v3-maxlen/fold_0
kfold/output/roberta-mental-health-v3-maxlen/fold_1
kfold/output/roberta-mental-health-v3-maxlen/fold_2
kfold/output/roberta-mental-health-v3-maxlen/fold_3


In [13]:
id2label = {
    0:"moderate",
    1:"not depression",
    2:"severe"
}

label2id = {v: k for k, v in id2label.items()}

In [18]:
outputs_flatten = []
for m in outputs:
    for f in m:
        outputs_flatten.append(f)
len(outputs_flatten)

16

In [31]:
preds = np.asarray([o[0] for o in outputs_flatten])
softmax = np.asarray([o[1] for o in outputs_flatten])
logits = np.asarray([o[2] for o in outputs_flatten])

In [32]:
def ordered_mean(preds):
    map_ordered = {0:1, 1:0, 2:2}
    unmap_ordered = {1:0, 0:1, 2:2}
    
    pred_ordered = np.vectorize(lambda x: map_ordered[x])(preds)
    pred_ordered_mean = np.mean(pred_ordered, axis=0).round()
    pred_ordered_mean_unmap = np.vectorize(lambda x: unmap_ordered[x])(pred_ordered_mean)
    return pred_ordered_mean_unmap

def mode(preds):
    return stats.mode(preds, axis=0).mode[0]

def logits_mean(logits):
    return np.mean(logits, axis=0).argmax(-1)

In [51]:
results = {
    "ordered_mean": ordered_mean(preds),
    "mode": mode(preds),
    "logits_mean": logits_mean(logits),
}

for name, preds_model in results.items():
    sub_df = test_df.copy()
    sub_df['class_label'] = preds_model
    sub_df['class_label'] = sub_df['class_label'].map(id2label)
    sub_df['pid'] = sub_df['Pid']
    sub_df.to_csv(os.path.join('submissions', name+'.csv'), index=False)
    sub_df[['pid', 'class_label']].to_csv(os.path.join('submissions', 'DeepLearningBrasil_'+name+'.tsv'), sep='\t', index=False)

In [50]:
results_list = list(results.values())
(results_list[1] != results_list[1].copy()).any()

False

In [28]:
sub_df

Unnamed: 0,Pid,Text data,class_label
0,test_id_1,"This is me. Don't get me wrong, it's better th...",moderate
1,test_id_2,I hate that people don't understand that i don...,moderate
2,test_id_3,"But here I am, 24 years old man and doing exac...",not depression
3,test_id_4,I'm trapped inside. Does anyone else get that ...,moderate
4,test_id_5,I read a lot of posts on here of people strugg...,moderate
...,...,...,...
494,test_id_495,I'm 14\nmy mom doesn't take my mental health s...,moderate
495,test_id_496,I was quite shocked at their reactions. I sort...,not depression
496,test_id_497,Lying on my bed..... fantasising another life ...,moderate
497,test_id_498,"I was bullied in elementary school, and I alwa...",moderate


In [None]:
results = {
    "softmax_mean_preds": np.mean(softmax, axis=0).argmax(-1),
    "softmax_max_preds": np.max(softmax, axis=0).argmax(-1),
    "logits_mean_preds": np.mean(logits, axis=0).argmax(-1),
    "logits_max_preds": np.max(logits, axis=0).argmax(-1),
    "preds_mode": stats.mode(preds, axis=0).mode[0],
}

In [None]:
for r in results:
    print(r, f1_score(y_true, results[r], average="macro"))