In [None]:
%pwd

# Create dataset

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
def read_posts_from_user(writing):
    """Collect the fields of every WRITING element into one DataFrame.

    Parameters
    ----------
    writing : iterable
        Elements exposing ``find(tag)`` whose result has a ``.text`` attribute
        (the notebook passes the result of ``soup.find_all('WRITING')``).

    Returns
    -------
    pandas.DataFrame
        One row per element with the columns TITLE, DATE, INFO and TEXT.
        An empty ``writing`` yields an empty frame with those four columns.
    """
    columns = ['TITLE', 'DATE', 'INFO', 'TEXT']
    posts = [{column: wrt.find(column).text for column in columns} for wrt in writing]
    # Build the frame once, after all posts are collected. Creating it inside the
    # loop rebuilt it for every post and left the result undefined for empty input.
    return pd.DataFrame(posts, columns=columns)

In [None]:
def read_posts_from_user_test(writing):
    """Collect the fields of every WRITING element, trimming wrapper characters.

    Same as ``read_posts_from_user`` except that TITLE, DATE and TEXT have their
    first three and last two characters removed (``text[3:-2]``); INFO is kept
    verbatim.
    NOTE(review): the trim presumably strips padding around the values in the
    test XML -- confirm against the actual files.

    Parameters
    ----------
    writing : iterable
        Elements exposing ``find(tag)`` whose result has a ``.text`` attribute.

    Returns
    -------
    pandas.DataFrame
        One row per element with the columns TITLE, DATE, INFO and TEXT.
        An empty ``writing`` yields an empty frame with those four columns.
    """
    posts = []
    for wrt in writing:
        posts.append({
            'TITLE': wrt.find('TITLE').text[3:-2],
            'DATE': wrt.find('DATE').text[3:-2],
            'INFO': wrt.find('INFO').text,
            'TEXT': wrt.find('TEXT').text[3:-2],
        })
    # Build the frame once, after the loop. Creating it inside the loop rebuilt it
    # for every post and left the result undefined for empty input.
    return pd.DataFrame(posts, columns=['TITLE', 'DATE', 'INFO', 'TEXT'])

In [None]:
from bs4 import BeautifulSoup
import os
# Root folder holding one sub-folder (chunk1, chunk2, ...) of positive training subjects.
FOLDER = '/content/drive/MyDrive/erisk/data2018/train/positive_examples'

# Collect one frame per subject file and concatenate once at the end. Growing
# train_data with pd.concat inside the loop re-copies all earlier rows each time.
train_frames = []
# NOTE(review): range(1, 10) visits chunk1..chunk9 only -- confirm there is no chunk10.
for i in range(1, 10):
    chunk = f'{FOLDER}/chunk{i}'
    for file in os.listdir(chunk):
        with open(f'{chunk}/{file}') as fp:
            soup = BeautifulSoup(fp, 'xml')
        writing = soup.find_all('WRITING')
        training_sample = read_posts_from_user(writing)
        # NOTE(review): USER is built from the chunk index, not from the subject
        # file, so every subject in a chunk shares one label. The column is
        # dropped in the next cell, so this is left unchanged.
        training_sample['USER'] = f'eRisk2022-T3_Subject{i}'
        train_frames.append(training_sample)

# Fall back to an empty frame when no file was found, as the original did.
train_data = pd.concat(train_frames) if train_frames else pd.DataFrame([])

In [None]:
# Keep only the post text. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the columns are already gone.
train_data = train_data.drop(columns=['TITLE', 'DATE', 'INFO', 'USER'], errors='ignore')
train_data.head()

In [None]:
# Whitespace-separated golden-truth file without a header row, so columns are numbered 0, 1, ...
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
labels = pd.read_csv('/content/drive/MyDrive/erisk/data2018/test/risk-golden-truth-test.txt', sep=r'\s+', header=None)

In [None]:
labels.head()

In [None]:
# Values of column 0 for the rows whose column 1 equals 1.
# NOTE(review): presumably column 0 is the subject id and column 1 the risk label -- confirm against the truth file.
positive_subjects = labels.loc[labels[1].eq(1), 0].values
positive_subjects

In [None]:
# Collect one frame per positive subject file and concatenate once at the end.
# Growing test_data with pd.concat inside the loop re-copies all earlier rows each time.
test_frames = []
# Set membership is O(1) per file, versus a scan of the array for every filename.
positive_subject_ids = set(positive_subjects)
# NOTE(review): range(1, 10) visits chunk1..chunk9 only -- confirm there is no chunk10.
for i in range(1, 10):
    chunk = f'/content/drive/MyDrive/erisk/data2018/test/Task2_chunk{i}/chunk{i}'
    for file in os.listdir(chunk):
        # The subject id is the part of the file name before the first '_' (extension removed).
        subject = file.split('.')[0].split('_')[0]
        if subject not in positive_subject_ids:
            continue
        with open(f'{chunk}/{file}') as fp:
            soup = BeautifulSoup(fp, 'xml')
        writing = soup.find_all('WRITING')
        training_sample = read_posts_from_user(writing)
        # Label rows with the real subject id. The previous label was built from
        # the chunk index, so every subject in a chunk shared the same USER value.
        training_sample['USER'] = subject
        test_frames.append(training_sample)

# Fall back to an empty frame when no positive subject was found, as the original did.
test_data = pd.concat(test_frames) if test_frames else pd.DataFrame([])

In [None]:
# Keep only the post text. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the columns are already gone.
test_data = test_data.drop(columns=['TITLE', 'DATE', 'INFO', 'USER'], errors='ignore')

In [None]:
test_data.head()

In [None]:
# Stack the training and test posts into one corpus; ignore_index=True renumbers
# the rows 0..n-1 instead of keeping the per-file indices.
all_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
import numpy as np
# Strip surrounding whitespace with the vectorised string accessor. Unlike a
# per-row data.strip() call, it yields NaN instead of raising for non-string entries.
stripped_text = all_data['TEXT'].str.strip()
# Turn blank posts into NaN, then drop every row with a missing value.
all_data = all_data.assign(TEXT=stripped_text.mask(stripped_text.eq(''))).dropna()

In [None]:
all_data

In [None]:
%pip install -U sentence-transformers

In [None]:
# Write the cleaned posts to all_data.csv with a single space as separator;
# header and index are both passed as None.
# NOTE(review): DataFrame.to_csv returns None when given a path, so all_data_txt
# holds None here -- confirm nothing relies on it.
all_data_txt = all_data.to_csv('all_data.csv', header=None, index=None, sep=' ')

In [None]:
# Same export as the CSV cell, but to all_data.txt, the file the training cells read.
# NOTE(review): DataFrame.to_csv returns None when given a path, so all_data_txt
# holds None here -- confirm nothing relies on it.
all_data_txt = all_data.to_csv('all_data.txt', header=None, index=None, sep=' ')

In [None]:
# Fetch the NLTK 'punkt' resource.
# NOTE(review): no other visible cell references nltk -- confirm this download is still needed.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Write the corpus file read by the training cells: one post per line, UTF-8
# (the readers open it with encoding='utf8').
# Opened with 'w' rather than 'a': all_data.txt is already written by the to_csv
# cell, so appending stored every post a second time.
with open('all_data.txt', 'w', encoding='utf8') as f:
    for text in all_data['TEXT']:
        # Collapse internal whitespace and newlines so each post occupies exactly
        # one line, which is what the line-based readers expect.
        f.write(' '.join(str(text).split()) + '\n')

In [None]:
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
TSDAE will be training using these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_tsdae_from_file.py path/to/sentences.txt
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, datasets, losses
import logging
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
import sys
import tqdm

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Train Parameters
model_name = 'distilroberta-base'
batch_size = 8

# Input file path (a text file, each line a sentence).
# The upstream script took this from the command line. Inside a notebook sys.argv
# holds the kernel launcher's arguments, so the old argv check / exit() was
# meaningless here and the path is simply set in the cell.
filepath = 'all_data.txt'

# Optional suffix for the output folder name (e.g. '-myrun'). It used to be taken
# from sys.argv[2], which in a notebook is a kernel argument rather than a run name.
output_name = ''

# Save path to store our model; the timestamp keeps each run in its own folder.
model_output_path = '/content/drive/MyDrive/erisk/output_roberta/train_tsdae{}-{}'.format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))


################# Read the train corpus  #################
# Pick the opener from the file extension: gzip for '.gz', plain open otherwise.
corpus_opener = gzip.open if filepath.endswith('.gz') else open
with corpus_opener(filepath, 'rt', encoding='utf8') as fIn:
    # Strip each line lazily and keep only those with at least 10 characters.
    stripped_lines = (raw_line.strip() for raw_line in tqdm.tqdm(fIn, desc='Read file'))
    train_sentences = [sentence for sentence in stripped_lines if len(sentence) >= 10]


logging.info("{} train sentences".format(len(train_sentences)))

################# Initialize an SBERT model #################

# Transformer encoder loaded from model_name.
word_embedding_model = models.Transformer(model_name)
# Apply **cls** pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

################# Train the model with the TSDAE objective #################
# We wrap our training sentences in the DenoisingAutoEncoderDataset to add deletion noise on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
# drop_last=True discards a final batch smaller than batch_size.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# The decoder is built from the same model_name and tied to the encoder (tie_encoder_decoder=True).
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)


logging.info("Start training")
# Only checkpoint_path is given to fit(); no output_path is passed.
# NOTE(review): confirm the final weights are persisted via the checkpoints under model_output_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True,
    checkpoint_path=model_output_path,
    use_amp=False                #Set to True, if your GPU supports FP16 cores
)

In [None]:
%pip install transformers

In [None]:
"""
This file runs Masked Language Model. You provide a training file. Each line is interpreted as a sentence / paragraph.
Optionally, you can also provide a dev file.
The fine-tuned model is stored in the output/model_name folder.
Usage:
python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]
"""

from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import sys
import gzip
from datetime import datetime

import os

# The upstream script validated its command-line arguments here. Inside a notebook
# sys.argv holds the kernel launcher's arguments, so that check (and its exit())
# is dropped and everything is configured in this cell.
model_name = 'mental/mental-bert-base-uncased'
per_device_train_batch_size = 64

save_steps = 1000               #Save model every 1k steps
num_train_epochs = 1           #Number of epochs
use_fp16 = False                #Set to True, if your GPU supports FP16 operations
max_length = 100                #Max length for a text input
do_whole_word_mask = True       #If set to true, whole words are masked
mlm_prob = 0.15                 #Probability that a word is replaced by a [MASK] token

# Load the model
# SECURITY: the Hugging Face access token is read from the HF_TOKEN environment
# variable instead of being embedded in the notebook. The token that was previously
# hard-coded here must be treated as leaked and revoked.
# When HF_TOKEN is unset, None is passed and the library falls back to its default behaviour.
hf_token = os.environ.get('HF_TOKEN')
model = AutoModelForMaskedLM.from_pretrained(model_name, use_auth_token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)


output_dir = "/content/drive/MyDrive/erisk/output_mhb1_epochs"
print("Save checkpoints to:", output_dir)


##### Load our training datasets

def _read_sentences(path, min_length=10):
    """Return the stripped lines of `path` that have at least `min_length` characters.

    Paths ending in '.gz' are read through gzip; everything is decoded as UTF-8.
    """
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rt', encoding='utf8') as fIn:
        stripped_lines = (line.strip() for line in fIn)
        return [line for line in stripped_lines if len(line) >= min_length]


train_path = 'all_data.txt'
train_sentences = _read_sentences(train_path)

print("Train sentences:", len(train_sentences))

# Optional held-out file, in the same one-sentence-per-line format. It used to be
# taken from sys.argv[3], which in a notebook is a kernel launcher argument; set a
# path here to enable evaluation during training.
dev_path = None
dev_sentences = _read_sentences(dev_path) if dev_path else []

print("Dev sentences:", len(dev_sentences))

#A dataset wrapper, that tokenizes our data on-the-fly
class TokenizedSentencesDataset:
    """Index-addressable dataset that tokenizes sentences when they are requested.

    Parameters
    ----------
    sentences : sequence
        The raw sentences; never modified by this class.
    tokenizer : callable
        Called as ``tokenizer(text, add_special_tokens=True, truncation=True,
        max_length=..., return_special_tokens_mask=True)``.
    max_length : int
        Forwarded to the tokenizer as ``max_length``.
    cache_tokenization : bool, default False
        When True, each string sentence is tokenized at most once and the result
        is reused on later accesses.
    """

    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization
        # Tokenized results live in a private dict keyed by the requested index, so
        # the caller's `sentences` list keeps its original strings.
        self._cache = {}

    def _tokenize(self, text):
        """Run the tokenizer with the fixed options used by this dataset."""
        return self.tokenizer(text, add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)

    def __getitem__(self, item):
        sentence = self.sentences[item]
        if not self.cache_tokenization:
            return self._tokenize(sentence)

        # Entries that are not plain strings are returned untouched.
        if not isinstance(sentence, str):
            return sentence
        if item not in self._cache:
            self._cache[item] = self._tokenize(sentence)
        return self._cache[item]

    def __len__(self):
        return len(self.sentences)

# Training data is tokenized on every access; the dev set (if any sentences were
# loaded) is created with cache_tokenization=True, otherwise dev_dataset is None.
train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
dev_dataset = TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True) if len(dev_sentences) > 0 else None


##### Training arguments

# Choose the masking collator according to do_whole_word_mask; both use mlm_prob
# as the masking probability.
if do_whole_word_mask:
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)

# Evaluation is only switched on ("steps") when a dev dataset exists.
# save_steps is reused as the interval for evaluation, checkpointing and logging.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    evaluation_strategy="steps" if dev_dataset is not None else "no",
    per_device_train_batch_size=per_device_train_batch_size,
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=1,
    prediction_loss_only=True,
    fp16=use_fp16
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

# The tokenizer is saved before training starts, into the same folder as the model.
print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)

trainer.train()

# Save the fine-tuned weights to output_dir once training has finished.
print("Save model to:", output_dir)
model.save_pretrained(output_dir)

print("Training done")

In [None]:
import torch

# Rebuild the architecture named by model_name, then overwrite its weights with a saved checkpoint.
# NOTE(review): the checkpoint path refers to a distilroberta-base run, while the
# MLM cell sets model_name to 'mental/mental-bert-base-uncased'. Confirm they match,
# otherwise load_state_dict will report missing/unexpected keys.
model = AutoModelForMaskedLM.from_pretrained(model_name)
# map_location='cpu' lets the file load on a runtime that lacks the device the
# tensors were saved from; load_state_dict then copies them into the model.
checkpoint = torch.load('/content/output/distilroberta-base-2023-04-07_01-22-09/checkpoint-1000/pytorch_model.bin', map_location='cpu')
model.load_state_dict(checkpoint)

In [None]:
# Print the name of every entry in the model's state dict, one per line.
for param_name in model.state_dict().keys():
    print(param_name)

In [None]:
# Print each key stored in the loaded checkpoint, for comparison with the model's own names.
for weight_name in checkpoint:
    print(weight_name)