In [1]:
!pip install empath

Collecting empath
  Downloading empath-0.89.tar.gz (57 kB)
Building wheels for collected packages: empath
  Building wheel for empath (setup.py): started
  Building wheel for empath (setup.py): finished with status 'done'
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57824 sha256=661d561750728a829354f42116f35e301b57e40abe8e1df2637d491620d8de96
  Stored in directory: c:\users\dsmoljan\appdata\local\pip\cache\wheels\5b\58\77\7eed8eef4c6be0cca8920ac319d916811537a37407da220bf1
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89


In [2]:
root = "/content/drive/MyDrive/TAR/"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
import pdb
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
import random


from sklearn.metrics import classification_report


from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

SAVE_PATH = "models"
directory_path = "test-mlm"

MAX_LEN = 512
TEST_BATCH_SIZE = 1

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

class DreadditData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        #self.targets = self.data.label
        self.max_len = max_len
        self.data_id = dataframe.id

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        
        data_id = self.data_id[index]

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'data_id': torch.tensor(data_id, dtype=torch.int)
        }

class RobertaClass(torch.nn.Module):
    def __init__(self, add_train = True):
        super(RobertaClass, self).__init__()
        if (add_train is True):
            print("Initializing additionaly trained roberta model")
            self.roberta_layer = RobertaModel.from_pretrained(directory_path)
        else:
            print("Initializing basic roberta model")
            self.roberta_layer = RobertaModel.from_pretrained('roberta-base')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    # TODO: mozda train metodu stavi ovdje?
    def forward(self, input_ids, attention_mask, token_type_ids):
        roberta_out = self.roberta_layer(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = roberta_out[0]
        pooler = hidden_state[:, 0]
        s = self.pre_classifier(pooler)
        h = torch.nn.ReLU()(s)
        h_dropout = self.dropout(h)
        logits = self.classifier(h_dropout)
        return logits
    
# metoda na osnovu prediction dataloadera vraća rječnik u kojem
# je predikcija zadanog modela sa id-om posta
def predict(model, prediction_loader):
    prediction_dict = dict()
    #test_acc_list = []
    for i, data in tqdm(enumerate(prediction_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        #targets = data['targets'].to(device, dtype = torch.long)
        
        post_ids = data['data_id'].cpu().numpy()
        outputs = model(ids, mask, token_type_ids).squeeze()
        #pdb.set_trace()
        
        big_idx = torch.max(outputs.data)
        # y_true = targets.detach().cpu().numpy()
        if (big_idx > 0.5):
          y_pred = 1
        else:
          y_pred = 0
        # print("\ny_true", y_true)
        # print("y_pred", y_pred)

        post_id = post_ids[0]
        prediction_dict.update({post_id:y_pred})
    
    #print("Accuracy on prediction: ", np.nanmean(test_acc_list))    
    return prediction_dict

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [3]:
def roberta_predict(df):
  test_params = {'batch_size': 10,
              'shuffle': True,
                  'num_workers': 0
              }
      
  model = RobertaClass(True)
  model.load_state_dict(torch.load("roberta_cl_model.chkpt")) 
  model.to(device)
  tokenizer = RobertaTokenizer.from_pretrained(directory_path, padding = True, truncation=True, do_lower_case=True)
  text = "stres"
  id = 1
  pred_data = df
  #pred_data = pd.read_csv('/content/drive/MyDrive/TAR/to_predict.csv')
  #pred_data = pred_data[['text', 'id']]

  #print(pred_data.head)
          
  pred_set = DreadditData(pred_data, tokenizer, MAX_LEN)

  prediction_loader = DataLoader(pred_set, **test_params)

  predictions = predict(model, prediction_loader)
  return (list(predictions.values())[0])


In [16]:
predictions

{0: 1}

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import random
from empath import Empath
from sklearn.preprocessing import StandardScaler

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)

set_seed(1)
lexicon = Empath()

def prepare_data(dataset_path_train = None,):
    if (dataset_path_train is None):
        raise RuntimeException("Error! Dataset must be provided")
    train = pd.read_csv(dataset_path_train)
    y_train = train['label']
    X_train = train[['roberta_prediction', 'text']]
    return X_train, y_train

def empathfeats(X, train=False, sclr=None):
    rows = []
    for t in X['text']:
        empath = lexicon.analyze(t, normalize=True)
        rows.append(pd.DataFrame([empath]))
    df = pd.concat(rows, ignore_index=True)
    X = pd.concat([X,df], axis=1)
    
    X = X.drop(columns=['text'])
    to_drop = ['health','banking','night','college','exasperation','reading','worship','eating','water','legend','neglect','swimming','love','sympathy','vehicle','disgust','sound','sailing','warmth','fun','joy','affection','lust','shame','anger','car','technology','power','white_collar_job','party','cleaning','competing','ocean','contentment','musical']
    X = X.drop(columns=to_drop)
    
    if(train): 
        sclr = StandardScaler()
        X = sclr.fit_transform(X)
    else:
        X = sclr.transform(X)
    return X, sclr

def robertafeat(text):
    df = pd.DataFrame({'id': [0],'text': [text]})
    prediction = roberta_predict(df)
    return prediction
    
def train_model():
    X_train, y_train = prepare_data(dataset_path_train = 'train_pred_mental_health.csv')
    X_train, sclr = empathfeats(X_train, train=True)
    hp = {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
    model = LogisticRegression(max_iter=1000000, **hp)
    model.fit(X_train, y_train)
    return model, sclr

def make_prediction(text, model, sclr):
    trans, sclr = empathfeats(pd.DataFrame({'roberta_prediction': [robertafeat(text)], 'text': [text]}), sclr=sclr)
    return model.predict(trans)

def final_prediction(text):
    global model, sclr
    return make_prediction(text, model, sclr)[0]

model, sclr = train_model()

In [18]:
from IPython.display import HTML
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

while(True):
    txt = input()
    pred = final_prediction(txt)
    if pred == 0:
        print("Not stressful")
    else:
        print("Stressful")

 I hate my life, i want to die


Initializing additionaly trained roberta model


Some weights of the model checkpoint at test-mlm were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at test-mlm and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  3.82it/s]


Stressful


 I love my life


Initializing additionaly trained roberta model


Some weights of the model checkpoint at test-mlm were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at test-mlm and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  6.02it/s]


Not stressful


 I've been staying off social media and have avoided everything related to news about the Texas massacre, so that I don't go crazy. Well outside of knowing the basic details anyway (that there was a shooting a school in Texas and children were killed). It may sound selfish to avoid the news, but I can't handle it...I'm completely at my limit with all the back to back traumatic things happening across the world.


Initializing additionaly trained roberta model


Some weights of the model checkpoint at test-mlm were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at test-mlm and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  2.57it/s]


Stressful
