In [None]:
pip install -U sentence-transformers

In [None]:
import pandas as pd

## Data loading

In [None]:
# Load the annotated comments. `text_clean` starts out as a plain copy of the
# raw `text` column and is the column every later cell reads.
comments = pd.read_csv('/content/classification_dataset.csv').assign(
    text_clean=lambda frame: frame['text']
)
comments

Unnamed: 0.1,Unnamed: 0,comment_id,entry_id,date,text,author_login,vote_count,receiver,hate_word_counts,is_hateful,annotation,text_clean
0,0,185386257,52292479,2020-09-23 00:18:39,Ty no kurwa że też ja na to nie wpadłem ale ze...,Cybek-Marian,1,atteint,2,1,0,Ty no kurwa że też ja na to nie wpadłem ale ze...
1,3,174804569,49417029,2020-05-14 19:45:44,coś ponad 1 a mniej niż 2,wytrzzeszcz,1,Kosciany,0,0,0,coś ponad 1 a mniej niż 2
2,6,189533891,53440779,2020-11-09 20:41:58,Motor ma już w garażu,piSSowiec39,3,Zagmadfany2,0,0,0,Motor ma już w garażu
3,9,172538589,48804329,2020-04-17 01:00:24,znow robic na tego zlodzieja,ranunculus,3,Graner,0,0,0,znow robic na tego zlodzieja
4,12,185485247,52313979,2020-09-24 11:51:11,kurwa człowieku no do kurwy nędzy chociaż wytn...,Cybek-Marian,0,AgentGRU,3,1,1,kurwa człowieku no do kurwy nędzy chociaż wytn...
...,...,...,...,...,...,...,...,...,...,...,...,...
5816,17458,186254803,52523329,2020-10-04 01:25:55,ale jak to Nocna a ty w gaciach,Paula_pi,2,Graner,0,0,0,ale jak to Nocna a ty w gaciach
5817,17461,190203499,53620979,2020-11-17 20:48:21,mam ledwo 21 lat ja tam z dorosłością mam niew...,Anty_Chryst,0,SkrytyZolw,0,0,0,mam ledwo 21 lat ja tam z dorosłością mam niew...
5818,17464,188107553,53038579,2020-10-25 15:19:22,tylko niech potem nikogo nie zdziwi że protest...,galicjanin,0,muwieszeptem,1,1,0,tylko niech potem nikogo nie zdziwi że protest...
5819,17468,174952993,49460679,2020-05-16 17:17:34,a wiesz co jest najgorsze ze jak Michau bedzie...,niezdiagnozowany,0,Gon70,0,0,1,a wiesz co jest najgorsze ze jak Michau bedzie...


In [None]:
import torch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModelForPreTraining

# HerBERT checkpoint; in this notebook it is only called under torch.no_grad()
# to produce embeddings and is never handed to an optimizer.
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model = AutoModel.from_pretrained("allegro/herbert-base-cased").to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=472.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=906984.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=555571.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=654201076.0, style=ProgressStyle(descri…




## User feature embeddings

In [None]:
from sklearn import preprocessing
import numpy as np

# Per-user tabular features; fully identical rows are dropped before joining.
user_features = pd.read_csv('user_features.csv').drop_duplicates()
# Replace the categorical `color` column with its integer category codes.
user_features.color = user_features.color.astype('category').cat.codes

# Left-join the user features onto every comment, once keyed by the author and
# once by the receiver, then keep only the columns from position 14 onwards.
# NOTE(review): 14 presumably equals the 13 columns of `comments` plus the
# joined `login` key, leaving just the user features - confirm whenever either
# CSV schema changes.
# NOTE(review): the row-by-row alignment with `comments` assumes `login` is
# unique in user_features; duplicate logins would make the merge add rows.
author_features = comments.merge(user_features, left_on='author_login', right_on='login', how='left').iloc[:, 14:]
receivers_features = comments.merge(user_features, left_on='receiver', right_on='login', how='left').iloc[:, 14:]

# Despite the variable name this is a StandardScaler, not a MinMaxScaler.
min_max_scaler = preprocessing.StandardScaler()

# The same scaler object is re-fitted for each matrix, so receivers and authors
# are standardised independently. NaNs left in the scaled output (e.g. users
# with no match in the left join) are replaced by 0 via np.nan_to_num.
receivers_embeddings = np.nan_to_num(min_max_scaler.fit_transform(receivers_features.values))
authors_embeddings = np.nan_to_num(min_max_scaler.fit_transform(author_features.values))

# Keep the feature matrices on the training device as float32 tensors.
receivers_embeddings = torch.tensor(receivers_embeddings).float().to(device)
authors_embeddings = torch.tensor(authors_embeddings).float().to(device)

### User node2vec embeddings

In [None]:
import pickle

# Mapping from user login to a precomputed node2vec vector.
# NOTE(security): pickle.load can execute arbitrary code - only load files
# that were produced by a trusted source.
with open('user_embeddings.p', "rb") as embeddings_file:
    n2v_embeddings_dict = pickle.load(embeddings_file)

# For every comment collect the [author vector, receiver vector] pair, in the
# same row order as `comments`. A login missing from the dict raises KeyError.
n2v_embeddings = []
for author, receiver in comments.loc[:, ['author_login', 'receiver']].values.tolist():
    n2v_embeddings.append([n2v_embeddings_dict[author], n2v_embeddings_dict[receiver]])

# Axis 1 of the stacked array selects author (0) or receiver (1).
n2v_embeddings = np.array(n2v_embeddings)
authors_n2v_embeddings = n2v_embeddings[:, 0, :]
receivers_n2v_embeddings = n2v_embeddings[:, 1, :]

authors_n2v_embeddings = torch.tensor(authors_n2v_embeddings).float().to(device)
receivers_n2v_embeddings = torch.tensor(receivers_n2v_embeddings).float().to(device)

## Comment augmentation

In [None]:
def get_augmented_texts(idx, num_samples=2000):
  """Build synthetic training texts by gluing two random comments together.

  Each synthetic text is made of two fragments. Every fragment is drawn, with
  probability 0.5 each, from the rows whose `annotation` is 2 or from the rows
  whose `annotation` is 1. The synthetic label is 1 as soon as at least one
  fragment came from the `annotation == 2` pool, otherwise 0.

  NOTE(review): this treats annotation 1 as non-offensive and 2 as offensive;
  confirm that matches the encoding of the loaded CSV.

  Args:
    idx: row labels of `comments` (passed to `comments.loc`) that may be used
      as source material, e.g. the training rows of the current fold.
    num_samples: number of synthetic texts to generate.

  Returns:
    (texts, annotations): two lists of length `num_samples`.
  """
  df = comments.loc[idx]

  not_offensive_texts = df.loc[df.annotation == 1, 'text_clean']
  offensive_texts = df.loc[df.annotation == 2, 'text_clean']

  texts, annotations = [], []
  for _ in range(num_samples):
    annotation = 0
    fragments = []
    for _fragment in range(2):
      if np.random.rand() < 0.5:
        annotation = 1
        fragments.append(offensive_texts.sample(n=1).item())
      else:
        fragments.append(not_offensive_texts.sample(n=1).item())

    # Always separate the fragments with exactly one space. Previously only
    # non-offensive fragments were prefixed with a space, which glued words
    # together or left a leading blank depending on the draw order.
    texts.append(' '.join(fragments))
    annotations.append(annotation)

  return texts, annotations


## Bert embeddings

In [None]:
from tqdm import tqdm

def get_embeddings(max_seq_len=50):
  """Embed every comment in `comments.text_clean` with the global `model`.

  Texts are encoded in batches of 20, truncated to `max_seq_len` tokens, and
  each comment is represented by the mean of its non-padding token states.

  NOTE: a later cell rebinds the name `get_embeddings` to a function with the
  signature `(texts, return_sequence=False)`. Cells that call
  `get_embeddings(<int>)` only work while this definition is the active one.

  Args:
    max_seq_len: maximum number of tokens kept per comment.

  Returns:
    Tensor with one row per comment, in the row order of `comments`, on `device`.
  """
  def batch(iterable, n=1):
      # Yield consecutive slices of at most n items.
      l = len(iterable)
      for ndx in range(0, l, n):
          yield iterable[ndx:min(ndx + n, l)]

  batch_size = 20
  all_texts = comments.loc[:, 'text_clean'].tolist()
  # Ceil division so tqdm can show a real progress bar for the generator.
  num_batches = (len(all_texts) + batch_size - 1) // batch_size

  all_embeddings = []
  for b_comments in tqdm(batch(all_texts, batch_size), total=num_batches):

    with torch.no_grad():
      batch_encoding = tokenizer.batch_encode_plus(
            b_comments,
            padding='longest',
            add_special_tokens=True,
            truncation=True, max_length=max_seq_len,
            return_tensors='pt',
        ).to(device)

      emb = model(**batch_encoding)

    token_states = emb[0]
    # Use the tokenizer's own attention mask to drop padding positions rather
    # than comparing input ids against a hard-coded padding id.
    keep = batch_encoding['attention_mask'].bool()
    for i in range(token_states.size()[0]):
      all_embeddings.append(token_states[i, keep[i], :].mean(axis=0)[None, :])

  all_embeddings = torch.cat(all_embeddings, axis=0).to(device)

  return all_embeddings

In [None]:
# One pooled embedding per comment, with texts truncated to 200 tokens.
# NOTE: this call matches the `get_embeddings(max_seq_len)` definition; a later
# cell rebinds `get_embeddings` to a different signature, so re-running this
# cell after that one will not behave the same.
all_embeddings = get_embeddings(200)
# Shift the annotation codes down by one to obtain the class indices.
# NOTE(review): this assumes `annotation` is coded 1/2, while the table shown
# after loading displays 0/1 values - confirm the encoding of the CSV in use.
all_labels = torch.tensor(comments.annotation.values-1).to(device)

In [None]:
# Cache the pooled embeddings on disk, keyed by comment_id, as NumPy arrays.
embeddings_dict = {}
for idx, comment_id in enumerate(comments.comment_id.values):
  embeddings_dict[comment_id] = all_embeddings[idx].to('cpu').numpy()

# Use a context manager so the file is flushed and closed deterministically.
with open("bert_embeddings.p", "wb") as cache_file:
  pickle.dump(embeddings_dict, cache_file)

# BERT embeddings + SVM

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils import class_weight
import gc

# Fixed, seeded 5-fold stratified splitter shared by every check_svm call.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# scikit-learn works on CPU NumPy arrays, so copy the pooled embeddings over.
embeddings = all_embeddings.cpu().numpy()
# NOTE(review): the raw annotation codes are used here, and check_svm treats
# 1 as the offensive class and 0 as the non-offensive class - confirm this
# agrees with the `annotation - 1` shift used for `all_labels`.
labels = comments.annotation.values

def check_svm(X, y, c=1.0, downsampling=True, ratio=1.0):
  """Cross-validate an SVC on (X, y) using the global splitter `skf`.

  Args:
    X: 2-D feature array, one row per sample.
    y: 1-D label array; 1 marks the offensive class, 0 the non-offensive one.
    c: value passed as `C` to `SVC`.
    downsampling: when True, each training fold keeps all offensive samples
      plus a random subset of the non-offensive ones.
    ratio: number of non-offensive samples to keep per offensive sample. The
      request is capped at the number of non-offensive samples available.

  Returns:
    Five lists with one entry per fold, in this order:
    accuracy, recall, precision, f1, macro_f1.
  """
  accuracy, recall, precision, f1, macro_f1 = [], [], [], [], []

  for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    if downsampling:
      offensive_indices = np.where(y_train == 1)[0]
      nonoffensive_indices = np.where(y_train == 0)[0]
      # Sampling without replacement cannot return more items than exist, so
      # cap the request instead of letting np.random.choice raise ValueError.
      sample_size = min(int(ratio * len(offensive_indices)), len(nonoffensive_indices))
      sampled_nonoffensive_indices = np.random.choice(nonoffensive_indices,
                                                      size=sample_size,
                                                      replace=False)
      indices = np.concatenate((offensive_indices, sampled_nonoffensive_indices))
      np.random.shuffle(indices)
      downsampled_y = y_train[indices]
      downsampled_X = X_train[indices, :]
    else:
      downsampled_y, downsampled_X = y_train, X_train
    clf = SVC(C=c)
    clf.fit(downsampled_X, downsampled_y)
    # The held-out fold is always evaluated at its original class balance.
    preds = clf.predict(X_test)
    accuracy.append(accuracy_score(y_test, preds))
    precision.append(precision_score(y_test, preds))
    recall.append(recall_score(y_test, preds))
    f1.append(f1_score(y_test, preds))
    macro_f1.append(f1_score(y_test, preds, average='macro'))

  return accuracy, recall, precision, f1, macro_f1


# Grid search over the SVC regularisation strength `c` and the downsampling
# ratio; `results` maps (max_seq_len, c, ratio) to a mean cross-validated score.
results = {}

for max_seq_len in [150]:
  # NOTE: requires the `get_embeddings(max_seq_len)` definition to be the one
  # currently bound; a later cell rebinds the name to a different signature.
  embeddings = get_embeddings(max_seq_len).cpu().numpy()
  labels = comments.annotation.values
  for c in range(3, 7):
    for ratio in [1.5, 1.7, 2.0, 2.5]:
      svm_result = check_svm(embeddings, labels, c=c, downsampling=True, ratio=ratio)
      # check_svm returns (accuracy, recall, precision, f1, macro_f1), so
      # index -2 is the list of per-fold F1 scores of the positive class.
      results[(max_seq_len, c, ratio)] = np.mean(svm_result[-2])

In [None]:
results

{(150, 3, 1.5): 0.4720466921352434,
 (150, 3, 1.7): 0.4900810850722099,
 (150, 3, 2.0): 0.5003060969753295,
 (150, 3, 2.5): 0.4819306847236334,
 (150, 4, 1.5): 0.4713074676966893,
 (150, 4, 1.7): 0.4938735652213042,
 (150, 4, 2.0): 0.4934706331273041,
 (150, 4, 2.5): 0.4805226350489423,
 (150, 5, 1.5): 0.4745383677347549,
 (150, 5, 1.7): 0.4928043256695811,
 (150, 5, 2.0): 0.4980697638560693,
 (150, 5, 2.5): 0.47942737503481486,
 (150, 6, 1.5): 0.46704066113569603,
 (150, 6, 1.7): 0.47924695172824255,
 (150, 6, 2.0): 0.4848725583449388,
 (150, 6, 2.5): 0.4883330549560226}

## Models

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Three-layer MLP classifier over fixed-size feature vectors.

    Layer sizes are feature_num -> 300 -> 150 -> classes_num, with Softplus
    activations after the two hidden layers and dropout (p=0.5) applied to the
    input of every linear layer.
    """

    def __init__(self, classes_num=2, feature_num=768):
        super().__init__()
        self.feature_num = feature_num

        self.dp = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(feature_num, 300)
        self.fc2 = nn.Linear(300, 150)
        self.fc3 = nn.Linear(150, classes_num)

        self.softplus = nn.Softplus()

    def forward(self, x, text_lengths):
        """Return unnormalised class scores for `x`.

        `x` is viewed as (-1, feature_num). `text_lengths` is not used; it is
        accepted so this model can be called the same way as NetLSTM.
        """
        hidden = x.view(-1, self.feature_num)
        for layer in (self.fc1, self.fc2):
            hidden = self.softplus(layer(self.dp(hidden)))
        return self.fc3(self.dp(hidden))

class NetLSTM(nn.Module):
    """Bidirectional-LSTM classifier over padded token-embedding sequences.

    The final hidden states of the forward and backward directions (32 units
    each) are concatenated and passed through a 64 -> 100 -> 50 -> classes_num
    MLP with Softplus activations and dropout (p=0.5).
    """

    def __init__(self, classes_num=2, feature_num=768):
        super(NetLSTM, self).__init__()
        self.feature_num = feature_num

        self.dp = nn.Dropout(p=0.5)
        # nn.LSTM's own `dropout` argument only acts between stacked layers, so
        # with num_layers=1 it has no effect and only emits a UserWarning. It
        # is left at its default; regularisation comes from `self.dp`.
        self.lstm = nn.LSTM(feature_num, hidden_size=32, batch_first=True, bidirectional=True, num_layers=1)
        self.fc1 = nn.Linear(32*2, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, classes_num)

        self.softplus = nn.Softplus()

    def forward(self, x, text_lengths):
        """Return unnormalised class scores of shape (batch, classes_num).

        Args:
            x: padded sequences, batch first, with `feature_num` features per step.
            text_lengths: 1-D tensor with the true length of every sequence.
        """
        x = self.dp(x)
        # pack_padded_sequence requires the lengths to live on the CPU.
        text_lengths = text_lengths.to('cpu')
        x = torch.nn.utils.rnn.pack_padded_sequence(x, text_lengths, batch_first=True, enforce_sorted=False)

        _, (x, _) = self.lstm(x)
        # The final hidden state has one slice per direction along dim 0;
        # concatenate them feature-wise into a (batch, 2 * hidden) matrix.
        x = torch.cat([x[i] for i in range(x.size()[0])], axis=1)

        # No squeeze here: the tensor is already 2-D, and squeezing would drop
        # the batch dimension whenever a batch holds a single sample.
        x = self.dp(x)

        x = self.softplus(self.fc1(x))
        x = self.dp(x)

        x = self.softplus(self.fc2(x))
        x = self.dp(x)

        x = self.fc3(x)
        return x

## Train and test functions

In [None]:
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, WeightedRandomSampler

def train_func(net, criterion, optimizer, sub_train_):
    """Train `net` for one pass over `sub_train_` and return (loss, accuracy).

    Behaviour is driven by the global `CFG` dict: 'undersample', 'batch_size',
    'num_augmented', 'num_augmented_sample', 'use_lstm', 'user_embeddings',
    'user_embeddings_only' and the optional 'node2vec_embeddings'.

    Args:
        net: model called as `net(features, text_lengths)`.
        criterion: loss called as `criterion(output, labels)`.
        optimizer: optimizer stepped once per batch.
        sub_train_: sequence of (row index, label) pairs; the row index selects
            rows of `comments` and of the precomputed embedding tensors.

    Returns:
        (train_loss / len(sub_train_), train_acc / len(sub_train_)), where
        train_loss is the sum of the per-batch criterion values and train_acc
        is the number of correct predictions.
        NOTE(review): with augmentation enabled the extra samples are counted
        in the numerator of the accuracy but not in the denominator.
    """
    net.train()
    train_loss = 0
    train_acc = 0

    if CFG['undersample']:
      # Draw len(classes) training indices per epoch with a per-sample weight
      # of 0.1 for label 0 and 0.9 for label 1, instead of plain shuffling.
      classes = [x[1].to('cpu').item() for x in sub_train_]
      weights = torch.zeros(len(classes))
      weights[np.array(classes) == 0] = 0.1 
      weights[np.array(classes) == 1] = 0.9 
      sampler = WeightedRandomSampler(weights, len(classes))
      data = DataLoader(sub_train_, batch_size=CFG['batch_size'], sampler=sampler)
    else:
      data = DataLoader(sub_train_, batch_size=CFG['batch_size'], shuffle=True)

    num_augmented = CFG['num_augmented']
    if num_augmented:
      # Generate the pool of synthetic texts once per epoch, using only the
      # rows of this training split as source material.
      train_indexes = [x[0] for x in sub_train_]
      additional_texts, additional_labels = get_augmented_texts(train_indexes, num_augmented)

    for i, (text_vector_ids, cls) in enumerate(data):
        optimizer.zero_grad()

        if CFG['use_lstm']:
          # Sequence model: embed the raw texts on the fly. This needs the
          # `get_embeddings(texts, return_sequence=...)` definition to be bound.
          texts = comments.iloc[text_vector_ids, :].loc[:, 'text_clean'].tolist()
          text_vector, text_lengths = get_embeddings(texts, return_sequence=True)
        elif CFG['user_embeddings']:
          
          # Text embedding concatenated with receiver and author vectors, taken
          # either from node2vec or from the tabular user features.
          if CFG.get('node2vec_embeddings', False):
            r_embeddings = receivers_n2v_embeddings[text_vector_ids]
            a_embeddings = authors_n2v_embeddings[text_vector_ids]
          else:
            r_embeddings = receivers_embeddings[text_vector_ids]
            a_embeddings = authors_embeddings[text_vector_ids]

          text_vector, text_lengths = all_embeddings[text_vector_ids], None
          text_vector = torch.cat([text_vector, r_embeddings, a_embeddings], dim=1)
        elif CFG['user_embeddings_only']:
          # Tabular receiver and author features only, no text embedding.
          text_lengths = None
          text_vector = torch.cat([receivers_embeddings[text_vector_ids], authors_embeddings[text_vector_ids]], dim=1)
        else:
          # Plain precomputed text embeddings.
          text_vector, text_lengths = all_embeddings[text_vector_ids], None

          # Augmentation is only applied in this branch: a random subset of the
          # synthetic texts is embedded and appended to the current batch.
          if num_augmented:
            additional_idx = np.random.choice(np.arange(num_augmented), CFG['num_augmented_sample'])
            additional_embeddings = get_embeddings([additional_texts[a_i] for a_i in additional_idx])
            add_labels = [additional_labels[a_i] for a_i in additional_idx]

            text_vector = torch.cat([text_vector, additional_embeddings], axis=0)
            cls = torch.cat([cls, torch.tensor(add_labels).to(device)], axis=0)

        text_vector, cls = text_vector.to(device), cls.to(device)
        output = net(text_vector, text_lengths)
        loss = criterion(output, cls)

        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(net, criterion, data_):
    """Evaluate `net` on `data_` without updating its weights.

    The feature construction mirrors `train_func` (minus augmentation) and is
    selected through the global `CFG` dict.

    Args:
        net: model called as `net(features, text_lengths)`.
        criterion: loss called as `criterion(output, labels)`.
        data_: sequence of (row index, label) pairs.

    Returns:
        (loss, accuracy, predictions, true_labels). `loss` is the sum of the
        per-batch criterion values divided by len(data_), the same
        normalisation `train_func` uses. `accuracy` is the fraction of correct
        predictions. The two lists hold one entry per sample.
    """
    net.eval()
    total_loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=CFG['batch_size'])

    predictions, true_labels = [], []

    for text_vector_ids, cls in data:

        if CFG['use_lstm']:
          texts = comments.iloc[text_vector_ids, :].loc[:, 'text_clean'].tolist()
          text_vector, text_lengths = get_embeddings(texts, return_sequence=True)
        elif CFG['user_embeddings']:
          text_vector, text_lengths = all_embeddings[text_vector_ids], None

          if CFG.get('node2vec_embeddings', False):
            r_embeddings = receivers_n2v_embeddings[text_vector_ids]
            a_embeddings = authors_n2v_embeddings[text_vector_ids]
          else:
            r_embeddings = receivers_embeddings[text_vector_ids]
            a_embeddings = authors_embeddings[text_vector_ids]

          text_vector = torch.cat([text_vector, r_embeddings, a_embeddings], dim=1)

        elif CFG['user_embeddings_only']:
          text_lengths = None
          text_vector = torch.cat([receivers_embeddings[text_vector_ids], authors_embeddings[text_vector_ids]], dim=1)
        else:
          text_vector, text_lengths = all_embeddings[text_vector_ids], None

        text_vector, cls = text_vector.to(device), cls.to(device)

        with torch.no_grad():
            output = net(text_vector, text_lengths)
            # Accumulate into a dedicated variable. Reusing one name for both
            # the running total and the batch loss made every batch overwrite
            # the total, so only the last batch was reported.
            total_loss += criterion(output, cls).item()

            batch_predictions = output.argmax(1)
            acc += (batch_predictions == cls).sum().item()

            predictions.extend(batch_predictions.tolist())
            true_labels.extend(cls.tolist())

    return total_loss / len(data_), acc / len(data_), predictions, true_labels

In [None]:
def get_embeddings(texts, return_sequence=False, max_length=150):
  """Embed a list of texts with the global `tokenizer` and `model`.

  NOTE: this rebinds the name `get_embeddings`, which an earlier cell defines
  with the signature `(max_seq_len=50)`; after this cell runs, calls of the
  form `get_embeddings(<int>)` no longer reach that earlier function.

  Args:
    texts: list of strings to embed as a single batch.
    return_sequence: when True return the per-token states and the sequence
      lengths; when False return one mean-pooled vector per text.
    max_length: maximum number of tokens kept per text (longer texts are
      truncated).

  Returns:
    If `return_sequence` is True: (token_states, lengths), where `lengths`
    counts the non-padding tokens of every text.
    Otherwise: a tensor with one row per text holding the mean of that text's
    non-padding token states.
  """
  with torch.no_grad():
    batch_encoding = tokenizer.batch_encode_plus(
            texts,
            padding='longest',
            add_special_tokens=True,
            truncation=True, max_length=max_length,
            return_tensors='pt'
        ).to(device)

    token_states = model(**batch_encoding)[0]
    # Rely on the tokenizer's attention mask to identify padding instead of
    # comparing input ids against a hard-coded padding id.
    attention_mask = batch_encoding['attention_mask']

    if return_sequence:
      return token_states.to(device), attention_mask.sum(axis=1)

    keep = attention_mask.bool()
    pooled = [
        token_states[i, keep[i], :].mean(axis=0)[None, :]
        for i in range(token_states.size()[0])
    ]
    return torch.cat(pooled, axis=0).to(device)

In [None]:
def show_stats(reports):
  """Print precision, recall and F1 of class '1' averaged over all folds.

  `reports` maps a fold number (0 .. CFG['fold_num'] - 1) to the dict returned
  by `classification_report(..., output_dict=True)` for that fold.
  """
  positive_rows = [reports[fold]['1'] for fold in range(CFG['fold_num'])]

  precisions = [row['precision'] for row in positive_rows]
  recalls = [row['recall'] for row in positive_rows]
  f1s = [row['f1-score'] for row in positive_rows]

  print(f'Average precision = {np.mean(precisions):.4f}')
  print(f'Average recall = {np.mean(recalls):.4f}')
  print(f'Average f1 = {np.mean(f1s):.4f}')

# Experiments

In [None]:
from torch.optim.lr_scheduler import StepLR

def make_experiment(feature_num=768):
  """Run a stratified k-fold training/evaluation experiment driven by `CFG`.

  For every fold a fresh model (NetLSTM when CFG['use_lstm'] is set, otherwise
  Net) is trained for CFG['epochs'] epochs with Adam, a class-weighted
  cross-entropy loss and a StepLR schedule. The held-out fold is evaluated
  after every epoch; the predictions of the last epoch are the ones kept.

  Args:
    feature_num: width of the input vectors handed to the model.

  Returns:
    (all_folds_true, all_folds_predictions, all_folds_reports): the true
    labels and predictions pooled over all folds, and a dict mapping fold
    number to its `classification_report(..., output_dict=True)`.
  """
  splitter = StratifiedKFold(n_splits=CFG['fold_num'], shuffle=True, random_state=0)
  folds = splitter.split(np.arange(all_labels.size()[0]), comments.annotation.values)

  all_folds_true, all_folds_predictions = [], []
  all_folds_reports = {}

  for fold, (trn_idx, test_idx) in enumerate(folds):
      # Datasets are lists of (row index, label) pairs; features are looked up
      # by row index inside train_func / test.
      train_dataset = list(zip(trn_idx, all_labels[trn_idx]))
      test_dataset = list(zip(test_idx, all_labels[test_idx]))

      model_cls = NetLSTM if CFG['use_lstm'] else Net
      net = model_cls(classes_num=2, feature_num=feature_num).to(device)

      class_weights = torch.tensor(CFG['class_weights'])
      criterion = torch.nn.CrossEntropyLoss(class_weights).to(device)
      optimizer = torch.optim.Adam(net.parameters(), lr=CFG['lr'])
      scheduler = StepLR(optimizer, step_size=50, gamma=0.2)

      for epoch in range(CFG['epochs']):
          start_time = time.time()
          train_loss, train_acc = train_func(net, criterion, optimizer, train_dataset)
          valid_loss, valid_acc, predictions, true_labels = test(net, criterion, test_dataset)

          mins, secs = divmod(int(time.time() - start_time), 60)

          print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
          print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
          print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

          positive_report = classification_report(true_labels, predictions, output_dict=True)['1']
          valid_f1 = positive_report['f1-score']
          valid_recall = positive_report['recall']
          valid_precision = positive_report['precision']
          print(f'\tValid F1: {valid_f1:.4f}')
          print(f'\tValid Recall: {valid_recall:.4f}')
          print(f'\tValid Precision: {valid_precision:.4f}')

          scheduler.step()

      # Only the predictions of the final epoch are recorded for the fold.
      all_folds_reports[fold] = classification_report(true_labels, predictions, output_dict=True)
      all_folds_predictions.extend(predictions)
      all_folds_true.extend(true_labels)

  return all_folds_true, all_folds_predictions, all_folds_reports

## Bert embeddings + MLP Model

In [None]:
# Baseline run: precomputed text embeddings fed to the MLP (`Net`).
# CFG is a global read by make_experiment, train_func, test and show_stats.
CFG =  {
    "batch_size": 150,            # DataLoader batch size for training and evaluation
    'fold_num': 5,                # number of stratified cross-validation folds
    'lr': 5*1e-4,                 # Adam learning rate
    'epochs': 60,                 # training epochs per fold
    'num_augmented': 0,           # synthetic texts generated per epoch (0 disables augmentation)
    'num_augmented_sample': 0,    # synthetic texts appended to every training batch
    'use_lstm': False,            # True selects NetLSTM on token sequences instead of Net
    'user_embeddings':False,      # True concatenates receiver/author vectors to the text embedding
    'user_embeddings_only':False, # True uses only the receiver/author feature vectors
    'class_weights': [0.25, 0.9], # per-class weights passed to CrossEntropyLoss
    'undersample':False           # True draws training batches through a WeightedRandomSampler
}
all_folds_true, all_folds_predictions, all_folds_reports = make_experiment()

In [None]:
  # Fold-averaged metrics of class '1', then the report and confusion matrix
  # computed on the predictions pooled over all folds.
  show_stats(all_folds_reports)
  print(classification_report(all_folds_true, all_folds_predictions))
  print(confusion_matrix(all_folds_true, all_folds_predictions))

Average precision = 0.4414
Average recall = 0.5687
Average f1 = 0.4963
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      5316
           1       0.44      0.57      0.50       640

    accuracy                           0.88      5956
   macro avg       0.69      0.74      0.71      5956
weighted avg       0.89      0.88      0.88      5956

[[4853  463]
 [ 276  364]]


## Bert embeddings + MLP Model + Data augmentation

In [None]:
# Augmentation run: every epoch 300 synthetic texts are generated from the
# training split and 15 randomly chosen ones are embedded and appended to each
# training batch. Batch size, epochs and class weights also differ from the
# baseline configuration.
CFG = {
    "batch_size": 100,
    'fold_num': 5,
    'lr': 5*1e-4,
    'epochs': 25,
    'num_augmented': 300,
    'num_augmented_sample': 15,
    'use_lstm': False,
    'user_embeddings':False,
    'user_embeddings_only':False,
    'class_weights': [0.6, 0.9],
    'undersample':False
}

all_folds_true, all_folds_predictions, all_folds_reports = make_experiment()

In [None]:
  # Fold-averaged metrics of class '1', then the report and confusion matrix
  # computed on the predictions pooled over all folds.
  show_stats(all_folds_reports)
  print(classification_report(all_folds_true, all_folds_predictions))
  print(confusion_matrix(all_folds_true, all_folds_predictions))

Average precision = 0.4777
Average recall = 0.4266
Average f1 = 0.4489
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      5316
           1       0.47      0.43      0.45       640

    accuracy                           0.89      5956
   macro avg       0.70      0.68      0.69      5956
weighted avg       0.88      0.89      0.89      5956

[[5014  302]
 [ 367  273]]


## Bert embeddings + MLP Model + Undersampling

In [None]:
# Resampling run: 'undersample': True makes train_func draw training batches
# through a WeightedRandomSampler (weight 0.1 for label 0, 0.9 for label 1)
# instead of shuffling, and the loss uses equal class weights.
CFG = {    "batch_size": 150,
    'fold_num': 5,
    'lr': 3*1e-4,
    'epochs': 60,
    'num_augmented': 0,
    'num_augmented_sample': 0,
    'use_lstm': False,
    'user_embeddings':False,
    'user_embeddings_only':False,
    'class_weights': [0.9, 0.9],
    'undersample':True
}

all_folds_true, all_folds_predictions, all_folds_reports = make_experiment()

In [None]:
  # Fold-averaged metrics of class '1', then the report and confusion matrix
  # computed on the predictions pooled over all folds.
  show_stats(all_folds_reports)
  print(classification_report(all_folds_true, all_folds_predictions))
  print(confusion_matrix(all_folds_true, all_folds_predictions))

Average precision = 0.2836
Average recall = 0.8063
Average f1 = 0.4193
              precision    recall  f1-score   support

           0       0.97      0.75      0.85      5316
           1       0.28      0.81      0.42       640

    accuracy                           0.76      5956
   macro avg       0.63      0.78      0.63      5956
weighted avg       0.90      0.76      0.80      5956

[[4007 1309]
 [ 124  516]]


## MLP + User embeddings only

In [None]:
# User-features-only run: the model sees only the receiver and author feature
# vectors, concatenated; no text embedding is used.
# NOTE(review): feature_num=20 must equal the combined width of
# receivers_embeddings and authors_embeddings (presumably 10 columns each) -
# confirm against user_features.csv.
CFG = {
    "batch_size": 150,
    'fold_num': 5,
    'lr': 3*1e-4,
    'epochs': 30,
    'num_augmented': 0,
    'num_augmented_sample': 0,
    'use_lstm': False,
    'class_weights': [0.25, 0.9],
    'user_embeddings_only':True,
    'user_embeddings':False,
    'undersample':False
}

all_folds_true, all_folds_predictions, all_folds_reports = make_experiment(feature_num=20)

In [None]:
  # Fold-averaged metrics of class '1', then the report and confusion matrix
  # computed on the predictions pooled over all folds.
  # NOTE: in the recorded output below every sample is predicted as class 0.
  show_stats(all_folds_reports)
  print(classification_report(all_folds_true, all_folds_predictions))
  print(confusion_matrix(all_folds_true, all_folds_predictions))

Average precision = 0.0000
Average recall = 0.0000
Average f1 = 0.0000
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      5316
           1       0.00      0.00      0.00       640

    accuracy                           0.89      5956
   macro avg       0.45      0.50      0.47      5956
weighted avg       0.80      0.89      0.84      5956

[[5316    0]
 [ 640    0]]


  _warn_prf(average, modifier, msg_start, len(result))


## MLP + User embeddings

In [None]:
# Text + user-features run: each text embedding is concatenated with the
# receiver and author feature vectors before entering the MLP.
# NOTE(review): feature_num=768+20 assumes a 768-wide text embedding plus 20
# user-feature columns in total - confirm against the tensors' shapes.
CFG = {
    "batch_size": 150,
    'fold_num': 5,
    'lr': 5*1e-4,
    'epochs': 60,
    'num_augmented': 0,
    'num_augmented_sample': 0,
    'use_lstm': False,
    'class_weights': [0.25, 0.9],
    'user_embeddings_only':False,
    'user_embeddings':True,
    'undersample':False
}

all_folds_true, all_folds_predictions, all_folds_reports = make_experiment(feature_num=768+20)

In [None]:
  # Fold-averaged metrics of class '1', then the report and confusion matrix
  # computed on the predictions pooled over all folds.
  show_stats(all_folds_reports)
  print(classification_report(all_folds_true, all_folds_predictions))
  print(confusion_matrix(all_folds_true, all_folds_predictions))

Average precision = 0.4448
Average recall = 0.5828
Average f1 = 0.5040
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      5316
           1       0.44      0.58      0.50       640

    accuracy                           0.88      5956
   macro avg       0.70      0.75      0.72      5956
weighted avg       0.89      0.88      0.88      5956

[[4850  466]
 [ 267  373]]


## MLP + node2vec embeddings

In [None]:
# Text + node2vec run: 'node2vec_embeddings': True makes the user_embeddings
# branch use the receiver/author node2vec vectors instead of the tabular user
# features.
# NOTE(review): feature_num=768+64 assumes the two node2vec vectors add 64
# columns in total (presumably 32 each) - confirm against the pickle contents.
CFG = {
    "batch_size": 150,
    'fold_num': 5,
    'lr': 3*1e-4,
    'epochs': 60,
    'num_augmented': 0,
    'num_augmented_sample': 0,
    'use_lstm': False,
    'class_weights': [0.3, 0.9],
    'user_embeddings_only':False,
    'user_embeddings':True,
    'node2vec_embeddings':True,
    'undersample':False
}

all_folds_true, all_folds_predictions, all_folds_reports = make_experiment(feature_num=768+64)

In [None]:
  # Fold-averaged metrics of class '1', then the report and confusion matrix
  # computed on the predictions pooled over all folds.
  show_stats(all_folds_reports)
  print(classification_report(all_folds_true, all_folds_predictions))
  print(confusion_matrix(all_folds_true, all_folds_predictions))

Average precision = 0.4825
Average recall = 0.5109
Average f1 = 0.4947
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      5316
           1       0.48      0.51      0.49       640

    accuracy                           0.89      5956
   macro avg       0.71      0.72      0.72      5956
weighted avg       0.89      0.89      0.89      5956

[[4960  356]
 [ 313  327]]


## False positives

'O chuj ale dzisiaj dojebali XD',

'Takie koszulki za 810zl z hm czy innej sieciowki to zajebista sprawa ale jako jednorazowka bardziej albo do ratowania dupy jak sie twoja czyms pobrudzi a nie lubisz wygladac jak bezdomny xD', 

'więc jestem zjebany nawet według vieniasna', 

'Halabala co kobieta sukcesu jak ty robi o tej porze na nocnej', 

'Pamiętaj ze śpisz z milionami roztoczami które z pewnością wejdą Ci w nocy do kutasa', 

'10Warthog nie ma niczego fuj w miłości i nawet by Ci ręki nie starczylo żeby objąć tego bydlaka', 

'a jaki kobiet na świecie pracuje w modelingu Po co się do nich porównywać Analogiczna sytuacja do płaczów na tagu bo nie wygląda się jak chad', 

'no tak trochę jakby rysownik w corelu po pijaku od linijki kreski napierdalał',

## False negatives

'naparz sobie cały dzbanek i napełnij nim siebie dzbanie', 

'za meczet obcięli by ci głowę tu mam nadzieję chociaż o solidny wjeb', 

'do specjalisty to ty powinieneś się się skierować pies tam woli suke ciekawe dlaczego', 

'monarchia konstytucja odwrotnie jak Orban zakaże pedałowania to Szwecja a jak stracę robotę to taki jak w Chile', 

'Ta ale to różowe zostawiają po sobie smród okresu z pizdy', 'Ja znów w Sejmie kurwy nie dadzą mi spokoju XD', 

'no ale leżac na płasko też się nie leży płasko to prawie niemożliwe trzeba były się postarać Ddodatkowo można robić i na placka jak i z nogami przed siebie jak i skos gdzie się nie robi leg drive jak i płaską z nogami w górze wszystko zależy od celu tego jaki RoM chcemy osiągnąć lub dla osób właśnie po urazach bardzo mnie denerwuje jednowymiarowość na wykopie i traktowanie formy trójbojowej jako wyznacznika No pomijam że OP pewnie to spotkał sebixa który nie wie o co chodzi', 

'masz paranoje większe niż ja po piątce palenia ogarnij się', 

'xDDDDDDDDDDDDDDDDD No faktycznie wpierdalać mniej to taki wielki wysiłek', 

'mundi od razu seria zartow o jasiu mi sie przypomniala kot jest moj i bede go pierdolic czy komus sie podoba czy nie xDDD', 

'no i kuj niech mnie jebią', 

'o pa jak to się robi przegrywie walony Hej bejbe chcesz wyskoczyc na jakiegos browarka Na dwór nie można wychodzić ale możemy posiedzieć u mnie w łóżku', 

'widać bo pjerdilusz od rzeczy sprawdź sobie pierwszy wkład to dopiero zmienisz zdanie', 

'Widać i słychać że psychiczna Nie wiadomo czy by nawet jej psychiatryk pomogl'
