In [20]:
import os
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#from wykop_scraper.hatebase import hate_words, curse_words

In [35]:
import os 
import urllib.request

if not os.path.exists('kgr10.plain.lower.skipgram.dim300.neg10.bin'):
  file_path = 'kgr10.plain.lower.skipgram.dim300.neg10.bin'
  urllib.request.urlretrieve('https://nextcloud.clarin-pl.eu/index.php/s/luubhnS0AvjmtQc/download?path=%2F&files=kgr10.plain.lower.skipgram.dim300.neg10.bin', file_path)

In [3]:
with open('data/poleval_train_text.txt') as f:
    train_text = f.read().split('\n')[:-1]

with open('data/poleval_train_tags.txt') as f:
    train_tags = f.read().split('\n')[:-1]
    
with open('data/poleval_test_text.txt') as f:
    test_text = f.read().split('\n')[:-1]

with open('data/poleval_test_tags.txt') as f:
    test_tags = f.read().split('\n')[:-1]

In [4]:
train_data = pd.DataFrame({'tags': train_tags, 'text': train_text})
train_data['tags'] = train_data['tags'].astype('int')

test_data = pd.DataFrame({'tags': test_tags, 'text': test_text})
test_data['tags'] = test_data['tags'].astype('int')

In [5]:
train_data.head(), test_data.head()

(   tags                                               text
 0     0  Dla mnie faworytem do tytułu będzie Cracovia. ...
 1     0  @anonymized_account @anonymized_account Brawo ...
 2     0  @anonymized_account @anonymized_account Super,...
 3     0  @anonymized_account @anonymized_account Musi. ...
 4     0    Odrzut natychmiastowy, kwaśna mina, mam problem,
    tags                                               text
 0     0  @anonymized_account Spoko, jak im Duda z Moraw...
 1     0  @anonymized_account @anonymized_account Ale on...
 2     0  @anonymized_account No czy Prezes nie miał rac...
 3     0  @anonymized_account @anonymized_account Przeci...
 4     0  @anonymized_account @anonymized_account Owszem...)

# Baseline models 

In [None]:
class BaselineModel:
    def __init__(self, dictionary: list):
        self.dictionary = dictionary
        
    def fit(self, X, y):
        pass
    
    def get_params(self, deep=True):
        return {'dictionary': self.dictionary}
    
    def predict(self, X):
        """X: a list of strings"""
        return np.reshape(np.array([1 if any(word in text for word in self.dictionary) else 0 for text in X]), (-1,))

In [None]:
baseline = BaselineModel(hate_words)
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
X, y = test_data.text.to_numpy(), test_data.tags.to_numpy()

In [None]:
def train_and_return_results(model, train, test) -> dict:
    
    X, y = train_data.text.to_numpy(), train_data.tags.to_numpy()
    
    X_test, y_test = test_data.text.to_numpy(), test_data.tags.to_numpy()
    model.fit(X, y)
    prediction = model.predict(X_test)
    
    return {
        'test_accuracy': accuracy_score(y_test, prediction),
        'test_precision': precision_score(y_test, prediction),
        'test_recall': recall_score(y_test, prediction),
        'test_f1': f1_score(y_test, prediction),
        'test_f1_macro': f1_score(y_test, prediction, average='macro')
    }

In [None]:
results = train_and_return_results(baseline, train_data, test_data)
accuracy = results['test_accuracy']
precision = results['test_precision']
recall = results['test_recall']
f1 = results['test_f1']
macro_f1 = results['test_f1_macro']
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'Macro F1-Score: {macro_f1}')

Accuracy: 0.863
Precision: 0.42857142857142855
Recall: 0.06716417910447761
F1-Score: 0.11612903225806451
Macro F1-Score: 0.5209371448553195


In [None]:
baseline2 = BaselineModel(hate_words + curse_words)
results = train_and_return_results(baseline2, train_data, test_data)
accuracy = results['test_accuracy']
precision = results['test_precision']
recall = results['test_recall']
f1 = results['test_f1']
macro_f1 = results['test_f1_macro']
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'Macro F1-Score: {macro_f1}')

Accuracy: 0.88
Precision: 0.625
Recall: 0.26119402985074625
F1-Score: 0.3684210526315789
Macro F1-Score: 0.6510613550450712


In [7]:
!git clone https://github.com/facebookresearch/fastText.git
!pip install ./fastText

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.23 MiB | 8.65 MiB/s, done.
Resolving deltas: 100% (2416/2416), done.
Processing ./fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3032434 sha256=ffef303b6c9936eb9e25670a75de558fe1e9e2ba25ddc6e4cd6397728596816e
  Stored in directory: /tmp/pip-ephem-wheel-cache-x44qi8dc/wheels/a1/9f/52/696ce6c5c46325e840c76614ee5051458c0df10306987e7443
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [8]:
import fasttext

Fasttext embeddings


In [9]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz

--2021-01-13 22:07:57--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503081312 (4.2G) [application/octet-stream]
Saving to: ‘cc.pl.300.bin.gz’


2021-01-13 22:13:27 (13.1 MB/s) - ‘cc.pl.300.bin.gz’ saved [4503081312/4503081312]



In [10]:
!7za x cc.pl.300.bin.gz


7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs AMD EPYC 7B12 (830F10),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 4503081312 bytes (4295 MiB)

Extracting archive: cc.pl.300.bin.gz
--
Path = cc.pl.300.bin.gz
Type = gzip
Headers Size = 24

  0% - cc.pl.300.bin                      1% - cc.pl.300.bin                      2% - cc.pl.300.bin                      3% - cc.pl.300.bin                      4% - cc.pl.300.bin                      5% - cc.pl.300.bin                      6% - cc.pl.300.bin                      7% - cc.pl.300.bin             

In [11]:
fasttext_model = fasttext.load_model('cc.pl.300.bin')

In [12]:
train_embeddings = np.zeros((train_data.shape[0], 300))
for index, row in tqdm.tqdm(train_data.iterrows(), total=train_data.shape[0]):
            vector = fasttext_model.get_sentence_vector(row['text'])
            train_embeddings[index, :] = vector

100%|██████████| 10041/10041 [00:02<00:00, 4374.79it/s]


In [13]:
test_embeddings = np.zeros((test_data.shape[0], 300))
for index, row in tqdm.tqdm(test_data.iterrows(), total=test_data.shape[0]):
            vector = fasttext_model.get_sentence_vector(row['text'])
            test_embeddings[index, :] = vector

100%|██████████| 1000/1000 [00:00<00:00, 3999.97it/s]


In [21]:
def check_svm(c=1.0, downsampling=True, ratio=1.0):
    accuracy, recall, precision, f1, macro_f1 = [], [], [], [], []
    X_train, X_test = train_embeddings, test_embeddings
    y_train, y_test = train_data.tags.to_numpy(), test_data.tags.to_numpy()
    if downsampling:
      offensive_indices = np.where(y_train == 1)[0]
      nonoffensive_indices = np.where(y_train == 0)[0]
      sampled_nonoffensive_indices = np.random.choice(nonoffensive_indices,
                                                        size=int(ratio * len(offensive_indices)),
                                                        replace=False)
      indices = np.concatenate((offensive_indices, sampled_nonoffensive_indices))
      np.random.shuffle(indices)
      downsampled_y = y_train[indices]
      downsampled_X = X_train[indices, :]
    else:
      downsampled_y, downsampled_X = y_train, X_train
    clf = SVC(C=c)
    clf.fit(downsampled_X, downsampled_y)
    preds = clf.predict(X_test)
    accuracy.append(accuracy_score(y_test, preds))
    precision.append(precision_score(y_test, preds))
    recall.append(recall_score(y_test, preds))
    f1.append(f1_score(y_test, preds))
    macro_f1.append(f1_score(y_test, preds, average='macro'))
    return accuracy, recall, precision, f1, macro_f1

In [22]:
accuracy, recall, precision, f1, macro_f1 = check_svm(10.0, False)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.89
Avg precision: 0.7857142857142857
Avg recall: 0.2462686567164179
Avg F1-Score: 0.375
Avg macro F1-Score: 0.6573464912280702


In [23]:
accuracy, recall, precision, f1, macro_f1 = check_svm(10.0, True, 1)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.825
Avg precision: 0.41832669322709165
Avg recall: 0.7835820895522388
Avg F1-Score: 0.5454545454545455
Avg macro F1-Score: 0.7185477061638053


In [24]:
accuracy, recall, precision, f1, macro_f1 = check_svm(1.0, True, 1)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.841
Avg precision: 0.44813278008298757
Avg recall: 0.8059701492537313
Avg F1-Score: 0.576
Avg macro F1-Score: 0.7390769230769231


In [25]:
accuracy, recall, precision, f1, macro_f1 = check_svm(1.0, True, 2)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.888
Avg precision: 0.5774647887323944
Avg recall: 0.6119402985074627
Avg F1-Score: 0.5942028985507247
Avg macro F1-Score: 0.7646188506674738


In [29]:
def check_nn(downsampling=True, ratio=1.0, weighting=True, weights_dict=None,
             batch_size=200, epochs=50, only_one=False):
    accuracy, recall, precision, f1, macro_f1 = [], [], [], [], []
  
    X_train, X_test = train_embeddings, test_embeddings
    y_train, y_test = train_data.tags.to_numpy(), test_data.tags.to_numpy()
    if downsampling:
          offensive_indices = np.where(y_train == 1)[0]
          nonoffensive_indices = np.where(y_train == 0)[0]
          sampled_nonoffensive_indices = np.random.choice(nonoffensive_indices,
                                                          size=int(ratio * len(offensive_indices)),
                                                          replace=False)
          indices = np.concatenate((offensive_indices, sampled_nonoffensive_indices))
          np.random.shuffle(indices)
          downsampled_y = y_train[indices]
          downsampled_X = X_train[indices, :]    
    else:
        downsampled_y, downsampled_X = y_train, X_train
    if weighting and weights_dict is None:
        class_weights = class_weight.compute_class_weight('balanced',
                                                        np.unique(downsampled_y),
                                                        downsampled_y)
        weights_dict = dict(enumerate(class_weights))

    keras_nn = keras.Sequential([layers.Dense(128, activation="relu"),
                                 layers.Dropout(0.5),
                                 layers.Dense(32, activation="relu"),
                                 layers.Dense(1, activation='sigmoid')])
    opt = keras.optimizers.Adam(0.001)
    keras_nn.compile(loss='binary_crossentropy', optimizer=opt, 
                     metrics=[tf.keras.metrics.Recall(name="recall"), tf.keras.metrics.Precision(name="prec")])
    if only_one:
      keras_nn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                   validation_data=(X_test, y_test), class_weight=weights_dict)
    if weighting:
      keras_nn.fit(X_train, y_train, batch_size=batch_size, 
                   class_weight=weights_dict, epochs=epochs, verbose=0)
    else:
      keras_nn.fit(X_train, y_train, batch_size=batch_size,
                   epochs=epochs, verbose=0)
    preds = np.round(keras_nn.predict(X_test))
    accuracy.append(accuracy_score(y_test, preds))
    precision.append(precision_score(y_test, preds))
    recall.append(recall_score(y_test, preds))
    f1.append(f1_score(y_test, preds))
    macro_f1.append(f1_score(y_test, preds, average='macro'))
    return accuracy, recall, precision, f1, macro_f1

In [30]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=False, weighting=True, 
                                           weights_dict=None, batch_size=200, epochs=50)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.888
Avg precision: 0.5733333333333334
Avg recall: 0.6417910447761194
Avg F1-Score: 0.6056338028169014
Avg macro F1-Score: 0.7701828687744181


In [31]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=False, weighting=True, 
                                           weights_dict={0: 0.2, 1: 1.0}, batch_size=200, epochs=50)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.896
Avg precision: 0.625
Avg recall: 0.5597014925373134
Avg F1-Score: 0.5905511811023623
Avg macro F1-Score: 0.7654932308719142


In [32]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=True, ratio=1.0,
                                           weighting=False, batch_size=100, epochs=30)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.897
Avg precision: 0.8974358974358975
Avg recall: 0.26119402985074625
Avg F1-Score: 0.4046242774566473
Avg macro F1-Score: 0.6741238519193473


In [33]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=True, ratio=2.0,
                                           weighting=False, batch_size=100, epochs=30)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.901
Avg precision: 0.7692307692307693
Avg recall: 0.373134328358209
Avg F1-Score: 0.5025125628140703
Avg macro F1-Score: 0.7237715507018714


In [34]:
del fasttext_model

KGR Embeddings

In [36]:
fasttext_model = fasttext.load_model('kgr10.plain.lower.skipgram.dim300.neg10.bin')

In [37]:
train_embeddings = np.zeros((train_data.shape[0], 300))
for index, row in tqdm.tqdm(train_data.iterrows(), total=train_data.shape[0]):
            vector = fasttext_model.get_sentence_vector(row['text'])
            train_embeddings[index, :] = vector

100%|██████████| 10041/10041 [00:03<00:00, 2835.07it/s]


In [38]:
test_embeddings = np.zeros((test_data.shape[0], 300))
for index, row in tqdm.tqdm(test_data.iterrows(), total=test_data.shape[0]):
            vector = fasttext_model.get_sentence_vector(row['text'])
            test_embeddings[index, :] = vector

100%|██████████| 1000/1000 [00:00<00:00, 3072.40it/s]


In [40]:
accuracy, recall, precision, f1, macro_f1 = check_svm(10.0, False)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.886
Avg precision: 0.8571428571428571
Avg recall: 0.1791044776119403
Avg F1-Score: 0.2962962962962963
Avg macro F1-Score: 0.6171361786160481


In [41]:
accuracy, recall, precision, f1, macro_f1 = check_svm(1, True, 1)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.841
Avg precision: 0.4458874458874459
Avg recall: 0.7686567164179104
Avg F1-Score: 0.5643835616438356
Avg macro F1-Score: 0.7335679276109086


In [42]:
accuracy, recall, precision, f1, macro_f1 = check_svm(1, True, 2)
print(f'Avg accuracy: {np.array(accuracy).mean()}')
print(f'Avg precision: {np.array(precision).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.9
Avg precision: 0.6287878787878788
Avg recall: 0.6194029850746269
Avg F1-Score: 0.6240601503759399
Avg macro F1-Score: 0.7831950117508304


In [39]:
# Keras NN

In [43]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=False, weighting=True, 
                                           weights_dict=None, batch_size=200, epochs=50)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.882
Avg precision: 0.5512820512820513
Avg recall: 0.6417910447761194
Avg F1-Score: 0.593103448275862
Avg macro F1-Score: 0.7620488001613228


In [44]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=True, ratio=1.0,
                                           weighting=False, batch_size=100, epochs=30)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.899
Avg precision: 0.8367346938775511
Avg recall: 0.30597014925373134
Avg F1-Score: 0.4480874316939891
Avg macro F1-Score: 0.6962506503544243


In [45]:
acc, recall, prec, f1, macro_f1 = check_nn(downsampling=True, ratio=2.0,
                                           weighting=False, batch_size=100, epochs=30)
print(f'Avg accuracy: {np.array(acc).mean()}')
print(f'Avg precision: {np.array(prec).mean()}')
print(f'Avg recall: {np.array(recall).mean()}')
print(f'Avg F1-Score: {np.array(f1).mean()}')
print(f'Avg macro F1-Score: {np.array(macro_f1).mean()}')

Avg accuracy: 0.896
Avg precision: 0.8571428571428571
Avg recall: 0.26865671641791045
Avg F1-Score: 0.40909090909090906
Avg macro F1-Score: 0.6760366826156301
