In [1]:
import copy
import joblib
import numpy as np
import pandas as pd
import re
import string
import torch

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
# Helper methods
def train_nn(X_train, y_train):
    clf = MLPClassifier(
        hidden_layer_sizes=(256, 64, 16), max_iter=150, activation='relu', 
        solver='adam', alpha=1e-5, early_stopping=True)
    clf.fit(X_train, y_train)
    return clf

def print_results(X_train, X_test, y_train, y_test):
    print("Train acc:\t{0:.3f}".format(clf.score(X_train, y_train)))
    print("Test acc:\t{0:.3f}".format(clf.score(X_test, y_test)))
    print(classification_report(y_train, clf.predict(X_train)))
    print(classification_report(y_test, clf.predict(X_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))

In [3]:
labels = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')['label'] != 'non'
print(labels.value_counts())

False    28439
True      6845
Name: label, dtype: int64


In [4]:
# Onceden egitilmis Turkce BERT modeli kullanilarak deney derlemi vektore cevirilmistir
# https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing
# Dosyayi indirip sinkaf/data'ya kopyalayiniz 
bert_data = pd.read_csv("sinkaf/data/bert_data.csv", header=None)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(bert_data, labels)

In [6]:
# Az olan siniftaki degerlerden ornek uretimi
smote = SMOTE(sampling_strategy=1)
X_train, y_train = smote.fit_sample(X_train, y_train)

In [7]:
# basit NN egitimi
clf = train_nn(X_train, y_train)

In [8]:
# Deney setinde %97, test setinde %82'a yakin basari olmasina ragmen
# model kufur iceren cumlelerde dusuk basari gostermektedir. Test setindeki
# veri dengesizligi sebebi ile basarisi yuksek cikmaktadir
print_results(X_train, X_test, y_train, y_test)

Train acc:	0.979
Test acc:	0.817
              precision    recall  f1-score   support

       False       0.98      0.98      0.98     21392
        True       0.98      0.98      0.98     21392

    accuracy                           0.98     42784
   macro avg       0.98      0.98      0.98     42784
weighted avg       0.98      0.98      0.98     42784

              precision    recall  f1-score   support

       False       0.88      0.89      0.89      7047
        True       0.55      0.52      0.53      1774

    accuracy                           0.82      8821
   macro avg       0.71      0.70      0.71      8821
weighted avg       0.81      0.82      0.82      8821

[[6293  754]
 [ 858  916]]


In [9]:
#Veriyi direkt olarak kullanma
X_train, X_test, y_train, y_test = train_test_split(bert_data, labels)
clf = train_nn(X_train, y_train)
print_results(X_train, X_test, y_train, y_test)

Train acc:	0.868
Test acc:	0.853
              precision    recall  f1-score   support

       False       0.88      0.97      0.92     21347
        True       0.79      0.43      0.56      5116

    accuracy                           0.87     26463
   macro avg       0.83      0.70      0.74     26463
weighted avg       0.86      0.87      0.85     26463

              precision    recall  f1-score   support

       False       0.87      0.97      0.91      7092
        True       0.74      0.39      0.51      1729

    accuracy                           0.85      8821
   macro avg       0.80      0.68      0.71      8821
weighted avg       0.84      0.85      0.83      8821

[[6852  240]
 [1054  675]]


In [9]:
joblib.dump(clf, "sinkaf/data/clf_nn_precision.joblib")

['sinkaf/data/clf_nn_precision.joblib']

In [10]:
# Kufurlu veriyi aza ornekleme
# Undersampling non offensive data
undersampler = RandomUnderSampler()
bert_under, labels_under = undersampler.fit_resample(bert_data, labels)
print(f"Normal length: {len(bert_data)}, Undersampled length: {len(bert_under)}")

Normal length: 35284, Undersampled length: 13690


In [20]:
# Aza orneklenen model denemesi
X_train, X_test, y_train, y_test = train_test_split(bert_under, labels_under, stratify=labels_under)
clf = train_nn(X_train, y_train)
print_results(X_train, X_test, y_train, y_test)

Train acc:	0.812
Test acc:	0.758
              precision    recall  f1-score   support

       False       0.82      0.80      0.81      5133
        True       0.81      0.82      0.81      5134

    accuracy                           0.81     10267
   macro avg       0.81      0.81      0.81     10267
weighted avg       0.81      0.81      0.81     10267

              precision    recall  f1-score   support

       False       0.76      0.76      0.76      1712
        True       0.76      0.75      0.76      1711

    accuracy                           0.76      3423
   macro avg       0.76      0.76      0.76      3423
weighted avg       0.76      0.76      0.76      3423

[[1306  406]
 [ 423 1288]]


In [12]:
# Final model hazirlanmasi
# Aza orneklenen veri kullanilmistir
# Rastgele secilen datalardan maximum basariya sahip olan model kullanilmistir
undersampler = RandomUnderSampler()
best_clf = None
best_acc = 0
for i in range(15):
    bert_under, labels_under = undersampler.fit_resample(bert_data, labels)
    X_train, X_test, y_train, y_test = train_test_split(bert_under, labels_under, stratify=labels_under)
    clf = train_nn(X_train, y_train)
    acc = clf.score(bert_under, labels_under)
    print(acc)
    if acc > best_acc:
        best_acc = acc
        best_clf = copy.deepcopy(clf)

0.806281957633309
0.7807888970051132
0.7836376917457999
0.8073046018991965
0.7657414170927684
0.7983929875821768
0.8069393718042367
0.7966398831263696
0.7837107377647918
0.7783783783783784
0.7810080350620892
0.7761139517896275
0.7613586559532506
0.798027757487217
0.8130021913805697


In [13]:
print(best_acc)
joblib.dump(best_clf, "sinkaf/data/clf_nn_recall.joblib")

0.8130021913805697


['sinkaf/data/clf_nn_recall.joblib']

In [14]:
# Pre-trained BERT kullanarak vektore cevirme
# Tek seferlik bir yukleme yapicak
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

In [15]:
# Cumleleri vektore cevirirken eksik kisimlarda maximum uzunluga gore padding yapilmistir
# Max uzunluk sinkaf datasi icin 113
MAX_SENTENCE_TOKEN_LENGTH = 113

In [16]:
# Offensive? - Kufur mu?

test = [
    "guzel karisin ha", 
    "cok guzelsin", 
    "yaz transfer sezonuna lionel messi damga vuracak gibi gözüküyor", 
    "dal sarkar kartal kalkar",
    "amk cocugu",
    "aq bebesindeki havaya bak sen",
    "kral cocuk bizim alper",
    "erol bulut istifa",
    "sen kendini ne saniyorsun kopek"]

In [17]:
tokenized = [tokenizer.encode(s, add_special_tokens=True) for s in test]
padded = np.array([s + [0]*(MAX_SENTENCE_TOKEN_LENGTH-len(s)) for s in tokenized])
input_ids = torch.tensor(np.array(padded)).to(torch.int64)

In [18]:
# Cumleleri vektore cevirme
def sentence_2_vec(input_id):
    with torch.no_grad():
        last_hidden_states = bert(input_id)
        features = last_hidden_states[0][:,0,:].numpy()
    return features

In [19]:
test_vector = sentence_2_vec(input_ids)
clf.predict(test_vector)

array([ True, False, False, False,  True,  True, False, False,  True])