In [9]:
#: Try different models

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sinkaf.utils import Preprocessor

stop_words_tr = pd.read_csv("https://raw.githubusercontent.com/ahmetax/trstop/master/dosyalar/turkce-stop-words", header=None)
stop_words_tr = stop_words_tr[0].to_numpy()

def score_model(X, y, X_test = None, y_test = None, estimator = None, **kwargs):
    """
    Test various estimators.
    """
    # y = LabelEncoder().fit_transform(y)

    model = Pipeline([
        ('preprocess', Preprocessor()),
        ('counts', CountVectorizer(min_df=5, max_df=0.99, stop_words = frozenset(stop_words_tr))),
        ('tf_idf', TfidfTransformer()),
        ('estimator', estimator)
    ])

    # Instantiate the classification model and visualizer
    model.fit(X, y, **kwargs)

    # if not used for training
    if (X_test is None):
        return model

    expected  = y
    predicted = model.predict(X)

    expected_test = LabelEncoder().fit_transform(y_test)
    predicted_test = model.predict(X_test)

    # Compute and return F1 (harmonic mean of precision and recall)
    print("\n- {}:\n\nTrain:\n{}\nTest:\n{}".format(estimator.__class__.__name__, \
        classification_report(expected, predicted), classification_report(expected_test, predicted_test)))

In [10]:
#: load data
data = pd.read_csv("sinkaf/data/troff-v1.0.tsv",  sep='\t')
data.head()

Unnamed: 0,id,timestamp,text,label
0,973568937593065472,1520952977415,@USER06095 Hırsız demişken Tuncay sizin şu 1.2...,grp
1,973568937723035648,1520952977446,Ne bileyim sen hastayım deyince bende veterine...,ind
2,973568937911873536,1520952977491,Akşam eve gittiğimizde yorgunluğuma iyi gelece...,grp
3,973568939925090304,1520952977971,Kook’un sesini 18381 kez dinledikten sonra eğe...,prof
4,973568940667539457,1520952978148,@USER05270 @USER04816 o macta adam 6 7 tane ne...,grp


In [11]:
#: create labels
data['label_sinkaf'] = data['label'] != 'non'

data['label_sinkaf'].value_counts()
# 6.8K offensive - kufurlu yorum
# 28K  non-offensive - kufursuz yorum

False    28439
True      6845
Name: label_sinkaf, dtype: int64

In [46]:
#: split the data & find best model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label_sinkaf'])
 
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


models = [
    SVC(gamma='auto'), LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300), MultinomialNB()
]

for model in models:
    score_model(X_train, y_train, X_test, y_test, model)


- SVC:

Train:
              precision    recall  f1-score   support

           0       0.81      1.00      0.89     21340
           1       0.00      0.00      0.00      5123

    accuracy                           0.81     26463
   macro avg       0.40      0.50      0.45     26463
weighted avg       0.65      0.81      0.72     26463

Test:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      7099
           1       0.00      0.00      0.00      1722

    accuracy                           0.80      8821
   macro avg       0.40      0.50      0.45      8821
weighted avg       0.65      0.80      0.72      8821


- LinearSVC:

Train:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     21340
           1       0.94      0.66      0.78      5123

    accuracy                           0.93     26463
   macro avg       0.93      0.82      0.87     26463
weighted avg       0.93      0

In [12]:
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [13]:
#: Use best model (LinearSVC)

# Tum veriyi kullanma
X = data['text']
y = data['label_sinkaf']

# LinearSVC does not have predict_proba function
from sklearn.calibration import CalibratedClassifierCV
clf = CalibratedClassifierCV(LinearSVC()) 

model = score_model(X, y, estimator = clf)

In [16]:
# Offensive? - Kufur mu?
test = ["cok iyi", 
        "bi git", 
        "bi siktir git", 
        "bi defol",
        "mukemmel bir insansin"]

print(model.predict(test))
print(model.predict_proba(test)[:,1])

[False False  True  True False]
[0.08525992 0.44037578 0.98655027 0.77686361 0.12219944]


In [18]:
import joblib

#dump it (higher), dump it (higher!)
joblib.dump(model, "sinkaf/data/model_linearSVC.joblib")

['sinkaf/data/model_linearSVC.joblib']