In [2]:
import simdjson
import itertools
from itertools import repeat
import numpy as np
from math import ceil
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import string
import fasttext as ft
import fasttext.util as ft_util
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
import multiprocessing
import time
from scipy import stats

In [2]:
# Parse the whole corpus file into memory. The cells below read the
# "document_id", "labels", "lemmas" and "tokens" fields of each record.
with open("data/dataset_100.json") as dataset_file:
    dataset = simdjson.load(dataset_file)

In [43]:
# Stop-word lemmas removed before feature extraction (see generate_features).
# Grouped by first letter purely for readability; membership is what matters.
stopwords = {
    "a", "ako", "ali",
    "bi", "bih", "bila", "bili", "bilo", "bio", "bismo", "biste", "biti", "bumo",
    "da", "do", "duž",
    "ga",
    "hoće", "hoćemo", "hoćete", "hoćeš", "hoću",
    "i", "iako", "ih", "ili", "iz",
    "ja", "je", "jedna", "jedne", "jedno", "jer", "jesam", "jesi", "jesmo",
    "jest", "jeste", "jesu", "jim", "joj", "još", "ju",
    "kada", "kako", "kao", "koja", "koje", "koji", "kojima", "koju", "kroz",
    "li",
    "me", "mene", "meni", "mi", "mimo", "moj", "moja", "moje", "mu",
    "na", "nad", "nakon", "nam", "nama", "nas", "naš", "naša", "naše", "našeg",
    "ne", "nego", "neka", "neki", "nekog", "neku", "nema", "netko",
    "neće", "nećemo", "nećete", "nećeš", "neću", "nešto",
    "ni", "nije", "nikoga", "nikoje", "nikoju", "nisam", "nisi", "nismo", "niste", "nisu",
    "njega", "njegov", "njegova", "njegovo", "njemu", "njezin", "njezina", "njezino",
    "njih", "njihov", "njihova", "njihovo", "njim", "njima", "njoj", "nju",
    "no",
    "o", "od", "odmah", "on", "ona", "oni", "ono", "ova",
    "pa", "pak", "po", "pod", "pored", "prije",
    "s", "sa", "sam", "samo", "se", "sebe", "sebi", "si", "smo", "ste", "su",
    "sve", "svi", "svog", "svoj", "svoja", "svoje", "svom",
    "ta", "tada", "taj", "tako", "te", "tebe", "tebi", "ti", "to", "toj", "tome", "tu",
    "tvoj", "tvoja", "tvoje",
    "u", "uz",
    "vam", "vama", "vas", "vaš", "vaša", "vaše", "već", "vi", "vrlo",
    "za", "zar",
    "će", "ćemo", "ćete", "ćeš", "ću",
    "što",
}

In [3]:
# Collect one (document id, label list) pair per document.
# itertools.groupby only merges *consecutive* records, so this assumes the
# dataset is already ordered by document_id -- TODO confirm.
doc_ids, doc_labels = [], []

for doc_id, doc_sentences in itertools.groupby(dataset, lambda x: x["document_id"]):
    doc_ids.append(doc_id)
    # The labels are read from the first record of each document only.
    first_sentence = next(doc_sentences)
    doc_labels.append(first_sentence["labels"])

# Binarise the label lists into a 0/1 indicator matrix, one column per label.
mlb = MultiLabelBinarizer()
mlb.fit(doc_labels)

X = np.array(doc_ids)
Y = mlb.transform(doc_labels)

classes = mlb.classes_

In [4]:
# Hold out 30% of the documents with a single multilabel-stratified shuffle
# split. The fixed random_state makes the split -- and therefore every score
# computed from it further down -- reproducible across kernel restarts.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(msss.split(X, Y))
X_train, X_test = X[train_index], X[test_index]
Y_train, Y_test = Y[train_index], Y[test_index]

# Labels that occur in only one of the two partitions (symmetric difference):
# they can either not be learned or not be evaluated with this split.
train_labels = set(itertools.chain.from_iterable(mlb.inverse_transform(Y_train)))
test_labels = set(itertools.chain.from_iterable(mlb.inverse_transform(Y_test)))
useless = test_labels ^ train_labels

print(len(useless))

645


In [73]:
def generate_features(k, X_train, X_test):
    """Build TF-IDF features from the first ``k`` usable sentences of each document.

    For every document in the global ``dataset`` the sentences are scanned in
    order. A sentence's tokens are kept when the lemma at the same position
    is not a stop word and does not consist solely of digits/punctuation.
    Sentences with fewer than 5 surviving tokens are skipped, and scanning
    stops once ``k`` sentences have been collected. Documents that end up with
    no sentence at all are represented by the placeholder text ``"UNK"``.

    Parameters
    ----------
    k : int
        Maximum number of sentences to keep per document.
    X_train, X_test : iterable
        Document ids of the train and test partitions.

    Returns
    -------
    tuple
        ``(X_train_features, X_test_features)`` as produced by a
        ``TfidfVectorizer`` fitted on the training documents only.
    """
    def is_content_lemma(lemma):
        # Reject lemmas made up only of digits/punctuation, and stop words.
        digits_or_punct_only = all(ch.isdigit() or ch in string.punctuation for ch in lemma)
        return not (digits_or_punct_only or lemma in stopwords)

    text_by_doc = {}

    for doc_id, doc_sentences in itertools.groupby(dataset, lambda x: x["document_id"]):
        kept_sentences = []

        for sentence in doc_sentences:
            if len(kept_sentences) == k:
                break

            # Filter on the lemma, but emit the surface token at that position.
            positions = [pos for pos, lemma in enumerate(sentence["lemmas"]) if is_content_lemma(lemma)]
            content_tokens = [sentence["tokens"][pos] for pos in positions]

            if len(content_tokens) >= 5:
                kept_sentences.append(" ".join(content_tokens))

        text_by_doc[doc_id] = " ".join(kept_sentences) if kept_sentences else "UNK"

    X_train_documents = [text_by_doc[doc_id] for doc_id in X_train]
    X_test_documents = [text_by_doc[doc_id] for doc_id in X_test]

    # Fit the vocabulary/IDF weights on the training texts only.
    vectorizer = TfidfVectorizer()
    X_train_features = vectorizer.fit_transform(X_train_documents)
    X_test_features = vectorizer.transform(X_test_documents)

    return X_train_features, X_test_features

In [98]:
# Quick sanity check: features from the first 4 sentences per document, then
# one balanced LinearSVC for each of the label columns 1..4.
X_train_features, X_test_features = generate_features(4, X_train, X_test)

print(X_train_features.shape)

for label_idx in range(1, 5):
    y_train_col = Y_train[:, label_idx]
    y_test_col = Y_test[:, label_idx]

    clf = LinearSVC(class_weight='balanced', max_iter=10000)
    clf.fit(X_train_features, y_train_col)

    # Train vs. test F1 side by side to expose over-fitting.
    f1_test = f1_score(y_test_col, clf.predict(X_test_features), zero_division=0)
    f1_train = f1_score(y_train_col, clf.predict(X_train_features), zero_division=0)
    print(f1_train, f1_test)

(7307, 43271)
1.0 0.0
1.01.0 0.0
1.0 0.5
1.0 0.5


In [99]:
def train_classifier(i, X_train_features, Y_train, X_test_features, Y_test):
    """Fit a binary LinearSVC for label column ``i`` and return its test predictions.

    One classifier is trained per value in the ``C`` grid; the predictions of
    the one with the highest test F1 are returned.

    Parameters
    ----------
    i : int
        Index of the label column in ``Y_train`` / ``Y_test`` to model.
    X_train_features, X_test_features
        Feature matrices for the train and test documents.
    Y_train, Y_test : numpy.ndarray
        2-D label indicator matrices; only column ``i`` is used.

    Returns
    -------
    numpy.ndarray
        Predicted labels for the test documents, shape ``(Y_test.shape[0],)``.
        If fitting/scoring raises ``ValueError`` an all-zero vector is
        returned instead, i.e. the label is predicted for no document.
    """
    params = {
        "C": [1]
    }

    # NOTE(review): C is selected on the test fold itself, which is optimistic
    # as soon as the grid holds more than one value.
    best_f1 = None
    best_predictions = None
    try:
        for C in params["C"]:
            clf = LinearSVC(class_weight='balanced', max_iter=10000, C=C)
            clf.fit(X_train_features, Y_train[:, i])

            Y_test_pred = clf.predict(X_test_features)
            f1 = f1_score(Y_test[:, i], Y_test_pred, zero_division=0)

            # The original stored the winning C via an undefined name (`c`),
            # raising NameError on the first iteration; only the best score
            # and its predictions are needed, so track those directly.
            if best_f1 is None or best_f1 < f1:
                best_f1 = f1
                best_predictions = Y_test_pred
    except ValueError:
        # Deliberate best-effort fallback: presumably hit when the training
        # column contains a single class and the SVM cannot be fitted.
        best_predictions = np.zeros((Y_test.shape[0],))

    return best_predictions

In [None]:
# Sweep the number of sentences kept per document (1..15). For each setting,
# train one classifier per label in parallel and record macro/micro F1,
# precision and recall plus the wall-clock time of training + scoring.
scores = []

# The context manager shuts the 10 worker processes down when the sweep
# finishes or is interrupted; a bare Pool(...) leaves them running after
# every (re-)execution of this cell.
with multiprocessing.Pool(10) as pool:
    for i in range(1, 16):
        X_train_features, X_test_features = generate_features(i, X_train, X_test)

        # Timer starts after feature generation, so it covers training and scoring only.
        start = time.perf_counter()

        # One task per label column; every task receives the same matrices.
        tasks = zip(
            range(Y_test.shape[1]),
            repeat(X_train_features),
            repeat(Y_train),
            repeat(X_test_features),
            repeat(Y_test),
        )
        # Re-assemble the per-label prediction vectors as columns.
        Y_test_pred = np.stack(pool.starmap(train_classifier, tasks), axis=-1)

        f1_macro = f1_score(Y_test, Y_test_pred, average='macro', zero_division=0)
        f1_micro = f1_score(Y_test, Y_test_pred, average='micro', zero_division=0)

        precision_macro = precision_score(Y_test, Y_test_pred, average='macro', zero_division=0)
        precision_micro = precision_score(Y_test, Y_test_pred, average='micro', zero_division=0)

        recall_macro = recall_score(Y_test, Y_test_pred, average='macro', zero_division=0)
        recall_micro = recall_score(Y_test, Y_test_pred, average='micro', zero_division=0)

        scores.append((f1_macro, f1_micro, precision_macro, precision_micro, recall_macro, recall_micro, time.perf_counter() - start))

        print(scores[-1])

Process ForkPoolWorker-1303:
Traceback (most recent call last):
  File "/home/scurkovic/.conda/envs/research/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/scurkovic/.conda/envs/research/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/scurkovic/.conda/envs/research/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/home/scurkovic/.conda/envs/research/lib/python3.8/multiprocessing/queues.py", line 358, in get
    return _ForkingPickler.loads(res)
_pickle.UnpicklingError: invalid load key, '\x00'.
Process ForkPoolWorker-1371:
Traceback (most recent call last):
  File "/home/scurkovic/.conda/envs/research/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/scurkovic/.conda/envs/research/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
 