In [1]:
#imports
from datasets import load_dataset
from thai2transformers.metrics import classification_metrics
from pythainlp.ulmfit import process_thai
import pandas as pd

Corpus: wiki_lm_lstm
- Already up to date.


In [10]:
#parameters
class Args:
    """Static configuration shared by the cells of this notebook."""

    # Identifier handed to `load_dataset`.
    dataset_name_or_path = 'prachathai67k'

    # Column holding the input text.
    feature_col = 'title'

    # One binary target column per topic label.
    label_cols = [
        'politics',
        'human_rights',
        'quality_of_life',
        'international',
        'social',
        'environment',
        'economics',
        'culture',
        'labor',
        'national_security',
        'ict',
        'education',
    ]

    # Name of the model-selection metric.
    metric_for_best_model = 'f1_macro'

    # Random seed.
    seed = 1412


args = Args()

In [3]:
# Load the dataset named in the config; the bare name on the last line
# displays the loaded object (its splits, features and row counts).
dataset = load_dataset(args.dataset_name_or_path)
dataset

Reusing dataset prachathai67k (/Users/admin/.cache/huggingface/datasets/prachathai67k/prachathai67k/1.1.0/2eeb3bfaf307043e606a58f1f2af8b3d6bbf8a2d0b957d7bfafaf1dc1ef4b5ac)


DatasetDict({
    train: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 54379
    })
    validation: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 6721
    })
    test: Dataset({
        features: ['url', 'date', 'title', 'body_text', 'politics', 'human_rights', 'quality_of_life', 'international', 'social', 'environment', 'economics', 'culture', 'labor', 'national_security', 'ict', 'education'],
        num_rows: 6789
    })
})

In [4]:
#nbsvm class
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on features scaled by a
    naive-Bayes style log-count ratio.

    `fit` computes, per feature, the log of the ratio between the smoothed
    mean feature value of the rows labelled 1 and of the rows labelled 0,
    multiplies every row of `x` by that ratio element-wise and trains a
    liblinear `LogisticRegression` on the result.

    Parameters
    ----------
    penalty : str
        Regularisation penalty forwarded to `LogisticRegression`.
    C : float
        Inverse regularisation strength forwarded to `LogisticRegression`.
    dual : bool
        Forwarded to `LogisticRegression(dual=...)`.
    seed : int
        Forwarded to `LogisticRegression(random_state=...)`.

    Notes
    -----
    `predict`, `predict_proba` and `fit` call `x.multiply(...)`, so `x` has to
    be an object providing that method (e.g. a scipy sparse matrix).
    """

    def __init__(self, penalty='l2', C=1.0, dual=False, seed=1412):
        self.penalty = penalty
        self.C = C
        self.dual = dual
        self.seed = seed

    def predict(self, x):
        """Return class predictions for `x` after scaling it by the fitted ratio."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return class probabilities for `x` after scaling it by the fitted ratio."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        """Compute the log-count ratio from (`x`, `y`) and fit the logistic regression.

        `y` may be an ndarray, any array-like (list, pandas Series, ...) or an
        object exposing `toarray()` such as a scipy sparse matrix; it is
        flattened to one dimension. The ratio is computed for the label values
        1 and 0. Returns `self`.
        """
        # Flatten y to 1-D. Only objects that actually provide `toarray` are
        # densified that way; everything else goes through np.asarray, so plain
        # lists / Series / ndarray subclasses no longer raise AttributeError.
        y = y.toarray() if hasattr(y, 'toarray') else np.asarray(y)
        y = y.ravel()
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            # Add-one smoothed per-feature mean over the rows whose label is y_i.
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        # Per-feature log ratio between the label-1 and label-0 statistics.
        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(penalty=self.penalty,
                                       C=self.C,
                                       dual=self.dual,
                                       solver='liblinear',
                                       random_state=self.seed,).fit(x_nb, y)
        return self

In [5]:
# Pull the text column out of each split, in train / validation / test order.
texts_train, texts_valid, texts_test = (
    dataset[split_name][args.feature_col]
    for split_name in ('train', 'validation', 'test')
)

In [6]:
#x
from sklearn.feature_extraction.text import TfidfVectorizer

# Word uni- and bi-gram TF-IDF features, tokenised with `process_thai`.
# The on/off switches are real booleans: recent scikit-learn releases validate
# constructor parameters and reject ints passed for boolean options.
tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=process_thai,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)

# The vocabulary and IDF weights are learned on the training split only;
# validation and test are transformed with the fitted vectorizer.
x_train = tfidf.fit_transform(texts_train)
x_valid = tfidf.transform(texts_valid)
x_test = tfidf.transform(texts_test)
x_train,x_valid,x_test

(<54379x57108 sparse matrix of type '<class 'numpy.float64'>'
 	with 1275794 stored elements in Compressed Sparse Row format>,
 <6721x57108 sparse matrix of type '<class 'numpy.float64'>'
 	with 152058 stored elements in Compressed Sparse Row format>,
 <6789x57108 sparse matrix of type '<class 'numpy.float64'>'
 	with 152024 stored elements in Compressed Sparse Row format>)

In [7]:
#y
import numpy as np
# Build one label matrix per split: stack the label columns as rows, then
# transpose so that each row is an example and each column a label.
y_train, y_valid, y_test = (
    np.array([dataset[split_name][label] for label in args.label_cols]).T
    for split_name in ('train', 'validation', 'test')
)
tuple(labels.shape for labels in (y_train, y_valid, y_test))

((54379, 12), (6721, 12), (6789, 12))

In [8]:
#thresholding
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def best_threshold(y, probs):
    """Find the probability cut-off with the highest F1 score.

    Every cut-off 0.01, 0.02, ..., 0.99 is tried; an example is predicted
    positive when its probability is strictly greater than the cut-off.

    Returns a `(threshold, f1)` pair taken from the top row after sorting the
    candidates by F1 in descending order.
    """
    scored = [
        (step / 100, f1_score(y, (probs > (step / 100)).astype(int)))
        for step in range(1, 100)
    ]
    ranking = pd.DataFrame(scored).sort_values(1, ascending=False).reset_index(drop=True)
    ranking.columns = ['th_label', 'f1_label']
    return ranking.th_label[0], ranking.f1_label[0]

In [141]:
#validation
# Grid search over penalty x C. For every combination one binary model is
# trained per label on the training split, the F1-optimal threshold for that
# label is picked on the validation split, and the per-label F1 scores are
# averaged into `f1_macro`.
hyperparams = []
for p in ['l1','l2']:
    # l1 is fitted with dual=False and l2 with dual=True; scikit-learn documents
    # the dual formulation as only implemented for the l2 penalty with liblinear.
    use_dual = (p == 'l2')
    for c in range(1,5):
        # The seed comes from the config object; a bare `seed` name is never
        # defined in this notebook and would raise NameError on a fresh kernel.
        d = {'penalty':p, 'C':c, 'seed': args.seed}
        for i in range(y_valid.shape[1]):
            model = NbSvmClassifier(penalty=p,
                                    C=c,
                                    dual=use_dual,
                                    seed=args.seed).fit(x_train, y_train[:,i])
            probs = model.predict_proba(x_valid)[:,1]
            d[f'th_label_{i}'], d[f'f1_label_{i}'] = best_threshold(y_valid[:,i],probs)
        #macro f1
        d['f1_macro'] = np.mean([d[f'f1_label_{i}'] for i in range(y_valid.shape[1])])
        hyperparams.append(d)

# Rank the combinations by macro F1; keep the winning row's settings and
# per-label thresholds for the test-set evaluation.
hyperparams_df = pd.DataFrame(hyperparams).sort_values('f1_macro',ascending=False).reset_index(drop=True)
best_hyperparams = hyperparams_df[['penalty','C','seed']+[f'th_label_{i}' for i in range(y_valid.shape[1])]].iloc[0,:].to_dict()
hyperparams_df[['penalty','C','f1_macro']]

Unnamed: 0,penalty,C,f1_macro
0,l2,1,0.61105
1,l2,2,0.607425
2,l2,3,0.60561
3,l2,4,0.601663
4,l1,1,0.59017
5,l1,2,0.585137
6,l1,3,0.578731
7,l1,4,0.574738


In [90]:
#test
# Refit one model per label with the selected hyperparameters and score the
# test split, binarising with the per-label thresholds chosen on validation.
probs = np.zeros((x_test.shape[0], y_test.shape[1]))
preds = np.zeros((x_test.shape[0], y_test.shape[1]))
# Use the same dual setting the validation grid used for this penalty
# (l2 -> dual=True, l1 -> dual=False), so the thresholds are applied to a
# model trained the same way as the one they were tuned on.
best_dual = best_hyperparams['penalty'] == 'l2'
for i in range(y_test.shape[1]):
    model = NbSvmClassifier(penalty=best_hyperparams['penalty'],
                            C=best_hyperparams['C'],
                            dual=best_dual,
                            seed=best_hyperparams['seed']).fit(x_train, y_train[:,i])
    probs[:,i] = model.predict_proba(x_test)[:,1]
    preds[:,i] = (probs[:,i] > best_hyperparams[f'th_label_{i}']).astype(int)

In [91]:
# Display the shapes of the probability, prediction and label matrices.
tuple(matrix.shape for matrix in (probs, preds, y_test))

((6789, 12), (6789, 12), (6789, 12))

In [135]:
#micro
# Pool every (example, label) cell into one flat binary problem and score it.
micro_df = pd.DataFrame.from_dict(
    {
        'accuracy': (preds == y_test).mean(),
        'f1_micro': f1_score(y_test.ravel(), preds.ravel()),
        'precision_micro': precision_score(y_test.ravel(), preds.ravel()),
        'recall_micro': recall_score(y_test.ravel(), preds.ravel()),
    },
    orient='index',
).T

In [138]:
#macro
# Score every label column separately, then average the per-label scores.
test_performances = [
    {
        'f1_macro': f1_score(y_test[:, label_idx], preds[:, label_idx]),
        'precision_macro': precision_score(y_test[:, label_idx], preds[:, label_idx]),
        'recall_macro': recall_score(y_test[:, label_idx], preds[:, label_idx]),
    }
    for label_idx in range(y_test.shape[1])
]
macro_df = pd.DataFrame(pd.DataFrame(test_performances).mean()).T

In [139]:
#test performance
# Put the micro and macro metrics side by side in a single one-row frame.
# `axis` is passed by keyword: the positional form is deprecated and is
# rejected by pandas 2.x.
test_df = pd.concat([micro_df, macro_df], axis=1)
test_df

Unnamed: 0,accuracy,f1_micro,precision_micro,recall_micro,f1_macro,precision_macro,recall_macro
0,0.903913,0.667742,0.627623,0.71334,0.607269,0.589908,0.633342


In [26]:
# Scratch check: a flat dict loaded with orient='index' lands in column 0,
# and converting that column back yields the same mapping.
pd.DataFrame.from_dict({'x': 1, 'y': 2}, orient='index')[0].to_dict()

{'x': 1, 'y': 2}