In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score,recall_score
from sklearn.model_selection import train_test_split
# Helper Functions 
# How to encode Categorical variables: https://data-newbie.tistory.com/90
def ordinal_encoding(df):
    enc = OrdinalEncoder()
    return enc.fit_transform(df[cat_col].values)
def onehot_encoding(df):
    enc = OneHotEncoder(handle_unknown='ignore')
    return enc.fit_transform(df[cat_col].values)
def _eval(model, X, y):    
    pred_y = model.predict(X)
    prob = model.predict_proba(X)
    true_y = y
    acc = accuracy_score(y_pred=pred_y,y_true=true_y)
#     auroc = roc_auc_score(y_true=y, y_score=prob,multi_class='ovr') # standard ovr auroc
    f1 = f1_score(y_pred=pred_y,y_true=true_y,average='macro')
    precision, recall = precision_score(y_pred=pred_y,y_true=true_y,average='macro'),recall_score(y_pred=pred_y,y_true=true_y,average='macro')
    #,auroc
    return acc,f1,precision,recall
## SETTINGS and VARIABLES 

enc_method = 'onehot'
# Split categorical variables and numerical variables
cat_col = ['AgencyCode','RecipientCode','RegionCode']
num_col = ['randomNumCol1','randomNumCol2']
# Basic classifiers
model_names = ['SVM','RF','MLP','NB'] # SVM , Random Forest,' Multi Layer Perceptron' ,'Naive Bayes'
model_name='RF'
# LOAD DATASET
data = pd.read_csv("data/crs_final_df_kor.csv", header = 0)
# add 
data[['AgencyCode','RecipientCode','RegionCode','IncomegroupCode','FlowCode','Bi_Multi']]

### add random numerical column (since I dont have more numerical columns in this data)
data['randomNumCol1'] = [np.random.randint(1,10) for i in range(data.shape[0])]
data['randomNumCol2'] = [np.random.rand() for i in range(data.shape[0])]
### 

# Split categorical variables and numerical variables
cat_col = ['AgencyCode','RecipientCode','RegionCode']
num_col = ['randomNumCol1','randomNumCol2']

if enc_method =='ordinal':
    x_cat = ordinal_encoding(data)
elif enc_method=='onehot':
    x_cat = onehot_encoding(data)
else:
    raise

x_num = data[num_col].values
X = np.hstack([x_cat.toarray(),x_num])
# print(x_cat.shape,x_num.shape,X.shape)
y = data['PurposeCode'].values
y_set = list(set(y))
y = [y_set.index(i) for i in y]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# MAIN
if model_name=='SVM':
    model = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)) # SVM
elif model_name=='RF':
    model = RandomForestClassifier(n_estimators=5,random_state=42) # RF
elif model_name=='MLP':
    model = MLPClassifier(random_state=42, max_iter=300) # MLP
elif model_name == 'NB':
    model = GaussianNB() #NB 
elif model_name=='XGB':
    model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train,y_train)
acc,f1,prec,recall = _eval(model,X_train,y_train)
print('train:',model_name,acc,f1,prec,recall)
acc,f1,prec,recall = _eval(model,X_test,y_test)
print('test:',model_name,acc,f1,prec,recall)


In [4]:
X.shape, X_train.shape

((70269, 183), (56215, 183))

# Dataloader

In [None]:
from dataloader import *

data = load_data()
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4"
# available_gpu = [0,1,2,3,4]
data['PurposeCode'] = data['PurposeCode'].apply(lambda x: int(float(x)))
n_cls = list(data['PurposeCode'].unique())
random.seed(0)
np.random.seed(0)
train_corpus, train_targets, test_corpus, test_targets ,val_corpus, val_targets = train_test_split (data,0.75,0.25,0.0,n_cls)


# BERT model

In [2]:
from transformers import BertTokenizer, BertModel
class Model(nn.Module):
    def __init__(self,  bert,num_cls):
        super(Model, self).__init__()
        self.dim = 768
        self.encoder = BertModel.from_pretrained(bert)
        # self.fc = nn.Linear(self.dim, num_cls)
        self.hidden = 100
        self.mlp_projection =  nn.Sequential(nn.Linear(self.dim,self.hidden),
                                             nn.ReLU(),
                                             nn.Linear(self.hidden,self.hidden,bias=True))
        self.mlp_prediction =  nn.Sequential(nn.Linear(self.dim,self.hidden),
                                             nn.ReLU(),
                                             nn.Linear(self.hidden,num_cls,bias=True))
        #nn.Linear(self.dim,self.hidden), nn.ReLU(),nn.Linear(self.hidden,num_cls)
    def forward(self, input_ids, attention_mask,ce=False):
        output = self.encoder(input_ids = input_ids, attention_mask = attention_mask)
        embedding = output['pooler_output']
        if ce:
            return self.mlp_prediction(embedding)
        else:
            return self.mlp_projection(embedding)
class CRSdataset(Dataset):
    def __init__(self, model_name, targets, text_list, max_len = 512):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.data = []
        self.max_len=max_len
        self.targets = targets
        for text in tqdm(text_list):
            org_input = self.tokenizer(text, padding='max_length', truncation=True,
                                       max_length=self.max_len, return_tensors='pt')
            org_input['input_ids'] = torch.squeeze(org_input['input_ids'])
            org_input['attention_mask'] = torch.squeeze(org_input['attention_mask'])
            self.data.append(org_input)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx],self.targets[idx]

In [3]:
bsz = 8
n_class = len(data['PurposeCode'].unique())
model_name = 'bert-base-uncased'
model = Model(bert=model_name, num_cls = n_class)
available_gpu = [0,3]
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model,device_ids=available_gpu) # device_ids=device_ids
# model.cpu()
model.cuda()
model.load_state_dict(torch.load('../bert_0816.pth'),strict=True)
testds = CRSdataset(model_name=model_name,targets=test_targets,text_list=test_corpus)
testloader = DataLoader(testds, batch_size=bsz, shuffle=False, num_workers=1,drop_last=True)
model.eval()
import sklearn.metrics
with torch.no_grad():
    tbar= tqdm(testloader)
    all_outputs = []
    test_y = []
    loss = []
    for inputs, targets in tbar:
        input_ids = inputs['input_ids'].long().cuda()
        attention_mask = inputs['attention_mask'].long().cuda()
        output = model(input_ids=input_ids,attention_mask=attention_mask,ce=True)
        all_outputs.append(output.cpu())
        test_y.append(targets)
    all_outputs = torch.cat(all_outputs)
    all_losses = torch.mean(torch.tensor(loss))
    test_y = torch.cat(test_y)
    val_preds = all_outputs.softmax(dim=1)
    pred_y = val_preds.argmax(axis=1) 
    print(pred_y.shape,test_y.shape)
    # val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
    acc = sklearn.metrics.accuracy_score(y_pred=pred_y,y_true=test_y)
    f1 = sklearn.metrics.f1_score(y_true=test_y,y_pred=pred_y,average='macro')
    auc = sklearn.metrics.roc_auc_score(y_true=test_y,y_score=val_preds,multi_class='ovr')
    prec = sklearn.metrics.precision_score(y_true=test_y,y_pred=pred_y,average='macro')
    recall = sklearn.metrics.recall_score(y_true=test_y,y_pred=pred_y,average='macro')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15135 [00:00<?, ?it/s]

  0%|          | 0/1891 [00:00<?, ?it/s]

torch.Size([15128]) torch.Size([15128])


In [4]:
print('bert:',acc,f1,prec,recall)

bert: 0.8972104706504495 0.8782385829877718 0.8875509801412697 0.8765621279373174


# Electra model

In [5]:


bsz = 8
n_class = len(data['PurposeCode'].unique())
model_name = 'google/electra-small-discriminator'
model = Model_electra_wofreeze(model_name=model_name, num_cls = n_class)
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model,device_ids=available_gpu) # device_ids=device_ids
# model.cpu()
model.cuda()
model.load_state_dict(torch.load('electra_model_0816.pth'),strict=True)
testds = CRSdataset(model_name=model_name,targets=test_targets,text_list=test_corpus)
testloader = DataLoader(testds, batch_size=bsz, shuffle=False, num_workers=1,drop_last=True)
model.eval()
import sklearn.metrics
with torch.no_grad():
    tbar= tqdm(testloader)
    all_outputs = []
    test_y = []
    loss = []
    for inputs, targets in tbar:
        input_ids = inputs['input_ids'].long().cuda()
        attention_mask = inputs['attention_mask'].long().cuda()
        output = model(input_ids=input_ids,attention_mask=attention_mask,ce=True)
        all_outputs.append(output.cpu())
        test_y.append(targets)
    all_outputs = torch.cat(all_outputs)
    all_losses = torch.mean(torch.tensor(loss))
    test_y = torch.cat(test_y)
    val_preds = all_outputs.softmax(dim=1)
    pred_y = val_preds.argmax(axis=1) 
    print(pred_y.shape,test_y.shape)
    # val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
    acc = sklearn.metrics.accuracy_score(y_pred=pred_y,y_true=test_y)
    f1 = sklearn.metrics.f1_score(y_true=test_y,y_pred=pred_y,average='macro')
    auc = sklearn.metrics.roc_auc_score(y_true=test_y,y_score=val_preds,multi_class='ovr')
    prec = sklearn.metrics.precision_score(y_true=test_y,y_pred=pred_y,average='macro')
    recall = sklearn.metrics.recall_score(y_true=test_y,y_pred=pred_y,average='macro')
print('electra:',acc,f1,prec,recall)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
Th

  0%|          | 0/15135 [00:00<?, ?it/s]

  0%|          | 0/1891 [00:00<?, ?it/s]

torch.Size([15128]) torch.Size([15128])
electra: 0.8972765732416711 0.8740787824992978 0.8856782858958924 0.872673920728798


In [6]:
from sklearn.metrics import top_k_accuracy_score
import torch
from torch import tensor
from sklearn.metrics import f1_score, accuracy_score, jaccard_score,precision_score,recall_score
# sourcecode from  https://gist.github.com/weiaicunzai/2a5ae6eac6712c70bde0630f3e76b77b
def top_k_eval(logits, y, k : int = 1):
    """
    logits : (bs, n_labels)
    y : (bs,)
    """
    labels_dim = 1
    assert 1 <= k <= logits.size(labels_dim)
    k_labels = torch.topk(input = logits, k = k, dim=labels_dim, largest=True, sorted=True)[1]

    # True (#0) if `expected label` in k_labels, False (0) if not
    a = ~torch.prod(input = torch.abs(y.unsqueeze(labels_dim) - k_labels), dim=labels_dim).to(torch.bool)
    
    # These two approaches are equivalent
    if False :
        y_pred = torch.empty_like(y)
        for i in range(y.size(0)):
            if a[i] :
                y_pred[i] = y[i]
            else :
                y_pred[i] = k_labels[i][0]
        #correct = a.to(torch.int8).numpy()
    else :
        a = a.to(torch.int8)
        y_pred = a * y + (1-a) * k_labels[:,0]
        #correct = a.numpy()

#     f1 = f1_score(y_pred, y, average='weighted')*100
    #acc = sum(correct)/len(correct)*100
    acc = accuracy_score(y_pred, y)#*100
    f1 = sklearn.metrics.f1_score(y_true=y,y_pred=y_pred,average='macro')
    prec = sklearn.metrics.precision_score(y_true=y,y_pred=y_pred,average='macro')
    recall = sklearn.metrics.recall_score(y_true=y,y_pred=y_pred,average='macro')
    return acc, f1,prec,recall

topk_results = []
for k in [1,3,5]:
    topk_acc,topk_f1,topk_prec,topk_recall = top_k_eval(logits=val_preds,y=test_y,k=k)
#     print(topk_acc,topk_acc2,topk_prec,topk_recall,topk_f1)
    topk_results.append([topk_acc,topk_f1,topk_prec,topk_recall,k])
topk_results = pd.DataFrame(topk_results,columns=['acc','f1','prec','recall','k']).set_index('k')

In [7]:
topk_results

Unnamed: 0_level_0,acc,f1,prec,recall
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.897277,0.874079,0.885678,0.872674
3,0.957496,0.944474,0.950573,0.940359
5,0.970518,0.957421,0.962017,0.954112


# BiLSTM, CNN model

In [None]:
from dataloader_cnn_bilstm import *
batch_size = 512
valid_loader, test_y, x_cv, le =  load_data_cnn_bilstm(data_path = "data/crs_final_df_kor.csv")


In [4]:
model = torch.load('results/textcnn_model')
model.eval()        
avg_val_loss = 0.
val_preds = np.zeros((len(x_cv),len(le.classes_))) 
# keep/store predictions
with torch.no_grad(): # You should use no_grad session when you evaluate the model. (It is much faster!)
    for i, (x_batch, y_batch) in enumerate(valid_loader): 
        y_pred = model(x_batch).detach()
        val_preds[i * batch_size:(i+1) * batch_size] =F.softmax(y_pred, dim=1).cpu().numpy()

# Check Accuracy
pred_y = val_preds.argmax(axis=1) 
import sklearn.metrics
# val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
acc = sklearn.metrics.accuracy_score(y_pred=pred_y,y_true=test_y)
f1 = sklearn.metrics.f1_score(y_true=test_y,y_pred=pred_y,average='macro')
auc = sklearn.metrics.roc_auc_score(y_true=test_y,y_score=val_preds,multi_class='ovr')
prec = sklearn.metrics.precision_score(y_true=test_y,y_pred=pred_y,average='macro')
recall = sklearn.metrics.recall_score(y_true=test_y,y_pred=pred_y,average='macro')
print(acc,f1,auc,prec,recall)

0.8724810042946812 0.8445913147821237 0.9964722572607925 0.8784275814401158 0.8253003129921374


In [None]:
model = torch.load('results/bilstm_model')
model.eval()        
avg_val_loss = 0.
val_preds = np.zeros((len(x_cv),len(le.classes_))) 

# keep/store predictions
with torch.no_grad(): # You should use no_grad session when you evaluate the model. (It is much faster!)
    for i, (x_batch, y_batch) in enumerate(valid_loader): 
        y_pred = model(x_batch).detach()
        val_preds[i * batch_size:(i+1) * batch_size] =F.softmax(y_pred, dim=1).cpu().numpy()

# Check Accuracy
pred_y = val_preds.argmax(axis=1) 
import sklearn.metrics
# val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
acc = sklearn.metrics.accuracy_score(y_pred=pred_y,y_true=test_y)
f1 = sklearn.metrics.f1_score(y_true=test_y,y_pred=pred_y,average='macro')
auc = sklearn.metrics.roc_auc_score(y_true=test_y,y_score=val_preds,multi_class='ovr')
prec = sklearn.metrics.precision_score(y_true=test_y,y_pred=pred_y,average='macro')
recall = sklearn.metrics.recall_score(y_true=test_y,y_pred=pred_y,average='macro')
print(acc,f1,auc,prec,recall)
#0.8197555335315494 0.7206335487417044 0.9901975804315533 0.7829225534517649 0.6965562367234499