In [1]:
import json
import pandas as pd, numpy as np
import sklearn.metrics as skm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
# Path of the Quda corpus text file that dataset_split() reads.
corpus_path = "./quda_corpus.txt"

# Label names in label-id order: label id k in the corpus sets position k - 1
# of the multi-hot vector, and these names are used for the evaluation report.
lable_typies = [
    'Retrieve Value',
    'Filter',
    'Compute Derived Value',
    'Find Extremum',
    'Sort',
    'Determine Range',
    'Characterize Distribution',
    'Find Anomalies',
    'Cluster',
    'Correlate',
]

num_labels = len(lable_typies)

# Split strategies understood by dataset_split().
#   False         -> split individual queries at random
#   [num, field]  -> split whole groups: `num` is the number of groups and
#                    `field` is the index of the ':'-separated metadata field
#                    that holds each query's group id
split_info = {
    "random": False,
    "expert": [20, 4],
    "bundle": [920, 1],
    "table": [36, 3],
}

In [3]:
def _groups_to_frame(groups):
    """Flatten groups of ``[text, labels]`` pairs into one shuffled DataFrame."""
    records = [
        {"text": item[0], "labels": item[1]}
        for group in groups
        for item in group
    ]
    if records:
        records = shuffle(records)
    # Naming the columns keeps the frame schema stable even with no records.
    return pd.DataFrame(records, columns=["text", "labels"])


def transform_df(data1, data2=None):
    """Convert grouped query data into shuffled DataFrames.

    Args:
        data1: An iterable of groups; each group is an iterable of 2-item
            entries whose first item is the query text and whose second item
            is its label vector.
        data2: Optional second collection with the same structure.

    Returns:
        A single DataFrame with columns "text" and "labels" when ``data2`` is
        None; otherwise a ``(data1_frame, data2_frame)`` tuple.
    """
    data1_df = _groups_to_frame(data1)

    # Compare with None explicitly: a truthiness test would treat an empty
    # ``data2`` as "not given" (returning one frame where the caller unpacks
    # two) and would raise for a multi-element numpy array.
    if data2 is None:
        return data1_df

    return data1_df, _groups_to_frame(data2)


def _parse_corpus_line(line):
    """Parse one corpus line into ``(info, query, labels)``; None if blank.

    The first whitespace-delimited token is ':'-separated metadata whose first
    field is a JSON list of label ids (id k sets position k - 1 of the
    multi-hot ``labels`` vector). The remaining tokens form the query text.
    """
    word = line.split()
    if not word:
        # Blank line (e.g. a trailing newline at the end of the file).
        return None
    info = word[0].split(":")
    type_ids = json.loads(info[0])
    query = " ".join(word[1:])
    labels = [0] * num_labels
    for type_id in type_ids:
        labels[type_id - 1] = 1
    return info, query, labels


def dataset_split(data_path, split_type, test_size=0.2):
    ''' Return a train set and a test set for the specified data-split type.

        Args:
            data_path: The path of Quda corpus
            split_type: 'random', 'expert', 'bundle', or 'table'.
            test_size: The proportion of the dataset to include in the test split

        Returns:
            (train_set, test_set): two DataFrames with columns "text" and
            "labels". For 'random' the split is made over individual queries;
            for the other types whole groups are assigned to one side, so
            ``test_size`` is a proportion of groups.

        Raises:
            KeyError: if ``split_type`` is not a key of ``split_info``.
    '''
    if split_type not in split_info:
        raise KeyError("Unknown split_type %r; expected one of %s"
                       % (split_type, sorted(split_info)))

    split = split_info[split_type]
    if split:
        # num: number of groups; pi: index of the metadata field with the group id.
        num, pi = split
        data = [[] for _ in range(num)]

        with open(data_path, "r", encoding='utf-8') as fp:
            for line in fp:
                parsed = _parse_corpus_line(line)
                if parsed is None:
                    continue
                info, query, labels = parsed
                index = int(info[pi]) - 1
                data[index].append([query, labels])

        for i in range(num):
            data[i] = shuffle(data[i])
            # Rows are [str, list] pairs, i.e. ragged: dtype=object is required
            # on NumPy >= 1.24, which no longer builds such arrays implicitly.
            data[i] = np.asarray(data[i], dtype=object)

        data = shuffle(data)

        train_s, test_s = train_test_split(data, test_size=test_size)
        print("The number of %ss for train: %d; for test: %d" % (split_type, len(train_s), len(test_s)))

        train_set, test_set = transform_df(train_s, test_s)
        print("The number of queries for train: %d; for test: %d" % (len(train_set), len(test_set)))

        return train_set, test_set

    # Random split: every query is an independent sample.
    data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            parsed = _parse_corpus_line(line)
            if parsed is None:
                continue
            _, query, labels = parsed
            data.append([query, labels])
    data = shuffle(data)

    train_s, test_s = train_test_split(data, test_size=test_size)
    train_set = pd.DataFrame(train_s, columns=["text", "labels"])
    test_set = pd.DataFrame(test_s, columns=["text", "labels"])
    print("The number of queries for train: %d; for test: %d" % (len(train_set), len(test_set)))

    return train_set, test_set

In [4]:
# There are four ways to split the Quda corpus: random, expert, bundle, and table.
# Here individual queries are split at random with the default test_size of 0.2.
train_set, test_set = dataset_split(corpus_path, "random")

The number of queries for train: 11228; for test: 2807


In [5]:
# Preview the training split (columns: text, labels).
train_set

Unnamed: 0,text,labels
0,calculate the quality of o3 .,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,records with na values are to be removed,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"taking all the free gaming apps , what is the ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
3,using a chosen year of release as cluster para...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
4,show me the station that is the southernmost .,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
...,...,...
11223,has there been any correlation made between th...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
11224,locate all regions with a number of missing re...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
11225,how do boys and girls cw2 scores differ ?,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
11226,how about the correlation between age and trav...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [6]:
# Preview the test split (columns: text, labels).
test_set

Unnamed: 0,text,labels
0,are there any clusters of countries in terms o...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
1,does it appear more likes are given to videos ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,could you please enumerate the books written b...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0]"
3,is there a correlation between life expectancy...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,"do you have to pay money for "" coloring book m...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
2802,give me videos that have received dislikes .,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 0]"
2803,organize all these things by the reviews they ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
2804,"looking through the suicide rates , do any out...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2805,i would like to sort all the counties in conne...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"


In [7]:
# Longest query in each split, measured in whitespace-separated words.
# NOTE(review): the model presumably measures max_seq_length in tokenizer
# tokens (sub-words plus special tokens), which can exceed the word count;
# confirm this value does not truncate the longest queries.
train_longest = train_set["text"].str.split().str.len().max()
test_longest = test_set["text"].str.split().str.len().max()
max_seq_length = int(max(train_longest, test_longest))
max_seq_length

41

In [8]:
import torch
from simpletransformers.classification import MultiLabelClassificationModel

# Training hyper-parameters handed to simpletransformers.
model_args = {
    'train_batch_size': 4,
    'gradient_accumulation_steps': 8,
    'learning_rate': 3e-5,
    'num_train_epochs': 3,
    'max_seq_length': max_seq_length,  # computed from the data in the previous cell
    'fp16': False,
}

# initialize a model
# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines instead of failing at construction time.
model = MultiLabelClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=num_labels,
    use_cuda=torch.cuda.is_available(),
    args=model_args,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not in

In [9]:
# Fine-tune the model on the training split (columns: text, labels).
model.train_model(train_set) 

# Alternatively, load a previously trained model from the outputs directory:
# model = MultiLabelClassificationModel('bert', 'outputs/', num_labels=num_labels)

HBox(children=(FloatProgress(value=0.0, max=11228.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 3', max=2807.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 3', max=2807.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 3', max=2807.0, style=ProgressStyle(de…





In [10]:
# predict
# model.predict is documented to take a plain Python list of strings, so the
# column is converted explicitly instead of passing a pandas Series (whose
# label-based indexing depends on the frame's index starting at 0).
preds, outputs = model.predict(test_set["text"].tolist())

HBox(children=(FloatProgress(value=0.0, max=2807.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=351.0), HTML(value='')))




In [11]:
# evaluate the results
y_true = list(test_set.labels)
y_pred = np.array(preds)

# One confusion matrix per label, then the per-label text report.
cm = skm.multilabel_confusion_matrix(y_true, y_pred)
print(cm)
report = skm.classification_report(y_true, y_pred, target_names=lable_typies)
print(report)

# Aggregate scores; every metric that takes `average` is micro-averaged.
micro = dict(average='micro')

accuracy = skm.accuracy_score(y_true, y_pred)
print("accuracy_score: %f" % accuracy)

f1 = skm.f1_score(y_true, y_pred, **micro)
print("f1_score: %f" % f1)

fbeta = skm.fbeta_score(y_true, y_pred, beta=0.5, **micro)
print("fbeta_score: %f" % fbeta)

hamming = skm.hamming_loss(y_true, y_pred)
print("hamming_loss: %f" % hamming)

jaccard = skm.jaccard_score(y_true, y_pred, **micro)
print("jaccard_score: %f" % jaccard)

precision = skm.precision_score(y_true, y_pred, **micro)
print("precision_score: %f" % precision)

recall = skm.recall_score(y_true, y_pred, **micro)
print("recall_score: %f" % recall)

zero_one = skm.zero_one_loss(y_true, y_pred)
print("zero_one_loss: %f" % zero_one)

[[[2543    3]
  [  11  250]]

 [[1853    8]
  [  20  926]]

 [[2382   13]
  [  22  390]]

 [[2375   12]
  [  10  410]]

 [[2524    8]
  [   9  266]]

 [[2534    5]
  [  22  246]]

 [[2525    1]
  [  18  263]]

 [[2536    5]
  [  13  253]]

 [[2569    5]
  [   7  226]]

 [[2464    1]
  [  10  332]]]
                           precision    recall  f1-score   support

           Retrieve Value       0.99      0.96      0.97       261
                   Filter       0.99      0.98      0.99       946
    Compute Derived Value       0.97      0.95      0.96       412
            Find Extremum       0.97      0.98      0.97       420
                     Sort       0.97      0.97      0.97       275
          Determine Range       0.98      0.92      0.95       268
Characterize Distribution       1.00      0.94      0.97       281
           Find Anomalies       0.98      0.95      0.97       266
                  Cluster       0.98      0.97      0.97       233
                Correlate    

  _warn_prf(average, modifier, msg_start, len(result))
