In [1]:
import json
import pandas as pd, numpy as np
import sklearn.metrics as skm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
# Path of the Quda corpus file read by dataset_split.
corpus_path = "./quda_corpus.txt"

# Names of the task types. The order matters: position k (0-based) in a
# label vector corresponds to type id k + 1 in the corpus.
lable_typies = [
    'Retrieve Value',
    'Filter',
    'Compute Derived Value',
    'Find Extremum',
    'Sort',
    'Determine Range',
    'Characterize Distribution',
    'Find Anomalies',
    'Cluster',
    'Correlate',
]

# Width of every multi-hot label vector.
num_labels = len(lable_typies)

# Per split type: either False (plain random split over queries) or
# [number of groups, index of the 1-based group id within the ':'-separated
# header token of a corpus line].
split_info = dict(
    random=False,
    expert=[20, 4],
    bundle=[920, 1],
    table=[36, 3],
)

In [3]:
def transform_df(data1, data2=None):
    """Flatten grouped samples into shuffled DataFrames.

    Args:
        data1: Iterable of groups. Each group is an iterable of samples, and
            each sample is indexable with the query text at position 0 and
            its label vector at position 1.
        data2: Optional second collection with the same layout as ``data1``.

    Returns:
        A single DataFrame with columns ``text`` and ``labels`` when
        ``data2`` is ``None``; otherwise a tuple ``(frame1, frame2)`` built
        from ``data1`` and ``data2`` respectively. Rows are shuffled.
    """

    def _flatten(groups):
        # One record per sample, pooled across all groups, then shuffled.
        records = [
            {"text": sample[0], "labels": sample[1]}
            for group in groups
            for sample in group
        ]
        return pd.DataFrame(shuffle(records))

    # Test against None explicitly. Truthiness would treat an empty second
    # collection as "not supplied" (handing a single frame to a caller that
    # unpacks two) and is ambiguous for multi-element NumPy arrays.
    if data2 is not None:
        return _flatten(data1), _flatten(data2)

    return _flatten(data1)


def _parse_corpus_line(line):
    """Parse one corpus line into (header fields, query text, label vector).

    The first whitespace-delimited token is a ':'-separated header whose
    field 0 is a JSON list of 1-based type ids; the remaining tokens, joined
    with single spaces, form the query.
    """
    tokens = line.split()
    header = tokens[0].split(":")
    labels = [0] * num_labels
    for type_id in json.loads(header[0]):
        labels[type_id - 1] = 1
    return header, " ".join(tokens[1:]), labels


def dataset_split(data_path, split_type, test_size=0.2):
    ''' Return a train set and a test set for the specified data-split type.

        Args:
            data_path: The path of Quda corpus
            split_type: 'random', 'expert', 'bundle', or 'table'.
            test_size: The proportion of the dataset to include in the test
                split. For grouped split types it applies to the groups
                (experts / bundles / tables), not to individual queries.

        Returns:
            (train_set, test_set): two DataFrames with columns "text" and
            "labels", where "labels" holds a multi-hot list of length
            num_labels.

        Raises:
            KeyError: if split_type is not a key of split_info.
    '''

    split = split_info[split_type]

    # Read the corpus once; both code paths need the same parsed samples.
    samples = []
    with open(data_path, "r", encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                # Blank lines carry no sample and have no header token.
                continue
            samples.append(_parse_corpus_line(line))

    if split:
        num, pi = split

        # Bucket samples by their 1-based group id found at header field pi.
        groups = [[] for _ in range(num)]
        for header, query, labels in samples:
            groups[int(header[pi]) - 1].append([query, labels])

        # Keep groups as plain Python lists. Converting the ragged
        # [str, list] rows with np.asarray only emitted a deprecation
        # warning on older NumPy and raises ValueError on NumPy >= 1.24;
        # the downstream code only iterates and indexes the samples.
        groups = [shuffle(group) for group in groups]
        groups = shuffle(groups)

        # Split whole groups so that no group contributes to both sets.
        train_s, test_s = train_test_split(groups, test_size=test_size)
        print("The number of %ss for train: %d; for test: %d" % (split_type, len(train_s), len(test_s)))

        train_set, test_set = transform_df(train_s, test_s)
        print("The number of queries for train: %d; for test: %d" % (len(train_set), len(test_set)))

        return train_set, test_set

    # Random split: ignore the header and split over individual queries.
    pairs = shuffle([[query, labels] for _, query, labels in samples])

    train_s, test_s = train_test_split(pairs, test_size=test_size)
    train_set = pd.DataFrame(train_s, columns=["text", "labels"])
    test_set = pd.DataFrame(test_s, columns=["text", "labels"])
    print("The number of queries for train: %d; for test: %d" % (len(train_set), len(test_set)))

    return train_set, test_set

In [4]:
# There are four ways to split the Quda corpus: random, expert, bundle, and table.
# Seed NumPy's global RNG first so the split is reproducible on a fresh
# "Restart & Run All": dataset_split passes no random_state to scikit-learn's
# shuffle / train_test_split, which then draw from that global generator.
np.random.seed(42)
train_set, test_set = dataset_split(corpus_path, "table")

The number of tables for train: 28; for test: 8
The number of queries for train: 11093; for test: 2942


  return array(a, dtype, copy=False, order=order)


In [5]:
# Preview the training split: one row per query with its text and multi-hot labels.
train_set

Unnamed: 0,text,labels
0,what is the average height of the floors on th...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1,the students ' scores in the cw2 were what ran...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0]"
2,what they want to know is what football tactic...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,are there any regions with extremely low hf po...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,"for o3 , what is the distribution value ?","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
...,...,...
11088,how is the population of a given country relat...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
11089,can we group the russian cities on the basis o...,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0]"
11090,are there any outliers in the correlation betw...,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1]"
11091,what was the national distribution of postgrad...,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0]"


In [6]:
# Preview the test split: one row per query with its text and multi-hot labels.
test_set

Unnamed: 0,text,labels
0,"how many likes does the video "" plush - bad un...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,can you tell me if videos with longer names re...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,"what language was the book "" the art of super ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"can you tell me how many likes the video "" plu...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,can you buy a transfer player for less than hi...,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 0]"
...,...,...
2937,the video had already been trending before the...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2938,can you find all the videos that received more...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2939,determine the distribution of male and female ...,"[0, 1, 1, 0, 0, 0, 1, 0, 0, 0]"
2940,"which company is the company that published "" ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [7]:
# Longest query across both splits, measured in whitespace-separated words.
# NOTE(review): this is a word count, not a tokenizer sub-word count; confirm
# it is a sufficient bound when reused as the model's max_seq_length.
max_seq_length = int(
    max(
        frame["text"].str.split().str.len().max()
        for frame in (train_set, test_set)
    )
)
max_seq_length

41

In [8]:
from simpletransformers.classification import MultiLabelClassificationModel

# Build the multi-label classifier from the 'bert-base-cased' checkpoint,
# sized to num_labels outputs and configured with the training options below.
model = MultiLabelClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=num_labels,
    use_cuda=True,
    args=dict(
        train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=3e-5,
        num_train_epochs=3,
        max_seq_length=max_seq_length,  # value computed from the two splits
        fp16=False,
    ),
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not in

In [9]:
# Train the model on the training split.
model.train_model(train_set) 

# Alternatively, load a model from the outputs directory:
# model = MultiLabelClassificationModel('bert', 'outputs/', num_labels=num_labels)

HBox(children=(FloatProgress(value=0.0, max=11093.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 3', max=2774.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 3', max=2774.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 3', max=2774.0, style=ProgressStyle(de…





In [10]:
# Run the trained model over the test queries; `preds` is compared with the
# true labels in the evaluation cell, `outputs` is kept alongside it.
preds, outputs = model.predict(test_set["text"])

HBox(children=(FloatProgress(value=0.0, max=2942.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=368.0), HTML(value='')))




In [11]:
# Evaluate the predictions against the test labels.
y_true = test_set["labels"].tolist()
y_pred = np.array(preds)

# Per-label confusion matrices, followed by the per-label report.
cm = skm.multilabel_confusion_matrix(y_true, y_pred)
print(cm)
print(skm.classification_report(y_true, y_pred, target_names=lable_typies))

# Aggregate scores, printed in this order as "<name>: <value>".
# Each entry is (printed name, metric function, extra keyword arguments).
summary_metrics = [
    ("accuracy_score", skm.accuracy_score, {}),
    ("f1_score", skm.f1_score, {"average": "micro"}),
    ("fbeta_score", skm.fbeta_score, {"average": "micro", "beta": 0.5}),
    ("hamming_loss", skm.hamming_loss, {}),
    ("jaccard_score", skm.jaccard_score, {"average": "micro"}),
    ("precision_score", skm.precision_score, {"average": "micro"}),
    ("recall_score", skm.recall_score, {"average": "micro"}),
    ("zero_one_loss", skm.zero_one_loss, {}),
]
for metric_name, metric_fn, metric_kwargs in summary_metrics:
    print("%s: %f" % (metric_name, metric_fn(y_true, y_pred, **metric_kwargs)))

[[[2507  107]
  [ 124  204]]

 [[1668  249]
  [ 309  716]]

 [[2493   72]
  [ 151  226]]

 [[2473   41]
  [  57  371]]

 [[2713    6]
  [  20  203]]

 [[2688    7]
  [ 112  135]]

 [[2686   17]
  [  43  196]]

 [[2586   51]
  [  74  231]]

 [[2713    7]
  [  35  187]]

 [[2477   50]
  [  65  350]]]
                           precision    recall  f1-score   support

           Retrieve Value       0.66      0.62      0.64       328
                   Filter       0.74      0.70      0.72      1025
    Compute Derived Value       0.76      0.60      0.67       377
            Find Extremum       0.90      0.87      0.88       428
                     Sort       0.97      0.91      0.94       223
          Determine Range       0.95      0.55      0.69       247
Characterize Distribution       0.92      0.82      0.87       239
           Find Anomalies       0.82      0.76      0.79       305
                  Cluster       0.96      0.84      0.90       222
                Correlate    

  _warn_prf(average, modifier, msg_start, len(result))
