In [75]:
import spacy
import random
from spacy.util import minibatch
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import numpy as np

In [5]:
df = pd.read_csv('multi_label_df.csv')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
unique_classes = ['trade', 'grain', 'crude', 'nat-gas', 'corn', 'rice', 'sugar', 'veg-oil', 'ship', 'coffee', 'wheat', 'gold', 'acq', 'interest', 'money-fx', 'copper', 'ipi', 'carcass', 'livestock', 'oilseed', 'soybean', 'earn', 'bop', 'gas', 'jobs', 'cpi', 'gnp', 'dlr', 'yen', 'cocoa', 'cotton', 'money-supply', 'iron-steel', 'alum', 'reserves', 'barley']

In [7]:
# Import packages
import spacy
import pandas as pd
import re
from spacy.tokens import DocBin
from tqdm import tqdm

## Preprocess

In [8]:
def preprocess(df, embed, unique_classes = unique_classes):
    '''
    Preprocess the dataframe into spacy pipeline for later classification
    ---
    Input:
    df (DataFrame): Pandas dataframe containing the raw text and outputs.
    embed (str): Name of pipeline embedding used

    Output:
    df (DataFrame): Preprocessed input dataframe
    docs (doc): SpaCy doc object that stores text data along with classification
    '''


    # Store the data into tuples
    data = tuple(zip(df['text'].tolist(), df['filtered_labels'].tolist())) 
    
    # Load English library from SpaCy
    nlp=spacy.load(embed)
    # print(data[0])

    # Storage for docs
    docs = []

    # One-hot encoding for the classifications
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        for class_name in unique_classes:
            if class_name in label:
                doc.cats[class_name] = 1
            else:
                doc.cats[class_name] = 0
        docs.append(doc)
    return df, docs

In [9]:
!python -m spacy init fill-config config/base_multi.cfg config/multi_config.cfg 

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/multi_config.cfg
You can now add your data and train your pipeline:
python -m spacy train multi_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Train Model

In [10]:
# Covert the train and test dataframes to .spacy files for training

# Preprocess the dataframes for train data
train_data, train_docs = preprocess(train_df,"en_core_web_sm")
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("data/spacy_data/textcat_train_multi.spacy")

# Preprocess the dataframes for test data
test_data, test_docs = preprocess(test_df,"en_core_web_sm")
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("data/spacy_data/textcat_valid_multi.spacy")

  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8630/8630 [02:23<00:00, 60.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [00:36<00:00, 58.75it/s]


In [11]:
# View the entities in the train and test docs
train_loc = "data/spacy_data/textcat_train_multi.spacy"
dev_loc = "data/spacy_data/textcat_valid_multi.spacy"

# Load library and train data
nlp = spacy.load('en_core_web_sm')
doc_bin = DocBin().from_disk(train_loc)
docs = list(doc_bin.get_docs(nlp.vocab))
entities = 0

# Iterate through the docs
for doc in docs:
    entities += len(doc.ents)
print(f"TRAIN docs: {len(docs)} with {entities} entities")

# Load library and test data
doc_bin = DocBin().from_disk(dev_loc)
docs = list(doc_bin.get_docs(nlp.vocab))
entities = 0

# Iterate through the docs
for doc in docs:
    entities += len(doc.ents)
print(f"DEV docs: {len(docs)} with {entities} entities")

TRAIN docs: 8630 with 167327 entities
DEV docs: 2158 with 42018 entities


In [12]:
!python -m spacy train config/multi_config.cfg --verbose --output data/multi_textcat_output --paths.train data/spacy_data/textcat_train_multi.spacy --paths.dev data/spacy_data/textcat_valid_multi.spacy

[2023-08-20 20:27:50,430] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;2m✔ Created output directory: data/multi_textcat_output[0m
[38;5;4mℹ Saving to output directory: data/multi_textcat_output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2023-08-20 20:27:51,094] [INFO] Set up nlp object from config
[2023-08-20 20:27:51,111] [DEBUG] Loading corpus from path: data/spacy_data/textcat_valid_multi.spacy
[2023-08-20 20:27:51,113] [DEBUG] Loading corpus from path: data/spacy_data/textcat_train_multi.spacy
[2023-08-20 20:27:51,114] [INFO] Pipeline: ['textcat_multilabel']
[2023-08-20 20:27:51,117] [INFO] Created vocabulary
[2023-08-20 20:27:51,117] [INFO] Finished initializing nlp object
[2023-08-20 20:28:04,212] [INFO] Initialized pipeline components: ['textcat_multilabel']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2023-08-20 20:28:04,232] [DEBUG] Loading corpus from path: data/spacy_data/textcat_valid_multi.spacy

## Metrics Calculation

In [51]:
def get_spacy_pred(dict):
    return_dict = {}
    for key, value in dict.items():
        if value >=0.5:
            return_dict[key] = value
    return return_dict

In [16]:
# Verify model for English model
nlp_model = spacy.load("data/multi_textcat_output/model-best")
test_text = test_data.text.tolist()
test_cats = test_data['filtered_labels'].tolist()

In [79]:
def create_multilabel_onehot(labels, unique_classes=unique_classes):
    append_list = []
    for item in unique_classes:
        if item in labels:
            append_list.append(1)
        else:
            append_list.append(0)
    return np.array(append_list)

In [80]:
test_df['multi_target'] = test_df['labels'].apply(lambda x: create_multilabel_onehot(x))

In [98]:
pred_list = []
for item in tqdm(test_text, total=len(test_text)):
    doc = nlp_model(item)
    pred = get_spacy_pred(doc.cats)
    one_hot = create_multilabel_onehot(pred.keys())
    pred_list.append(one_hot)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [00:00<00:00, 2537.17it/s]


In [106]:
from sklearn.metrics import  multilabel_confusion_matrix, hamming_loss

In [105]:
multilabel_confusion_matrix(np.array(test_df['multi_target'].to_list()), np.array(pred_list))

array([[[2058,    7],
        [  32,   61]],

       [[2024,    6],
        [  26,  102]],

       [[2045,    7],
        [  26,   80]],

       [[2127,    0],
        [  20,   11]],

       [[2101,    4],
        [  20,   33]],

       [[2139,    0],
        [  19,    0]],

       [[2117,    6],
        [  11,   24]],

       [[2135,    1],
        [  10,   12]],

       [[2095,    3],
        [  21,   39]],

       [[2130,    0],
        [   8,   20]],

       [[2092,    6],
        [  21,   39]],

       [[2125,    0],
        [  13,   20]],

       [[1680,    9],
        [  31,  438]],

       [[2035,   11],
        [  29,   83]],

       [[2007,   18],
        [  28,  105]],

       [[2149,    0],
        [   4,    5]],

       [[2150,    0],
        [   5,    3]],

       [[2143,    0],
        [  15,    0]],

       [[2137,    0],
        [  18,    3]],

       [[2132,    1],
        [  17,    8]],

       [[2138,    3],
        [  14,    3]],

       [[1360,   11],
        [  2

In [107]:
hamming_loss(np.array(test_df['multi_target'].to_list()), np.array(pred_list))

0.008508392544537122

> The model achieves a Hamming Loss of **0.0085** which is considerably good for a primitive model