In [2]:
import spacy
import random
from spacy.util import minibatch
import pandas as pd
from sklearn.model_selection import train_test_split
import string

In [18]:
df = pd.read_csv('single_label_df.csv')

In [36]:
unique_classes = df['top_label'].unique()

In [19]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['top_label'])

## Preprocessing

In [20]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train_df['encoded_label'] = encoder.fit_transform(train_df['top_label'])
test_df['encoded_label']  = encoder.transform(test_df['top_label'])

In [21]:
import string
import re

# Load the English library from SpaCy
nlp = spacy.load("en_core_web_sm")

# # Add contextual spell check to pipeline
# nlp.add_pipe("contextual spellchecker", config={"max_edit_dist": 5})    

# Create list of punctuation marks
punctuations = string.punctuation

# Create list of stopwords from spaCy
stopwords = spacy.lang.en.stop_words.STOP_WORDS


# Creat tokenizer function
def spacy_tokenizer(sentence):
    # Create token object from spacy
    tokens = nlp(sentence)

    # Correct spelling
    # tokens = tokens._.outcome_spellCheck
    # tokens = nlp(tokens)

    # Lemmatize each token and convert each token into lowercase
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "PROPN" else word.lower_ for word in tokens]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    
    
    # return preprocessed list of tokens
    return tokens

## Bag-of-words model

In [25]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# Custom transformer class using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Implement clean_text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Remove spaces and converte text into lowercase
    return text.strip().lower()

# Bag-of-words data transformation
from sklearn.feature_extraction.text import CountVectorizer
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))


In [24]:
X_train = train_df['text']
X_test = test_df['text']
y_train = train_df['encoded_label']
y_test = test_df['encoded_label']

In [26]:
# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

# Create pipeline using Bag of Words
pipe_NB = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe_NB.fit(X_train,y_train)



In [27]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier_log = LogisticRegression()

# Create pipeline using Bag of Words
pipe_log = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier_log)])

# model generation
pipe_log.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# SVM Classifier
from sklearn.svm import SVC
classifier_svm = SVC()

# Create pipeline using Bag of Words
pipe_svm = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier_svm)])

# model generation
pipe_svm.fit(X_train,y_train)



### Model metrics

In [30]:
# Classification Report
from sklearn.metrics import classification_report

# Predict with a test dataset
predicted = pipe_NB.predict(X_test)

# Model Accuracy
print("Naive Bayes Model:\n")
print(classification_report(y_test, predicted, target_names = encoder.classes_))

Naive Bayes Model:

              precision    recall  f1-score   support

         acq       0.90      0.97      0.93       470
       cocoa       1.00      0.50      0.67        12
      coffee       1.00      0.75      0.86        24
      copper       1.00      0.40      0.57        10
         cpi       1.00      0.53      0.70        15
       crude       0.88      0.84      0.86       108
        earn       0.98      0.95      0.96       793
         gnp       0.82      0.45      0.58        20
        gold       0.94      0.70      0.80        23
       grain       0.67      0.97      0.79       116
    interest       0.88      0.48      0.62        58
   livestock       1.00      0.06      0.12        16
    money-fx       0.68      0.96      0.79       143
money-supply       1.00      0.84      0.91        31
     oilseed       0.00      0.00      0.00        16
        ship       0.95      0.58      0.72        33
       sugar       1.00      0.54      0.70        26
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Classification Report
from sklearn.metrics import classification_report
# Predicting with a test dataset
predicted_log = pipe_log.predict(X_test)

# Model Accuracy
print("Logistic Regression Model:\n")
print(classification_report(y_test, predicted_log, target_names = encoder.classes_))

Logistic Regression Model:

              precision    recall  f1-score   support

         acq       0.96      0.96      0.96       470
       cocoa       1.00      1.00      1.00        12
      coffee       0.96      1.00      0.98        24
      copper       1.00      0.90      0.95        10
         cpi       0.87      0.87      0.87        15
       crude       0.93      0.84      0.88       108
        earn       0.97      0.99      0.98       793
         gnp       0.79      0.95      0.86        20
        gold       0.96      0.96      0.96        23
       grain       0.96      0.93      0.94       116
    interest       0.82      0.78      0.80        58
   livestock       1.00      0.62      0.77        16
    money-fx       0.86      0.91      0.88       143
money-supply       0.91      0.94      0.92        31
     oilseed       0.67      0.50      0.57        16
        ship       0.84      0.82      0.83        33
       sugar       0.92      0.92      0.92        26

In [32]:
# Classificatin Report
from sklearn.metrics import classification_report
# Predicting with a test dataset
predicted_svm = pipe_svm.predict(X_test)

# Model Accuracy
print("SVM Model:\n")
print(classification_report(y_test, predicted_svm, target_names = encoder.classes_))

SVM Model:

              precision    recall  f1-score   support

         acq       0.82      0.98      0.89       470
       cocoa       1.00      0.75      0.86        12
      coffee       0.95      0.88      0.91        24
      copper       1.00      0.70      0.82        10
         cpi       0.85      0.73      0.79        15
       crude       0.92      0.78      0.84       108
        earn       0.97      0.98      0.98       793
         gnp       0.79      0.55      0.65        20
        gold       0.91      0.91      0.91        23
       grain       0.87      0.84      0.85       116
    interest       0.86      0.66      0.75        58
   livestock       1.00      0.06      0.12        16
    money-fx       0.82      0.87      0.84       143
money-supply       0.93      0.84      0.88        31
     oilseed       0.86      0.38      0.52        16
        ship       0.92      0.70      0.79        33
       sugar       0.95      0.77      0.85        26
       trade   

## Neural Model

In [35]:
# Import packages
import spacy
import pandas as pd
import re
from spacy.tokens import DocBin
from tqdm import tqdm

In [37]:
def preprocess(df, embed, unique_classes = unique_classes):
    '''
    Preprocess the dataframe into spacy pipeline for later classification
    ---
    Input:
    df (DataFrame): Pandas dataframe containing the raw text and outputs.
    embed (str): Name of pipeline embedding used

    Output:
    df (DataFrame): Preprocessed input dataframe
    docs (doc): SpaCy doc object that stores text data along with classification
    '''


    # Store the data into tuples
    data = tuple(zip(df['text'].tolist(), df['top_label'].tolist())) 
    
    # Load English library from SpaCy
    nlp=spacy.load(embed)
    # print(data[0])

    # Storage for docs
    docs = []

    # One-hot encoding for the classifications
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        for class_name in unique_classes:
            if label==class_name:
                doc.cats[class_name] = 1
            else:
                doc.cats[class_name] = 0
        docs.append(doc)
    return df, docs

### Config Setup

In [39]:
!python -m spacy init fill-config config/base_config.cfg config/config.cfg 

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Spacy English Model

In [42]:
# Covert the train and test dataframes to .spacy files for training

# Preprocess the dataframes for train data
train_data, train_docs = preprocess(train_df,"en_core_web_sm")
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("data/spacy_data/textcat_train.spacy")

# Preprocess the dataframes for test data
test_data, test_docs = preprocess(test_df,"en_core_web_sm")
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("data/spacy_data/textcat_valid.spacy")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8056/8056 [02:09<00:00, 62.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2014/2014 [00:31<00:00, 63.95it/s]


#### Validate the files and train

In [44]:
# View the entities in the train and test docs
train_loc = "data/spacy_data/textcat_train.spacy"
dev_loc = "data/spacy_data/textcat_valid.spacy"

# Load library and train data
nlp = spacy.load('en_core_web_sm')
doc_bin = DocBin().from_disk(train_loc)
docs = list(doc_bin.get_docs(nlp.vocab))
entities = 0

# Iterate through the docs
for doc in docs:
    entities += len(doc.ents)
print(f"TRAIN docs: {len(docs)} with {entities} entities")

# Load library and test data
doc_bin = DocBin().from_disk(dev_loc)
docs = list(doc_bin.get_docs(nlp.vocab))
entities = 0

# Iterate through the docs
for doc in docs:
    entities += len(doc.ents)
print(f"DEV docs: {len(docs)} with {entities} entities")

TRAIN docs: 8056 with 155341 entities
DEV docs: 2014 with 38445 entities


In [46]:
!python -m spacy train config/config.cfg --verbose --output data/textcat_output --paths.train data/spacy_data/textcat_train.spacy --paths.dev data/spacy_data/textcat_valid.spacy

[2023-08-20 18:49:13,970] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;2m✔ Created output directory: data/textcat_output[0m
[38;5;4mℹ Saving to output directory: data/textcat_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-08-20 18:49:14,207] [INFO] Set up nlp object from config
[2023-08-20 18:49:14,220] [DEBUG] Loading corpus from path: data/spacy_data/textcat_valid.spacy
[2023-08-20 18:49:14,221] [DEBUG] Loading corpus from path: data/spacy_data/textcat_train.spacy
[2023-08-20 18:49:14,221] [INFO] Pipeline: ['textcat']
[2023-08-20 18:49:14,224] [INFO] Created vocabulary
[2023-08-20 18:49:14,224] [INFO] Finished initializing nlp object
[2023-08-20 18:49:30,863] [INFO] Initialized pipeline components: ['textcat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2023-08-20 18:49:30,879] [DEBUG] Loading corpus from path: data/spacy_data/textcat_valid.spacy
[2023-08-20 18:49:30,881] [DEBUG] Loading corpus from path: data/spacy_data/textcat_train.spacy
[38;5;4mℹ Pipeli

#### Model Metrics

In [65]:
def get_spacy_pred(dict):
  largest_value = max(dict.values())
  return list(dict.keys())[list(dict.values()).index(largest_value)]

In [58]:
# Verify model for English model
nlp_model = spacy.load("data/textcat_output/model-best")
test_text = test_data.text.tolist()
test_cats = test_data.top_label.tolist()

In [62]:
pred_list = []
for item in tqdm(test_text, total=len(test_text)):
    doc = nlp_model(item)
    pred = get_spacy_pred(doc.cats)
    pred_list.append(pred)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2014/2014 [00:00<00:00, 2620.04it/s]


In [66]:
from sklearn.metrics import f1_score

In [68]:
f1_score(test_cats, pred_list, average='weighted')

0.9272686201100564

> In addition to the above model we can also use **en_core_web_trf** model as the nlp model in spacy for improved performance

> Spacy also supports GPU support and trnsformer based models in the pipeline. In addition it also supports the recent developments in LLM space for any NLP apctivities