# LAB 6: Text classification with linear models

Objectives:

* Train and evaluate linear text classifiers using SGDClassifier
* Experiment with different feature extraction and training methods
* Log and evaluate experimental results using [mlflow](https://mlflow.org)

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

### Load and preprocess data

In [2]:
train = pd.read_parquet(
    "s3://ling583/rcv1-topics-train.parquet", storage_options={"anon": True}
)
test = pd.read_parquet(
    "s3://ling583/rcv1-topics-test.parquet", storage_options={"anon": True}
)

In [3]:
train.head()

Unnamed: 0,text,topics
0,NZ bonds close well bid ahead of key U.S. data...,MCAT
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT
2,U.S. public schools get a C report card in qua...,GCAT
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT


CCAT : CORPORATE/INDUSTRIAL  
ECAT : ECONOMICS  
GCAT : GOVERNMENT/SOCIAL  
MCAT : MARKETS

In [4]:
train["topics"].value_counts()

CCAT    5896
MCAT    3281
GCAT    3225
ECAT    1073
Name: topics, dtype: int64

In [5]:
import spacy

nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    doc = nlp.tokenizer(text)
    return [t.norm_ for t in doc if t.is_alpha]

In [6]:
import multiprocessing as mp

In [7]:
with mp.Pool() as p:
    train["tokens"] = pd.Series(p.imap(tokenize, tqdm(train["text"]), chunksize=100))
    test["tokens"] = pd.Series(p.imap(tokenize, tqdm(test["text"]), chunksize=100))

  0%|          | 0/13475 [00:00<?, ?it/s]

  0%|          | 0/3369 [00:00<?, ?it/s]

In [8]:
train.head()

Unnamed: 0,text,topics,tokens
0,NZ bonds close well bid ahead of key U.S. data...,MCAT,"[nz, bonds, close, well, bid, ahead, of, key, ..."
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT,"[asia, product, swaps, jet, gas, oil, regrade,..."
2,U.S. public schools get a C report card in qua...,GCAT,"[public, schools, get, a, c, report, card, in,..."
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT,"[thunder, bay, vessel, clearances, may, daily,..."
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT,"[amoco, gains, shares, in, ula, gyda, fields, ..."


---

### SGDClassifier

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

In [10]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.97      0.96      0.96      1475
        ECAT       0.91      0.88      0.89       268
        GCAT       0.95      0.98      0.97       806
        MCAT       0.96      0.95      0.96       820

    accuracy                           0.96      3369
   macro avg       0.95      0.94      0.95      3369
weighted avg       0.96      0.96      0.96      3369



In [11]:
import logger
import mlflow
from logger import log_search, log_test

In [12]:
mlflow.set_experiment("lab-6")
log_test(sgd, test["topics"], predicted)

---

### Hyperparameters

In [13]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:35033")
client

0,1
Client  Scheduler: tcp://127.0.0.1:35033  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [14]:
from dask_ml.model_selection import RandomizedSearchCV
from scipy.stats.distributions import loguniform, randint, uniform

In [15]:
from warnings import simplefilter

simplefilter(action="ignore", category=FutureWarning)

In [16]:
mlflow.set_experiment("lab-6/SGDClassifier")

In [17]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": [0.1],
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.38 s, sys: 429 ms, total: 5.81 s
Wall time: 1min 4s


### Optimized Model for SGD Classifier




In [18]:
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=2, max_df=0.7), SGDClassifier(alpha=0.1)
)
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.95      0.96      0.96      1475
        ECAT       0.95      0.75      0.84       268
        GCAT       0.95      0.98      0.96       806
        MCAT       0.94      0.96      0.95       820

    accuracy                           0.95      3369
   macro avg       0.95      0.91      0.93      3369
weighted avg       0.95      0.95      0.95      3369



In [19]:
mlflow.set_experiment("lab-6")
log_test(sgd, test["topics"], predicted)

### TfidTransformer Classifier

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), 
                    TfidfTransformer(), 
                    SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.96      0.97      0.97      1475
        ECAT       0.94      0.82      0.87       268
        GCAT       0.95      0.98      0.97       806
        MCAT       0.96      0.96      0.96       820

    accuracy                           0.96      3369
   macro avg       0.95      0.93      0.94      3369
weighted avg       0.96      0.96      0.96      3369



In [22]:
mlflow.set_experiment("lab-6/SGDClassifierTransform")

In [23]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-8, 100.0),
        "tfidftransformer__smooth_idf":[True, False],
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.48 s, sys: 420 ms, total: 5.9 s
Wall time: 1min 5s


### Optimized Model for SGD Classifier Transformation

In [24]:
sgd = make_pipeline(CountVectorizer(analyzer=identity, max_df = 0.97),
                    TfidfTransformer(use_idf = "TRUE"), 
                    SGDClassifier(alpha = 0.0001))
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))    
#Take note of the difference between the macro average of the default model and the 
#optimized model for the transformation.

              precision    recall  f1-score   support

        CCAT       0.97      0.97      0.97      1475
        ECAT       0.94      0.81      0.87       268
        GCAT       0.95      0.98      0.97       806
        MCAT       0.96      0.97      0.96       820

    accuracy                           0.96      3369
   macro avg       0.96      0.93      0.94      3369
weighted avg       0.96      0.96      0.96      3369



In [25]:
mlflow.set_experiment("lab-6")
log_test(sgd, test["topics"], predicted)

### Truncated SVD Model

In [None]:
from sklearn.decomposition import TruncatedSVD

In [27]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), 
                    TfidfTransformer(), 
                    TruncatedSVD(n_components=100),
                    SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.96      0.95      0.95      1475
        ECAT       0.93      0.74      0.82       268
        GCAT       0.91      0.98      0.94       806
        MCAT       0.94      0.94      0.94       820

    accuracy                           0.94      3369
   macro avg       0.93      0.90      0.91      3369
weighted avg       0.94      0.94      0.94      3369



In [28]:
mlflow.set_experiment("lab-6/SGDClassifierTruncate")

In [29]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-8, 100.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 6.05 s, sys: 561 ms, total: 6.61 s
Wall time: 3min 58s


### Optimized TruncatedSVD 

In [30]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), 
                    TfidfTransformer(), 
                    TruncatedSVD(n_components=100),
                    SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.97      0.94      0.95      1475
        ECAT       0.92      0.72      0.81       268
        GCAT       0.91      0.98      0.94       806
        MCAT       0.92      0.96      0.94       820

    accuracy                           0.94      3369
   macro avg       0.93      0.90      0.91      3369
weighted avg       0.94      0.94      0.94      3369



In [31]:
mlflow.set_experiment("lab-6")
log_test(sgd, test["topics"], predicted)

### Ngrams Classifier

In [32]:
from nltk import bigrams
def unibigrams(toks):
    return [(tok,) for tok in toks] + list(bigrams(toks))

sgd = make_pipeline(CountVectorizer(analyzer=unibigrams), SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.97      0.96      0.96      1475
        ECAT       0.89      0.86      0.88       268
        GCAT       0.96      0.98      0.97       806
        MCAT       0.95      0.96      0.95       820

    accuracy                           0.95      3369
   macro avg       0.94      0.94      0.94      3369
weighted avg       0.95      0.95      0.95      3369



In [33]:
mlflow.set_experiment("lab-6/SGDClassifierNGrams")

In [None]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-8, 100.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

In [None]:
sgd = make_pipeline(CountVectorizer(analyzer=unibigrams), SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

It would seem that the best classifier from using the skikit learn packages would be the Ngram classifier, as it has the largest macro average f1-score out of the others at 0.94. The order of the best classifiers after the ngram classifier would be the transformer, then truncatedSGD, then the default SGD classifier that I used in making the pipelines for text classification, based on the f1-scores value.  