# LAB 6: Text classification with linear models

Objectives:

* Train and evaluate linear text classifiers using SGDClassifier
* Experiment with different feature extraction and training methods
* Log and evaluate experimental results using [mlflow](https://mlflow.org)

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

### Load and preprocess data

In [2]:
train = pd.read_parquet(
    "s3://ling583/rcv1-topics-train.parquet", storage_options={"anon": True}
)
test = pd.read_parquet(
    "s3://ling583/rcv1-topics-test.parquet", storage_options={"anon": True}
)

In [3]:
train.head()

Unnamed: 0,text,topics
0,NZ bonds close well bid ahead of key U.S. data...,MCAT
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT
2,U.S. public schools get a C report card in qua...,GCAT
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT


CCAT : CORPORATE/INDUSTRIAL  
ECAT : ECONOMICS  
GCAT : GOVERNMENT/SOCIAL  
MCAT : MARKETS

In [4]:
train["topics"].value_counts()

CCAT    5896
MCAT    3281
GCAT    3225
ECAT    1073
Name: topics, dtype: int64

In [5]:
import spacy

nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    doc = nlp.tokenizer(text)
    return [t.norm_ for t in doc if t.is_alpha]

In [6]:
import multiprocessing as mp

In [7]:
with mp.Pool() as p:
    train["tokens"] = pd.Series(p.imap(tokenize, tqdm(train["text"]), chunksize=100))
    test["tokens"] = pd.Series(p.imap(tokenize, tqdm(test["text"]), chunksize=100))

  0%|          | 0/13475 [00:00<?, ?it/s]

  0%|          | 0/3369 [00:00<?, ?it/s]

In [8]:
train.head()

Unnamed: 0,text,topics,tokens
0,NZ bonds close well bid ahead of key U.S. data...,MCAT,"[nz, bonds, close, well, bid, ahead, of, key, ..."
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT,"[asia, product, swaps, jet, gas, oil, regrade,..."
2,U.S. public schools get a C report card in qua...,GCAT,"[public, schools, get, a, c, report, card, in,..."
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT,"[thunder, bay, vessel, clearances, may, daily,..."
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT,"[amoco, gains, shares, in, ula, gyda, fields, ..."


---

### SGDClassifier

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

In [10]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), SGDClassifier())
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.95      0.97      0.96      1475
        ECAT       0.97      0.70      0.81       268
        GCAT       0.96      0.98      0.97       806
        MCAT       0.93      0.96      0.95       820

    accuracy                           0.95      3369
   macro avg       0.95      0.90      0.92      3369
weighted avg       0.95      0.95      0.95      3369



In [12]:
import logger
import mlflow
from logger import log_search, log_test

In [13]:
mlflow.set_experiment("lab-6")
log_test(sgd, test["topics"], predicted)

---

### Hyperparameters

In [14]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:37393")
client

0,1
Client  Scheduler: tcp://127.0.0.1:37393  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.67 GB


In [18]:
from dask_ml.model_selection import RandomizedSearchCV
from scipy.stats.distributions import loguniform, randint, uniform

In [15]:
from warnings import simplefilter

simplefilter(action="ignore", category=FutureWarning)

In [16]:
mlflow.set_experiment("lab-6/SGDClassifier")

INFO: 'lab-6/SGDClassifier' does not exist. Creating a new experiment


In [21]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-8, 100.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.19 s, sys: 275 ms, total: 5.46 s
Wall time: 1min 11s


In [22]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-3, 1.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.17 s, sys: 247 ms, total: 5.41 s
Wall time: 1min 9s


In [23]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha":  [0.1],
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.15 s, sys: 298 ms, total: 5.44 s
Wall time: 1min 6s


----

### Optimized model

In [24]:
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=7, max_df=0.7), SGDClassifier(alpha=0.003)
)
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.96      0.97      0.97      1475
        ECAT       0.91      0.85      0.88       268
        GCAT       0.96      0.97      0.97       806
        MCAT       0.96      0.96      0.96       820

    accuracy                           0.96      3369
   macro avg       0.95      0.94      0.94      3369
weighted avg       0.96      0.96      0.96      3369



In [25]:
mlflow.set_experiment("lab-6")
log_test(sgd, test["topics"], predicted)

The accuracy for SGD CLassifier is 0.96, which is a little bit better than Multinomial Naive Bayes model.

---

### TfidfTransformer 

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

In [41]:
sgd_idf = make_pipeline(CountVectorizer(analyzer=identity), 
                        TfidfTransformer(), 
                        SGDClassifier())
sgd_idf.fit(train["tokens"], train["topics"])
predicted_idf = sgd_idf.predict(test["tokens"])
print(classification_report(test["topics"], predicted_idf))

              precision    recall  f1-score   support

        CCAT       0.97      0.97      0.97      1475
        ECAT       0.94      0.81      0.87       268
        GCAT       0.95      0.98      0.97       806
        MCAT       0.96      0.97      0.96       820

    accuracy                           0.96      3369
   macro avg       0.95      0.93      0.94      3369
weighted avg       0.96      0.96      0.96      3369



In [43]:
mlflow.set_experiment("lab-6")
log_test(sgd_idf, test["topics"], predicted_idf)

This model has an accuracy of .96, which is the same as SGD Classifier. 

### Hyperparameters for TfidfTransformer

In [55]:
mlflow.set_experiment("lab-6/TfidfTransformer")

In [56]:
%%time

search = RandomizedSearchCV(
    sgd_idf,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-3, 1.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.28 s, sys: 274 ms, total: 5.55 s
Wall time: 1min 8s


In [57]:
%%time

search = RandomizedSearchCV(
    sgd_idf,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-8, 100.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.37 s, sys: 268 ms, total: 5.64 s
Wall time: 1min 6s


In [58]:
%%time

search = RandomizedSearchCV(
    sgd_idf,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": [0.1],
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.4 s, sys: 308 ms, total: 5.71 s
Wall time: 1min 18s


In [60]:
sgd_idf = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=4, max_df=0.64), 
                        TfidfTransformer(), 
                        SGDClassifier(alpha=2.15)
)
sgd_idf.fit(train["tokens"], train["topics"])
predicted = sgd_idf.predict(test["tokens"])
print(classification_report(test["topics"], predicted_idf))

              precision    recall  f1-score   support

        CCAT       0.97      0.97      0.97      1475
        ECAT       0.94      0.81      0.87       268
        GCAT       0.95      0.98      0.97       806
        MCAT       0.96      0.97      0.96       820

    accuracy                           0.96      3369
   macro avg       0.95      0.93      0.94      3369
weighted avg       0.96      0.96      0.96      3369



After running hyperparameters, the TFIDF tranformed model has the same accuracy value as SGD Classifier. 

### TruncatedSVD

In [62]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random

In [63]:
svd = make_pipeline(CountVectorizer(analyzer=identity), 
                    TfidfTransformer(), 
                    TruncatedSVD(n_components=100),
                    SGDClassifier())

In [64]:
svd.fit(train["tokens"], train["topics"])
predicted_svd = svd.predict(test["tokens"])
print(classification_report(test["topics"], predicted_svd))

              precision    recall  f1-score   support

        CCAT       0.92      0.97      0.94      1475
        ECAT       0.93      0.72      0.81       268
        GCAT       0.95      0.94      0.95       806
        MCAT       0.95      0.92      0.94       820

    accuracy                           0.93      3369
   macro avg       0.94      0.89      0.91      3369
weighted avg       0.93      0.93      0.93      3369



Truncated SVD model has a lower inital accuracy rate.

In [65]:
mlflow.set_experiment("lab-6")
log_test(svd, test["topics"], predicted_svd)

Now, the Truncated SVD model has smaller accuracy compared to the SGD and Tfid Transformer

### Hyperparameters for TruncatedSVD

In [66]:
mlflow.set_experiment("lab-6/TruncatedSVD")

INFO: 'lab-6/TruncatedSVD' does not exist. Creating a new experiment


In [67]:
%%time

search = RandomizedSearchCV(
    svd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-8, 100.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.84 s, sys: 487 ms, total: 6.33 s
Wall time: 3min 54s


In [68]:
%%time

search = RandomizedSearchCV(
    svd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": loguniform(1e-3, 1.0),
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.96 s, sys: 459 ms, total: 6.42 s
Wall time: 3min 40s


In [69]:
%%time

search = RandomizedSearchCV(
    svd,
    {
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "sgdclassifier__alpha": [0.1],
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 5.72 s, sys: 445 ms, total: 6.17 s
Wall time: 3min 31s


In [77]:
svd = make_pipeline(CountVectorizer(analyzer=identity, min_df=8, max_df=0.85), 
                    TfidfTransformer(), 
                    TruncatedSVD(n_components=100),
                    SGDClassifier())

In [78]:
svd.fit(train["tokens"], train["topics"])
predicted_svd = svd.predict(test["tokens"])
print(classification_report(test["topics"], predicted_svd))

              precision    recall  f1-score   support

        CCAT       0.96      0.95      0.96      1475
        ECAT       0.88      0.79      0.83       268
        GCAT       0.93      0.98      0.95       806
        MCAT       0.94      0.95      0.94       820

    accuracy                           0.94      3369
   macro avg       0.93      0.92      0.92      3369
weighted avg       0.94      0.94      0.94      3369



In [79]:
mlflow.set_experiment("lab-6")
log_test(svd, test["topics"], predicted_svd)

---

### Summary 

- The tuned accuracy for four models are: 
- Multinomial: .94
- SGD Classifier: .96
- TFIDF Tranformer: .96 
- Truncated SVD: .94 
F1 scores are very similar for all of these models. 

The best model that I've tried would be SGD Classifier because it has the highest accuracy value. Although, TFIDF Tranforer model also has the same accuracy value as SGD Classifier, this model has more comlicated parameters. In other words, SGD Classifier model is more simple to work with. 