# Unsupervised Text Classification with Word Embeddings



In [1]:
import spacy
import sklearn
import pandas as pd
import numpy as np
import string

2021-11-19 15:52:38.272104: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-19 15:52:38.272544: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [66]:
!python -m spacy download en_core_web_lg
# pip uninstall en-core-web-lg
# To remove the model after c

2021-11-02 16:43:39.498196: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-02 16:43:39.498930: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
^C
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 185, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.8/runpy.py", line 144, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "/usr/lib/python3.8/runpy.py", line 111, in _get_module_details
    __import__(pkg_name)
  File "/home/echao/projects/machineLearning-UW/.venv/lib/python3.8/site-packages/spacy/__init__.py", line 11, in <module>
    from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
  File "/home/

In [2]:
nlp = spacy.load('en_core_web_lg')

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

In [8]:
def clean_string(text):
    '''Utility function for removing punctuation, lowercasing and removing extra whitespace'''
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

In [38]:
def embed(tokens, nlp):
    """Returns the centroid of the embeddings for the given tokens of a document
    
    Out-of-vocabulary tokens and stopwords are cast aside.
    Zero vector is returned if no tokens are valid.
    """
    lexemes = (nlp.vocab[token] for token in tokens)
    
    vectors = np.asarray([
        lexeme.vector
        for lexeme in lexemes
        if lexeme.has_vector
        and not lexeme.is_stop
        and len(lexeme.text) > 1
    ])

    if len(vectors) > 0:
        centroid = vectors.mean(axis=0)
    else:
        width = nlp.meta['vectors']['width']  # typically 300
        centroid = np.zeros(width)

    return centroid

**Note:**  A _lexeme_ is a basic abstract unit of meaning, a unit of morphological analysis in linguistics that roughly corresponds to a set of forms taken by a single root word. In spacy, we can access a word's lexeme with nlp.vocab(token), which may have its own, unique word vector.

In [9]:
example0 = "I have never been more angry at myself for forgetting my own girlfriend's birthday."
example1 = "Dogs are such beautiful creatures, I find they area  great companion."
exampl2 = "No all espresso bars are good. Last week I tried one of the worst cups of coffee and it cost me $6."

In [20]:
tokens0 = example0.split(' ')
centroid0 = embed(tokens0, nlp)

LEXEMES: <generator object embed.<locals>.<genexpr> at 0x7f8190b45e40>


### Classifying as 'good' or 'bad'

- All that remains is to find the closest neighbor to the centroid.
- For this task we could use the NearestNeighbors class from `sklearn`.
- We must have the embeddings corresponding to each label

In [17]:
def get_label_embeddings(labels, nlp):
    '''Given a list of label names returns the corresponding word embeddings.
        Handles label names with more than two words.
    '''
    label_embeddings = np.asarray([
        embed(label.split(' '), nlp)
        for label in labels
    ])

    return label_embeddings

In [16]:
label_names = ['good', 'bad']

label_embeddings = get_label_embeddings(label_names, nlp)

LEXEMES: <generator object embed.<locals>.<genexpr> at 0x7f81cb9e5f20>
LEXEMES: <generator object embed.<locals>.<genexpr> at 0x7f81cb9e5f20>


In [19]:
from sklearn.neighbors import NearestNeighbors

nb = NearestNeighbors(n_neighbors=1)
nb.fit(label_embeddings)

NearestNeighbors(n_neighbors=1)

In [21]:
closest_label = nb.kneighbors([centroid0], return_distance=False)[0, 0]
label_names[closest_label]

'bad'

In [24]:
tokens1 = example1.split(' ')
print("Document", example1)
centroid1 = embed(tokens1, nlp)
closest_label = nb.kneighbors([centroid1], return_distance=False)[0, 0]
print("Result:", label_names[closest_label])

Document Dogs are such beautiful creatures, I find they area  great companion.
LEXEMES: <generator object embed.<locals>.<genexpr> at 0x7f8190b45f20>
Result: good


We can see that our first document `example0` got classified correctly!
This is amazing because we did not even have to train any supervised model ourselves.

The million dolar question is, _how well does this perform for N documents?_

## Evaluating Performance with a big dataset

In [42]:
df = pd.read_csv('./data/amazon_baby.csv')

In [48]:
df['review'] = df['review'].astype('string')

In [49]:
df['sentiment'] = np.where(df['rating'] > 3, 'good', 'bad')

eval_df = df[df['rating'] != 3]
neutral_df = df[df['rating'] == 3]

eval_df.head(10)

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,good
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,good
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,good
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,good
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,good
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4,good
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,good
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,good
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,good
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,good


In [50]:
def predict(doc, nlp, nb, label_names):
    doc = clean_string(doc)
    tokens = doc.split()[:50]  # only the first 50 tokens
    centroid = embed(tokens, nlp)
    closest_label = nb.kneighbors([centroid1], return_distance=False)[0, 0]
    return label_names[closest_label]

In [51]:
eval_df['predicted_label'] = eval_df['review'].apply(lambda x: predict(x, nlp, nb, label_names))

# For 2,225 documents, it takes about 2 seconds
# For a 50,000 length dataset, it would take abt 44 secs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['predicted_label'] = eval_df['review'].apply(lambda x: predict(str(x), nlp, nb, label_names))


For demostrating the predictive power of this technique we will use the heuristic of rating to provide true labels to compare against.

- We will see the accuracy of predicting a 4+ star review as good and a 2- star review as bad.
- For 3 star reviews, we will see which one is more prevalent.

In [53]:
eval_df.head()

Unnamed: 0,name,review,rating,sentiment,predicted_label
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,good,good
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,good,good
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,good,good
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,good,good
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,good,good


In [54]:
def accuracy(predictions, labels):
    count = 0
    for p, l in zip(predictions, labels):
        count += p == l
    
    return count*1.0 / len(predictions)

In [56]:
acc = accuracy(eval_df['predicted_label'], eval_df['sentiment'])
print("Accuracy of embeddings classifier", acc)

Accuracy of embeddings classifier 0.8411233448474381


In [None]:
# Echarse un clavado en el 15%
# 

### ...and the Neutral reviews?

Just for curiosity we want to see the majority class among neutral reviews.

In [63]:
def majority_class(predictions):
    good = 0
    bad = 0
    for p in predictions:
        if p=='good':
            good += 1
        else:
            bad += 1
           
    print(f"Good: {good} ({good/len(predictions)}) \nBad: {bad} ({bad/len(predictions)})")

In [64]:
neutral_df.dropna(inplace=True)
neutral_df['review'] = neutral_df['review'].astype('string')
neutral_df['predicted_label'] = neutral_df['review'].apply(lambda x: predict(x, nlp, nb, label_names))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neutral_df['review'] = neutral_df['review'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neutral_df['predicted_label'] = neutral_df['review'].apply(lambda x: predict(x, nlp, nb, label_names))


In [65]:
print(len(neutral_df))
majority_class(neutral_df['predicted_label'])

16705
Good: 16705 (1.0) 
Bad: 0 (0.0)
