Perform unsupervised text classification based on similarity.

# Setup

In [1]:
## for data
import json
import pandas as pd
import numpy as np
from sklearn import metrics, manifold
## for processing
import re
import nltk
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for w2v
import gensim
import gensim.downloader as gensim_api
## for bert
import transformers

In [14]:
corpus = pd.read_csv('real whole corpus.csv', names = ['company', 'text'])
corpus = corpus.dropna()

# Preprocessing

In [15]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_stemm = False, flg_lemm = True, lst_stopwords = None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## tokenize (convert from string to list)
    lst_text = text.split()

    ## remove stopwords 
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]

    ## stemming
    if flg_stemm :
        lemm = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lemm.lemmatize(word) for word in lst_text] 

    ## back to string from list 
    text = " ".join(lst_text)
    return text

In [16]:
lst_stopwords = nltk.corpus.stopwords.words("english")

In [17]:
corpus["text_clean"] = corpus["text"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
                                                                            lst_stopwords=lst_stopwords))

# Create target clusters

In [20]:
## a pretrained NLP model 
nlp = gensim_api.load("glove-wiki-gigaword-300")

In [21]:
general_words = ['climate','risk','carbon','emission','environment','social','governance','physical','transition',
                 'dioxide','sustainability','green','economy','energy','efficiency','clean','waste','recuperation',
                 'renewable','wind','hydro','solar','global','warming','hydrocarbon','methane','oil','coal','gas',
                 'greenhouse','recycling','composting','soil','water','air','pollution','natural','hazard']

physical_words =  ['heat','wave','cold','floods','droughts','wildfires','storms','change','precipitation','pattern',
                   'sea','level','rise','coastal','hazard','fluvial']

transition_words = ['paris','agreement','regulatory','kyoto','protocol','environmental','protection','agency'] 

In [22]:
## use this to create a dictionary of keywords for each category 

## functionalize 
def get_similar_words(lst_words, top, nlp):
    lst_out = lst_words
    for tupla in nlp.most_similar(lst_words, topn=top):
        lst_out.append(tupla[0])
    return list(set(lst_out))

## create dictionary of this form: {category: [keywords]}
dic_clusters = {}

dic_clusters["GENERAL"] = get_similar_words(general_words, top=30, nlp=nlp)
dic_clusters["PHYSICAL"] = get_similar_words(physical_words, top=30, nlp=nlp)
dic_clusters["TRANSITION"] = get_similar_words(transition_words, top=30, nlp=nlp)


## print some
for k,v in dic_clusters.items():
    print(k, ": ", v[0:5], "...", len(v))

GENERAL :  ['produce', 'increase', 'emission', 'risk', 'consumption'] ... 68
PHYSICAL :  ['heat', 'rainfall', 'level', 'hazard', 'result'] ... 46
TRANSITION :  ['agencies', 'requirements', 'proposal', 'regulations', 'regulatory'] ... 38


# Feature engineering

In [23]:
## use Bert for feature engineering

tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case = True)

nlp = transformers.TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [24]:
## function to apply
def utils_bert_embedding(txt, tokenizer, nlp):
    idx = tokenizer.encode(txt, stride = 10, return_overflowing_tokens=True, max_length = 500)[:-1]
    idx = np.array(idx, dtype=np.int64)
    embedding = nlp(idx)
    ## Todo: 
    X = np.array(embedding[0][0][1:-1])
    return X
## create list of news vector
lst_mean_vecs = [utils_bert_embedding(txt, tokenizer, nlp).mean(0) for txt in corpus["text_clean"]]
## create the feature matrix (n news x 768)
X = np.array(lst_mean_vecs)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ResourceExhaustedError: OOM when allocating tensor with shape[504,12,500,500] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:BatchMatMulV2]

In [None]:
dic_y = {k:utils_bert_embedding(v, tokenizer, nlp).mean(0) for k,v in dic_clusters.items()}

# Model design

In [None]:
#--- Model Algorithm ---#
## compute cosine similarities
similarities = np.array([metrics.pairwise.cosine_similarity(X, y).T.tolist()[0] for y in dic_y.values()]).T
## adjust and rescale
labels = list(dic_y.keys())
for i in range(len(similarities)):
    ### assign randomly if there is no similarity
    if sum(similarities[i]) == 0:
        similarities[i] = [0]*len(labels)
        similarities[i][np.random.choice(range(len(labels)))] = 1
    ### rescale so they sum = 1
    similarities[i] = similarities[i] / sum(similarities[i])

## classify the label with highest similarity score
predicted_prob = similarities
predicted = [labels[np.argmax(pred)] for pred in predicted_prob]