The goal of this notebook is to develop an end-to-end text classifier using different embeddings such as word2vec, fasttext, and doc2vec.

The first pipeline step cleans the text data (more cleaning rules will be added for future data).

In [1]:
# Data cleaning
# ===========================================================================
import re
def clean_text(text):
    """
    Normalise one raw text string.

    Steps, in order:
      1. every ". " (period followed by a space) becomes a single space,
      2. every comma is removed,
      3. the text is lower-cased,
      4. a single "." at the very end of the string is dropped.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Replace ". " with a space (so neighbouring words stay separated), drop commas,
    # then make all words lower case.
    text = text.replace(". ", " ").replace(",", "").lower()
    # Drop the final period of the text. The pattern is a raw string because "\." in a
    # plain literal is an invalid escape sequence. No re.MULTILINE is used, so `$`
    # anchors only at the end of the string (or just before one trailing newline),
    # not at the end of every embedded line.
    cleaned_text = re.sub(r"\.$", "", text)

    return cleaned_text
 


class text_clean:
    """
    Stateless fit/transform step that applies ``clean_text`` to every entry of a
    pandas Series.
    """
    def fit(self, X, y=None):
        """
        Nothing is learned from the data; the instance is returned unchanged.

        ``y`` is accepted and ignored. It defaults to None so the step can be fitted
        with or without labels.
        """
        return self

    def transform(self, X):
        """
        Return a new Series holding ``clean_text(value)`` for each value of ``X``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas Series.
        """
        assert isinstance(X, pd.Series), "The input data should be pandas Series."

        return X.apply(clean_text)


# Word embedding training 
# ===========================================================================
import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.externals import joblib
from tqdm import tqdm



def _find_part_pii(text, model, sep = " "):
    tokenized_text = text.split(sep)
    
    part_pii = model.wv.doesnt_match(tokenized_text)
    
    return part_pii    



def _extracted_pii2matrix(pii_list, model):
    # set the matrix dimensions
    column_num = model.trainables.layer1_size
    row_num = len(pii_list)
    # initialized the matrix
    pii2vec_mat = np.zeros((row_num, column_num))
    # iterate through the pii_list and assign the vectors to matrix.
    for index, ith_pii in enumerate(tqdm(pii_list)):
        pii2vec_mat[index,:] = model.wv[ith_pii]
    
    return pii2vec_mat



class word_embedding:
    """
    Fit/transform wrapper that trains a gensim embedding model and turns each text
    into one vector.

    ``fit`` either trains a fresh model on the given texts or, when a model was passed
    through ``pre_train``, adopts that model as is.

    ``transform`` optionally continues training the model on the texts it receives
    (``re_train_new_sentences``). It then picks, for every text, the token returned by
    ``model.wv.doesnt_match`` and returns the word vectors of those tokens stacked into
    a matrix (one row per text).

    Parameters
    ----------
    algo_name : {"word2vec", "fasttext", "doc2vec"}
        Which gensim model class to build.
    size, min_count, window, workers
        Passed to the gensim model constructor.
    epochs : int
        Passed to ``model.train``.
    pre_train : gensim model or None
        Already-trained model to use instead of training a new one in ``fit``.
        The object itself is stored (no copy), so continued training in
        ``transform`` updates the caller's model.
    dump_file
        Stored on the instance only. Nothing in this class reads it yet, so no
        model is written to disk.  TODO: implement the dump.
    re_train_new_sentences : bool
        When True (default), ``transform`` updates the vocabulary and trains on
        the texts it is given before extracting vectors.
    """
    def __init__(self, algo_name = "word2vec", size = 100, min_count = 1, window = 5, workers =1,\
                 epochs = 5, pre_train = None, dump_file = False,\
                 re_train_new_sentences = True):

        assert algo_name in ["word2vec", 'fasttext', 'doc2vec'], \
        "please enter a model name in ['word2vec', 'fasttext', 'doc2vec']"

        self.algo_name = algo_name
        self.epochs = epochs
        self.pre_train = pre_train
        self.dump_file = dump_file
        self.re_train_new_sentences = re_train_new_sentences

        # model options
        self.size = size
        self.min_count = min_count
        self.window = window
        self.workers = workers

    def _algo_init(self):
        """Create an untrained gensim model for ``self.algo_name`` and store it in ``self.model``."""
        # NOTE(review): Word2Vec/FastText are given `size=` while Doc2Vec is given
        # `vector_size=`; this presumably matches the gensim release the notebook was
        # written for -- confirm before upgrading gensim.
        if self.algo_name == "word2vec":
            model = Word2Vec(size = self.size, min_count = self.min_count,
                            window = self.window, workers = self.workers)
        elif self.algo_name == "fasttext":
            model = FastText(size = self.size, min_count = self.min_count,
                            window = self.window, workers = self.workers)
        elif self.algo_name == "doc2vec":
            model = Doc2Vec(vector_size = self.size, min_count = self.min_count,
                            window = self.window, workers = self.workers)

        self.model = model
        return self

    def _embedding_training(self, sentences, update = False):
        """
        Build (or extend) the vocabulary from ``sentences`` and train ``self.model`` on them.

        If update = True, the existing vocabulary is updated and the model continues to train.
        If update = False, the model rebuilds a new vocabulary from scratch using the input data.
        """
        self.model.build_vocab(sentences, update = update)
        self.model.train(sentences, total_examples = len(sentences), epochs = self.epochs)

    def _pd_to_gensim_format(self, text):
        """
        Tokenise every sentence of ``text`` by splitting on single spaces.

        Returns a list of token lists. For doc2vec each token list is wrapped in a
        ``TaggedDocument`` tagged with its position in ``text``.
        """
        # special handling for doc2vec model.
        if self.algo_name == "doc2vec":
            documents = [TaggedDocument(sentence.split(" "), [index])
                         for index, sentence in enumerate(text)]
            print("Using index for the tags")
        else:
            documents = [sentence.split(" ") for sentence in text]

        return documents

    def fit(self, X, y = None):
        """
        Use the pre-trained model if one was assigned to the pre_train option;
        otherwise initialise a new model and train it on ``X``.

        ``y`` is accepted and ignored so the step can be fitted with or without labels.
        Returns ``self``.
        """
        if self.pre_train is not None:
            # Nothing to train, so the corpus is not tokenised here.
            self.model = self.pre_train
            return self

        # initialize the model, split the sentences into tokens and train it.
        self._algo_init()
        self._embedding_training(sentences = self._pd_to_gensim_format(text = X))

        return self

    def transform(self, X):
        """
        Return one word vector per text in ``X`` as a matrix.

        If re_train_new_sentences is True, which is the default setting,
        the model will be re-trained on the new sentences first.
        This will create word embeddings for words not in the original vocabulary.
        This will increase the model inference time since it involves model training.

        For using word2vec to predict PII data, it is recommended to update the model with new sentences.
        For fasttext, it is not necessary since it will infer from the character n-grams. The fasttext training
        is much longer than word2vec.
        """
        gensim_X = self._pd_to_gensim_format(text = X)
        # update the embedding with new sentences or train the model.
        if self.re_train_new_sentences:
            self._embedding_training(sentences = gensim_X, update = True)
            print("transforming while training {} model with new data.".format(self.algo_name))

        # extract the PII token of every text
        extracted_pii_list = [_find_part_pii(text = text, model = self.model)
                              for text in tqdm(X)]

        # convert the extracted pii tokens into vectors.
        piivec_matrix = _extracted_pii2matrix(pii_list = extracted_pii_list,
                                              model = self.model)
        return piivec_matrix
                                          

In [2]:
import pandas as pd
# Sample CSV used to exercise each embedding back-end below; the cells read its "Text" column.
algo_test_data = pd.read_csv(
    "../data/train_text_with_pii_2019_01_05_02_48_24_796403.csv"
)

## Word2vec testing

In [3]:
# Train a fresh word2vec model on the Text column (fit returns the instance itself),
# then embed the same column; transform keeps training because re_train_new_sentences
# defaults to True.
testing_embedding = word_embedding(algo_name = 'word2vec').fit(algo_test_data['Text'])
test_pii_matrix = testing_embedding.transform(algo_test_data["Text"])
test_pii_matrix

100%|██████████| 800/800 [00:00<00:00, 8181.51it/s]
100%|██████████| 800/800 [00:00<00:00, 92377.92it/s]

transforming while training word2vec model with new data.





array([[-0.00217615,  0.00796351, -0.00748214, ...,  0.00887035,
        -0.00148853,  0.00133624],
       [-0.00447374,  0.01026269, -0.00777159, ...,  0.0162302 ,
         0.00067097,  0.00166161],
       [-0.0069133 ,  0.00655595, -0.00614706, ...,  0.00911191,
         0.00302767, -0.00566331],
       ..., 
       [-0.02315124,  0.026426  , -0.0140119 , ...,  0.03725655,
        -0.00016703, -0.00462752],
       [-0.02108077,  0.01676678, -0.00663171, ...,  0.02916127,
        -0.00657078,  0.00124387],
       [-0.01856113,  0.01454719, -0.01343988, ...,  0.02961764,
        -0.00293513, -0.00825683]])

In [4]:
# Start from a saved word2vec model instead of training one from scratch.
pre_trained = Word2Vec.load("./word2vec/word2vec_cleaned_300_.bin")
# fit only adopts `pre_trained` (the same object, not a copy) and returns the wrapper.
testing_pre_trained = word_embedding(
    algo_name = "word2vec", pre_train = pre_trained
).fit(algo_test_data["Text"])
# re_train_new_sentences defaults to True, so this call also continues training
# `pre_trained` on the Text column before the vectors are extracted.
testing_pre_trained.transform(algo_test_data["Text"])


  0%|          | 0/800 [00:00<?, ?it/s]

transforming while training word2vec model with new data.


100%|██████████| 800/800 [00:00<00:00, 2534.26it/s]
100%|██████████| 800/800 [00:00<00:00, 154585.98it/s]


array([[ 0.0752305 , -0.18665323, -0.9611361 , ...,  0.15208749,
         0.02958794, -0.41311947],
       [ 0.43034682, -0.05534443, -0.96178532, ...,  0.46109006,
         0.22167945, -0.54757804],
       [ 0.03700855, -0.02521761, -0.01220291, ..., -0.05585773,
         0.20333029,  0.08629275],
       ..., 
       [-0.04278483, -0.05869394,  0.02729131, ..., -0.08618189,
         0.08368408,  0.00944143],
       [ 0.29659262, -0.58743894,  0.01724759, ...,  0.43597522,
        -0.06645637,  1.55932295],
       [ 0.14534651, -0.12933674, -0.10637662, ...,  0.02798184,
         0.06007978,  0.08667902]])

## Fasttext testing

In [5]:
# Same round trip as the word2vec cell, with the fasttext back-end.
# Note: this rebinds testing_embedding / test_pii_matrix from the word2vec cell.
testing_embedding = word_embedding(algo_name = 'fasttext').fit(algo_test_data["Text"])
test_pii_matrix = testing_embedding.transform(algo_test_data["Text"])
test_pii_matrix

100%|██████████| 800/800 [00:00<00:00, 6674.95it/s]
100%|██████████| 800/800 [00:00<00:00, 134583.80it/s]

transforming while training fasttext model with new data.





array([[-0.00887869,  0.00361352, -0.01853686, ..., -0.05992417,
        -0.00419788,  0.05287892],
       [ 0.00052317,  0.00261651, -0.01038469, ..., -0.0329863 ,
        -0.00088623,  0.02751647],
       [-0.0102258 ,  0.00307373, -0.02907217, ..., -0.07936461,
        -0.00811043,  0.06561628],
       ..., 
       [-0.02260031,  0.00858459, -0.05097585, ..., -0.14817157,
        -0.00801586,  0.1313304 ],
       [-0.02994826,  0.0141341 , -0.06360444, ..., -0.16948107,
        -0.00543688,  0.1463099 ],
       [-0.02625905,  0.0081979 , -0.05881318, ..., -0.15292889,
        -0.00570232,  0.13339898]])

## doc2vec testing

In [6]:
# doc2vec back-end: sentences are wrapped as TaggedDocument with their index as tag.
# The returned matrix is still read from the model's word vectors (model.wv),
# one row per text.
testing_embedding_doc2vec = word_embedding(algo_name = 'doc2vec').fit(X = algo_test_data['Text'])

test__doc2vec_pii_matrix = testing_embedding_doc2vec.transform(algo_test_data['Text'])
test__doc2vec_pii_matrix

Using index for the tags


  0%|          | 0/800 [00:00<?, ?it/s]

Using index for the tags
transforming while training doc2vec model with new data.


100%|██████████| 800/800 [00:00<00:00, 8090.49it/s]
100%|██████████| 800/800 [00:00<00:00, 111114.75it/s]


array([[ 0.00194516,  0.00455558, -0.00649501, ...,  0.00442428,
         0.0003333 ,  0.00088473],
       [-0.00020503,  0.00680893, -0.0075832 , ...,  0.01286311,
         0.00291317,  0.00047825],
       [-0.00049342,  0.00556684,  0.00063821, ...,  0.01182422,
        -0.00324276, -0.00413751],
       ..., 
       [-0.01013561,  0.01485797, -0.01136504, ...,  0.02295436,
         0.00547766, -0.00727283],
       [-0.01072698,  0.008491  , -0.00622351, ...,  0.0203049 ,
        -0.00110666, -0.00171354],
       [-0.00733437,  0.01433416, -0.00885574, ...,  0.0186146 ,
        -0.00331921, -0.00483936]])

In [7]:
## add transfer_learning option.

In [8]:
## TODO: add dump_file option (word_embedding accepts dump_file but never writes the model)