## Testing on unseen text data.

The goal of this notebook is to evaluate how different embeddings and algorithms perform on unseen text data.

### Load the unseen text data

In [1]:
import pandas as pd

# Load the cleaned test and training sets. The test path is split across two
# adjacent string literals only to keep the line short; the resulting path is
# a single string.
test_data = pd.read_csv(
    "../clean_data/Cleaned_test_text_with_pii_"
    "2018_12_31_05_35_46_815414.csv"
)
train_data = pd.read_csv(
    "../clean_data/Cleaned_train_text_with_pii_2018_12_29_07_26_56_266227.csv"
)

In [2]:
# Preview the first rows of the test set to check the loaded columns.
test_data.head()

Unnamed: 0,Text,Labels,PII,Cleaned_text,Target
0,Wife marriage cup 7416 Smith Forks before pict...,Address,7416 Smith Forks,wife marriage cup 7416 smith forks before pict...,1
1,However send which. Suite 244 Nice market acce...,Address,Suite 244,however send which suite 244 nice market accep...,1
2,0497 Kemp Lane Amount tough and fire until. Is...,Address,0497 Kemp Lane,0497 kemp lane amount tough and fire until iss...,1
3,Model north receive nature effort 58162 France...,Address,"58162 Frances Shoals Conniemouth, OH 71686",model north receive nature effort 58162 france...,1
4,Child already drive could. Begin such down cel...,Address,538 Gina Circles,child already drive could begin such down cell...,1


In [14]:
# Number of test rows per label, to check how balanced the classes are.
test_data['Labels'].value_counts()

None                10000
Phone_number        10000
CreditCardNumber    10000
Name                10000
Address             10000
SSN                 10000
Email               10000
Plates              10000
Name: Labels, dtype: int64

### Accomplish the entire training process with a simple Pipeline

In [3]:
# 0 denotes no PII
# 1 denotes PII exists
def binary_pii(label):
    """
    Map a PII label to a binary target.

    Parameters
    ----------
    label : str or missing value
        Label of a text, e.g. "Address" or "None".

    Returns
    -------
    int
        0 when the label is the string "None" or is missing (None / NaN),
        1 for every other label.
    """
    # Recent pandas versions parse the literal text "None" in a CSV as a
    # missing value by default, so a missing label must count as "no PII"
    # as well. NaN is the only value that is not equal to itself.
    if label is None or label != label:
        return 0
    return 0 if label == "None" else 1

# Derive the binary target from the labels. This overwrites the "Target"
# column that was already present in the loaded CSV.
test_data["Target"] = test_data['Labels'].apply(binary_pii)

In [4]:
import re
def clean_text(text):
    """
    Normalise a raw text string before it is tokenised.

    Every ". " (period followed by a space) becomes a single space, all
    commas are removed, the text is lower-cased and a period at the very end
    of the text is dropped.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Replace ". " with a single space, drop commas, then lower-case.
    text = text.replace(". ", " ").replace(",", "").lower()
    # Drop the period at the end of the text. The pattern is a raw string so
    # that "\." is not interpreted as an (invalid) string escape sequence.
    cleaned_text = re.sub(r"\.$", "", text)

    return cleaned_text
 


class text_clean:
    """
    Pipeline step that cleans every text of a pandas Series with
    ``clean_text``.
    """
    def fit(self, X, y=None):
        """
        Nothing is learned; the instance is returned unchanged.

        ``y`` is ignored. It is optional so the step can be fitted both as
        ``fit(X)`` and as ``fit(X, y)``.
        """
        return self

    def transform(self, X):
        """
        Apply ``clean_text`` to each element of ``X``.

        Parameters
        ----------
        X : pandas.Series
            Series of raw text strings.

        Returns
        -------
        pandas.Series
            A Series holding the cleaned text (not a whole dataframe).

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas Series.
        """
        assert isinstance(X,pd.Series), "The input data should be pandas Series."
        return X.apply(clean_text)

In [5]:
from tqdm import tqdm
import numpy as np
from gensim.models import Word2Vec


def _find_part_pii(text, model, sep = " "):
    tokenized_text = text.split(sep)
    
    part_pii = model.wv.doesnt_match(tokenized_text)
    
    return part_pii    



def _get_word2vec_matrix(pii_list, model):
    """
    Stack the word vectors of the given tokens into one matrix.

    Parameters
    ----------
    pii_list : list of str
        Tokens to look up; each one must be retrievable via ``model.wv[token]``.
    model : trained word2vec model
        Model whose ``wv`` attribute provides the vectors.

    Returns
    -------
    numpy.ndarray
        Array with one row per token of ``pii_list`` and one column per
        vector dimension; row ``i`` is the vector of ``pii_list[i]``.
    """
    # Read the vector width from the keyed vectors instead of
    # ``model.trainables``: ``wv.vector_size`` is available in gensim 3.x
    # and 4.x, whereas ``trainables`` was removed in gensim 4.
    column_num = model.wv.vector_size
    row_num = len(pii_list)
    # Pre-allocate the whole matrix once, then fill it row by row.
    pii2vec_mat = np.zeros((row_num, column_num))
    for index, ith_pii in enumerate(tqdm(pii_list)):
        pii2vec_mat[index, :] = model.wv[ith_pii]

    return pii2vec_mat



class word2vec_embed:
    """
    Pipeline step that represents each text by a single word2vec vector.

    ``fit`` trains a gensim ``Word2Vec`` model on the space-tokenized
    sentences. ``transform`` continues training that model on the sentences
    it receives, picks one token per text with ``_find_part_pii`` and returns
    the vectors of those tokens stacked into a matrix.

    An already trained model can be passed as ``model`` so that new, unseen
    text can be transformed without calling ``fit`` first.

    Parameters
    ----------
    size : int
        Dimensionality of the word vectors.
    window : int
        ``window`` argument forwarded to ``Word2Vec``.
    min_count : int
        ``min_count`` argument forwarded to ``Word2Vec``.
    workers : int
        ``workers`` argument forwarded to ``Word2Vec``; 1 by default to avoid
        problems with multi-core training.
    epochs : int
        Number of epochs used by every ``train`` call (in ``fit`` and in
        ``transform``).
    model : trained Word2Vec model, optional
        When given, ``size``, ``window`` and ``min_count`` are read from it
        and override the corresponding arguments.
    """
    def __init__(self, size=100, window=5, min_count=1, workers=1,
                 epochs=5, model=None):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.epochs = epochs
        # Defaults to 1 to avoid problems with multi-core training.
        self.workers = workers
        self.model = model
        if model is not None:
            # Mirror the hyper-parameters of the supplied model.
            # ``wv.vector_size`` exists in gensim 3.x and 4.x, unlike
            # ``trainables.layer1_size`` which was removed in gensim 4.
            self.size = model.wv.vector_size
            self.window = model.window
            self.min_count = model.min_count

    def fit(self, X, y=None):
        """
        Train a new ``Word2Vec`` model on ``X`` and store it in ``self.model``.

        Parameters
        ----------
        X : pandas.Series
            Sentences; each one is tokenized by splitting on a single space.
        y : ignored
            Optional so the step can be fitted as ``fit(X)`` or ``fit(X, y)``.

        Returns
        -------
        word2vec_embed
            The fitted instance itself.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas Series.
        """
        assert isinstance(X, pd.Series), \
            "The input data should be pandas Series for word2vec."

        # Tokenize the sentences.
        tokenized_sentences = [sentence.split(" ") for sentence in X]
        # Build the vocabulary and train the word2vec model.
        # NOTE: ``size`` is the gensim 3.x keyword; gensim 4 renamed it to
        # ``vector_size``.
        model = Word2Vec(size=self.size, window=self.window,
                         min_count=self.min_count, workers=self.workers)
        model.build_vocab(tokenized_sentences)
        model.train(tokenized_sentences,
                    total_examples=len(tokenized_sentences),
                    epochs=self.epochs)

        # Keep the trained model on the instance and return self.
        self.model = model
        return self

    def transform(self, X):
        """
        Convert each text of ``X`` into the vector of one of its tokens.

        The stored model is first updated with the new sentences (vocabulary
        extension plus further training), so calling ``transform`` modifies
        ``self.model``.

        Parameters
        ----------
        X : iterable of str
            Sentences; each one is tokenized by splitting on a single space.

        Returns
        -------
        numpy.ndarray
            Matrix with one row per text of ``X``, as produced by
            ``_get_word2vec_matrix``.
        """
        # Update the embedding with the new sentences. ``model_updated`` is
        # the same object as ``self.model``, not a copy.
        model_updated = self.model
        tokenized_new_sentences = [new_sentence.split(" ") for new_sentence in X]
        model_updated.build_vocab(tokenized_new_sentences, update=True)
        model_updated.train(tokenized_new_sentences,
                            total_examples=len(tokenized_new_sentences),
                            epochs=self.epochs)
        # Extract one token per text.
        extracted_pii_list = [_find_part_pii(text, model=model_updated)
                              for text in tqdm(X)]

        # Convert the extracted tokens into vectors.
        pii_matrix = _get_word2vec_matrix(pii_list=extracted_pii_list,
                                          model=model_updated)
        return pii_matrix
                                          

In [6]:
class doc2vec_embed:
    """Empty placeholder; judging by its name, meant for a doc2vec embedding step (not implemented)."""

In [13]:
class fasttext_embed:
    """Empty placeholder; judging by its name, meant for a fastText embedding step (not implemented)."""

In [7]:
from sklearn.pipeline import Pipeline


from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier used as the final pipeline step.
# class_weight gives class 0 (no PII) eight times the weight of class 1 (PII).
logit_clf = LogisticRegression(
    solver="lbfgs",
    max_iter=10000,
    class_weight={0: 0.8, 1: 0.1},
)

In [8]:
# End-to-end pipeline: clean the raw text, embed it with word2vec, then
# classify the resulting vectors.
model = Pipeline(
    steps=[
        ("text_cleaning", text_clean()),
        ("word2vec", word2vec_embed()),
        ("logit", logit_clf),
    ]
)

In [9]:
%%time
# Fit the whole pipeline on the raw training text and the "Target" column.
model.fit(train_data["Text"],train_data["Target"])

100%|██████████| 800000/800000 [01:17<00:00, 10339.59it/s]
100%|██████████| 800000/800000 [00:02<00:00, 279078.62it/s]


CPU times: user 6min 17s, sys: 11.3 s, total: 6min 28s
Wall time: 6min 4s


Pipeline(memory=None,
     steps=[('text_cleaning', <__main__.text_clean object at 0x7f30e6591c88>), ('word2vec', <__main__.word2vec_embed object at 0x7f30e6591fd0>), ('logit', LogisticRegression(C=1.0, class_weight={0: 0.8, 1: 0.1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=10000,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))])

In [10]:
%%time 
# Predict on the unseen test text. Note that word2vec_embed.transform keeps
# training the embedding on these sentences before extracting the vectors.
binary_pred = model.predict(test_data["Text"])
# Ground-truth binary target used by the evaluation cells.
binary_true = test_data["Target"]

100%|██████████| 80000/80000 [00:09<00:00, 8826.04it/s] 
100%|██████████| 80000/80000 [00:00<00:00, 202371.15it/s]


CPU times: user 24.8 s, sys: 433 ms, total: 25.2 s
Wall time: 25 s


In [11]:
%%time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the binary predictions on the test set
# (rows: true class, columns: predicted class). The explicit Axes interface
# keeps the figure self-contained, and the title lets it stand alone.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    confusion_matrix(y_true=binary_true, y_pred=binary_pred),
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title("Confusion matrix on the unseen test data")
ax.set_xlabel("Predicted values")
ax.set_ylabel("True values");

CPU times: user 368 ms, sys: 141 ms, total: 509 ms
Wall time: 319 ms


In [12]:
%%time
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the binary PII prediction.
print(classification_report(y_true = binary_true, y_pred = binary_pred))

              precision    recall  f1-score   support

           0       0.63      0.99      0.77     10000
           1       1.00      0.92      0.96     70000

   micro avg       0.92      0.92      0.92     80000
   macro avg       0.81      0.95      0.86     80000
weighted avg       0.95      0.92      0.93     80000

CPU times: user 72.9 ms, sys: 88.3 ms, total: 161 ms
Wall time: 57.6 ms


The performance was quite good for a first release.