## Train the word2vec model with the data from clean_data/

In [1]:
from gensim.models import Word2Vec
import pandas as pd
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
# Load the cleaned training CSV from ../clean_data/ into a DataFrame.
train_csv_path = "../clean_data/Cleaned_train_text_with_pii_2018_12_29_07_26_56_266227.csv"
fake_pii = pd.read_csv(train_csv_path)

In [3]:
# Preview the first rows to check that the CSV columns loaded as expected.
fake_pii.head()

Unnamed: 0,Text,Labels,PII,Cleaned_text
0,Return never place become nearly movement. No ...,Address,"9479 Ward Creek Suite 474 Warnershire, NH 08434",return never place become nearly movement no s...
1,Quality recently seek Suite 937 goal relations...,Address,Suite 937,quality recently seek suite 937 goal relations...
2,Word expert reveal effect response forward. In...,Address,Apt. 925,word expert reveal effect response forward ind...
3,Son specific think. Conference its home expert...,Address,"15286 Ramirez Lakes Apt. 147 West Anthony, FL ...",son specific think conference its home expert ...
4,Control trouble Apt. 223 keep action rule. Sol...,Address,Apt. 223,control trouble apt 223 keep action rule soldi...


In [4]:
def binary_pii(label):
    """Collapse a PII category label into a binary "Yes"/"No" flag.

    Parameters
    ----------
    label : str or missing value
        A value from the ``Labels`` column (e.g. ``"Address"``). The string
        ``"None"`` denotes a text without PII.

    Returns
    -------
    str
        ``"No"`` when ``label`` is the string ``"None"`` or is missing
        (``None`` / NaN / ``pd.NA``); ``"Yes"`` for every other label.
    """
    # Newer pandas releases list "None" among read_csv's default NA strings,
    # so a "None" label can arrive here as NaN. Treat missing values as
    # "no PII"; otherwise every such row would be flagged "Yes".
    if pd.isna(label) or label == "None":
        return "No"
    return "Yes"

In [5]:
# Derive the binary "Yes"/"No" target from the per-row PII category label.
fake_pii["Target"] = fake_pii["Labels"].map(binary_pii)

In [None]:
# Preview the frame again, now including the derived "Target" column.
fake_pii.head()

In [6]:
# Tokenise every cleaned text on single spaces; the result is a list of token
# lists, which is what Word2Vec is trained on below.
context_sentences = [
    cleaned_text.split(" ")
    for cleaned_text in fake_pii["Cleaned_text"]
]

In [7]:
%%time 
word2vec_model = Word2Vec(context_sentences, size = 100, window=5,
                                  min_count = 1, workers = 2)

CPU times: user 2min 22s, sys: 556 ms, total: 2min 22s
Wall time: 1min 16s


In [8]:
from pathlib import Path

# NOTE(review): the file name says "300" although the model in this notebook
# is trained with size=100. The name is kept unchanged so anything that loads
# this path keeps working -- confirm whether it should be renamed.
file_name = "./word2vec/"+"word2vec_cleaned_300_"+".bin"

# Make sure the output directory exists before saving; on a fresh checkout
# ./word2vec/ may be missing and the save would fail.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
word2vec_model.save(file_name)

## Visualize the resulting word2vec model vectors in 2D with PCA.

In [9]:
from yellowbrick.features.pca import PCADecomposition
import matplotlib.pyplot as plt

In [10]:
from word2vec_visualizer import find_part_pii, get_pii2vec_matrix, color_dict

Extract the PII part of each text, since it is the part that differs most from the rest of the text.

In [11]:
from tqdm import tqdm

# Run find_part_pii over every cleaned text (with a progress bar), passing the
# trained word2vec model, and collect one result per row.
pii_list = [
    find_part_pii(cleaned_text, model=word2vec_model)
    for cleaned_text in tqdm(fake_pii["Cleaned_text"])
]

100%|██████████| 800000/800000 [01:20<00:00, 9996.21it/s] 


Use the word2vec model to get the vector representation of the extracted PII.

In [12]:
# Turn the extracted PII parts into a feature matrix using the trained model.
pii_matrix = get_pii2vec_matrix(model=word2vec_model, pii_list=pii_list)

100%|██████████| 800000/800000 [00:02<00:00, 272446.72it/s]


In [None]:
# Colour each point by its PII category: look every label up in color_dict.
pii_labels = fake_pii["Labels"]
colors_map = pii_labels.map(color_dict)

# Project the PII vectors onto two principal components (features are scaled
# first) and draw the scatter plot on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
PCAvisualizer = PCADecomposition(
    scale=True,
    color=colors_map,
    proj_dim=2,
)
PCAvisualizer.fit_transform(pii_matrix, pii_labels)
PCAvisualizer.poof()

## Training some classifiers for classification

### Logistic regression

In [14]:
%%time
%%notify -m "model finished training"
from sklearn.linear_model import LogisticRegression
from scipy.stats import expon
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(LogisticRegression(random_state=0 , max_iter=10000))

param_grid = {
    'logisticregression__C':expon(scale = 2),
    'logisticregression__penalty':['l1','l2']
}

random_cv_lr = RandomizedSearchCV(estimator= pipe,param_distributions = param_grid, cv =10, error_score = 0,
                               n_iter = 10 , scoring = 'f1', return_train_score=True, n_jobs = 2)


random_cv_lr.fit(pii_matrix, fake_pii['Target'])

ValueError: pos_label=1 is not a valid label: array(['No', 'Yes'],
      dtype='<U3')

<IPython.core.display.Javascript object>

CPU times: user 2.62 s, sys: 935 ms, total: 3.56 s
Wall time: 3min 32s


In [15]:
# Predict on the same matrix the search was fitted on, so the metrics computed
# from these arrays are in-sample (training-set) figures.
binary_pred = random_cv_lr.predict(pii_matrix)
binary_true = fake_pii.Target

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
%%time

import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(confusion_matrix(y_true = binary_true, y_pred = binary_pred), annot = True,fmt="d")
plt.xlabel("Predicted values")
plt.ylabel("True values")

In [None]:
%%time
%%notify -m "metric are ready"
print(classification_report(y_true = binary_true, y_pred = binary_pred))

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer

# Seed the forest so repeated runs build the same trees.
pipe = make_pipeline(RandomForestClassifier(random_state=0))

from scipy.stats import randint

# Search space: split criterion and number of trees (integer in [100, 5000)).
param_grid = {
    'randomforestclassifier__criterion':['gini', 'entropy'],
    'randomforestclassifier__n_estimators':randint(100,5000)
}

# The Target column holds the strings "No"/"Yes". The built-in 'f1' scorer
# assumes pos_label=1 and raises ValueError on such targets, so the positive
# class is named explicitly. random_state makes the sampled candidates
# reproducible.
random_cv_rf = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    cv=10,
    error_score=0,
    n_iter=10,
    scoring=make_scorer(f1_score, pos_label="Yes"),
    return_train_score=True,
    n_jobs=2,
    random_state=0,
)

In [None]:
# Run the randomized search for the random-forest pipeline on the PII vectors.
random_cv_rf.fit(X=pii_matrix, y=fake_pii["Target"])

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer


from scipy.stats import randint, uniform

# Build a dedicated XGBoost pipeline. Previously the search reused `pipe`,
# which at this point still held the random-forest pipeline, so none of the
# xgbclassifier__* parameters below matched a pipeline step and XGBClassifier
# was never actually used.
pipe = make_pipeline(XGBClassifier())

# Search space over booster type, tree count, gamma, learning rate and depth;
# nthread is fixed at 2.
param_grid = {
    'xgbclassifier__booster':['gbtree', 'gblinear','dart'],
    'xgbclassifier__n_estimators':randint(100,1000),
    'xgbclassifier__gamma':uniform(0,10),
    "xgbclassifier__learning_rate":uniform(0,1),
    'xgbclassifier__nthread':[2],
    'xgbclassifier__max_depth':randint(1,10)
}

# The Target column holds the strings "No"/"Yes". The built-in 'f1' scorer
# assumes pos_label=1 and raises ValueError on such targets, so the positive
# class is named explicitly. random_state makes the sampled candidates
# reproducible.
# NOTE(review): recent xgboost releases may reject string class labels; if
# the fit complains, encode Target as 0/1 and adjust pos_label -- confirm
# against the installed version.
random_cv_xgboost = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    cv=10,
    error_score=0,
    n_iter=10,
    scoring=make_scorer(f1_score, pos_label="Yes"),
    return_train_score=True,
    n_jobs=2,
    random_state=0,
)

In [None]:
# Run the randomized search for the XGBoost configuration on the PII vectors.
random_cv_xgboost.fit(X=pii_matrix, y=fake_pii["Target"])

# Appendix

In [None]:
import inspect

In [None]:
# Display the label -> colour mapping imported from word2vec_visualizer
# (used above to colour the PCA scatter plot).
color_dict

In [None]:
# Print the source of find_part_pii (imported from word2vec_visualizer) for reference.
print(inspect.getsource(find_part_pii))

In [None]:
# Print the source of get_pii2vec_matrix (imported from word2vec_visualizer) for reference.
print(inspect.getsource(get_pii2vec_matrix))