The goal of this notebook is to generate fake PII data and mix it into blog- or news-style text to build training data.

The generated PII types are: SSN, name (first or last name), address (street address or full address), email (personal or company), phone number, credit card number, and birthday.

In [1]:
from faker import Faker
from numpy.random import choice as choose
from numpy.random import seed as setseed

from tqdm import tqdm

from datetime import datetime

import pandas as pd



def sep_change(text, init_sep, after_sep):
    """
    Swap one separator for another inside a string.

    The input text is cut into pieces wherever init_sep occurs, and the
    pieces are glued back together with after_sep between them. The
    resulting string is returned; the input is not modified.
    """
    return after_sep.join(text.split(init_sep))

def convert_datetime_underscore(data):
    """
    Return str(data) with every "-", " ", ":" and "." turned into "_".

    Meant for values such as datetime.now(), so that the resulting string
    can be embedded in a file name.
    """
    underscored = str(data)

    # Replace the separators one at a time, in a fixed order.
    for separator in ("-", " ", ":", "."):
        underscored = sep_change(underscored, init_sep = separator, after_sep = "_")

    return underscored
    
def _random_sep_change(data, init_sep = "-", after_sep = " ", percentage = 0.5 , seed = 7):
    """
    Randomly change the separator of a fraction of the strings in data.

    Parameters:
        data: a list of strings (e.g. SSNs). It is modified IN PLACE and
            the same list object is returned.
        init_sep: the separator to look for in the selected entries.
        after_sep: the separator written in place of init_sep.
        percentage: fraction of the entries to rewrite. The number of
            rewritten entries is int(len(data) * percentage), capped at
            len(data).
        seed: value passed to numpy's global seed function before sampling.
            Note that this reseeds numpy's global random state.

    Returns:
        The input list, with the selected entries rewritten.
    """
    setseed(seed)

    n_items = len(data)
    # Cap the count so that percentage > 1 keeps working now that the
    # indexes are drawn without replacement.
    n_to_change = min(n_items, int(n_items * percentage))

    # replace=False draws DISTINCT indexes. Sampling with replacement can
    # pick the same entry several times, so fewer than the requested
    # fraction of entries would actually be changed.
    replacing_indexes = choose(n_items, n_to_change, replace=False)

    for each_replacing_index in replacing_indexes:
        # change the selected entry's separator from init_sep to after_sep
        data[each_replacing_index] = sep_change(data[each_replacing_index], init_sep, after_sep)

    return data
        

    
    

class Fake_PII():
    '''
    A class to generate a number of fake profiles, training/testing text mixed with
    different types of fake PIIs.

    Typical use:
        1. create_fake_profile(n) builds a dictionary of fake PII values.
        2. create_pii_text_train(n_text) / create_pii_text_test(n_text) insert
           values from that dictionary into generated paragraphs and return
           (labels, texts, PII values).
    '''
    def __init__(self, n_profile = None,fake_profiles = None, seed = 7,\
                pii_with_text = None, pii_labels = None, PII = None):
        # Initialize the Faker from the faker package for fake data generation.
        # No try/except here: a failure should surface immediately instead of
        # leaving the instance without a usable self.faker.
        self.faker = Faker()

        # number of profiles requested in create_fake_profile
        self.n_profile = n_profile
        # list of texts (with PII inserted), filled by the create_pii_text_* methods
        self.pii_with_text = pii_with_text
        # list of PII type labels, one per text
        self.pii_labels = pii_labels
        # dict mapping PII type -> list of fake values
        self.fake_profiles = fake_profiles
        # stored only; none of the methods in this class read it
        self.seed = seed
        # list of the PII value inserted into each text ("None" when no PII)
        self.PII = PII

    def create_fake_profile(self, n_profile, verbose = False, ssn_sep_change = True):
        """
        Generate n_profile fake values for each supported PII type and store
        them in self.fake_profiles.

        Parameters:
            n_profile: int, number of values generated per PII type.
            verbose: when True, the generated dictionary is also returned.
            ssn_sep_change: when True, the separator of part of the SSN
                values is changed by _random_sep_change (default arguments).

        Returns:
            The fake profile dictionary if verbose is True, otherwise None.
        """
        assert isinstance(n_profile, int), \
            "Please enter an integer for the number of generated profiles."
        self.n_profile = n_profile

        fake_profiles = dict()
        # each entry is picked at random among a freshly generated
        # full name, last name and first name.
        fake_profiles["Name"] = [choose([self.faker.name(),
                                         self.faker.last_name(),
                                         self.faker.first_name()])
                                 for _ in range(self.n_profile)]
        # each entry is picked at random among a freshly generated
        # full address, street address and secondary address.
        fake_profiles["Address"] = [choose([self.faker.address(),
                                            self.faker.street_address(),
                                            self.faker.secondary_address()])
                                    for _ in range(self.n_profile)]

        fake_profiles["SSN"] = [self.faker.ssn() for _ in range(self.n_profile)]

        fake_profiles["Email"] = [self.faker.email() for _ in range(self.n_profile)]

        fake_profiles["Plates"] = [self.faker.license_plate()
                                   for _ in range(self.n_profile)]

        fake_profiles["CreditCardNumber"] = [self.faker.credit_card_number()
                                             for _ in range(self.n_profile)]

        fake_profiles["Phone_number"] = [self.faker.phone_number()
                                         for _ in range(self.n_profile)]

        # change the separator in part of the SSN data.
        if ssn_sep_change:
            fake_profiles["SSN"] = _random_sep_change(fake_profiles["SSN"])

        # change the separator in Address data from a newline to " "
        fake_profiles['Address'] = [sep_change(each_address, init_sep = "\n" , after_sep = " ")
                                    for each_address in fake_profiles['Address']]

        self.fake_profiles = fake_profiles

        if verbose:
            return self.fake_profiles

    def _init_label_lists(self):
        # Shared setup for the train and test generators: builds the list of
        # PII types, the "None" labels and the sorted per-text PII labels.

        # all PII types implemented in the create_fake_profile method
        self._fake_labels = list(self.fake_profiles.keys())

        # the labels for the texts that carry no PII
        self._none_pii_labels = ["None" for _ in range(self._n_text)]

        # one label per (PII type, text) pair, grouped by PII type
        self.pii_labels = sorted(self._fake_labels*self._n_text)

    def _init_pii_gen_train(self):
        # Prepare the state used by _random_pii_insert for the training set,
        # where the same n_text paragraphs are reused for every PII type.
        self._init_label_lists()

        # generate the text with no pii
        self._fake_text_no_pii = [self.faker.paragraph() for _ in range(self._n_text)]

        # repeat the no pii text once per PII type
        self._init_fake_text_no_pii = self._fake_text_no_pii*len(self._fake_labels)

        # initialize the text mixed with PII with all "None" strings.
        self.pii_with_text = ["None" for _ in range(len(self._fake_labels)*(self._n_text))]

        # initialize the PII with all "None" strings; the extra n_text entries
        # belong to the no-PII texts appended in create_pii_text_train.
        self.PII = ["None" for _ in range(self._n_text*(len(self._fake_labels)+1))]

    def _random_pii_insert(self):
        # Insert one randomly chosen PII value into each prepared text, at a
        # random position between the space-separated tokens.
        for index, pii_type in enumerate(tqdm(self.pii_labels)):
            # choose a PII value from the dictionary according to the PII type.
            pii_value = choose(self.fake_profiles[pii_type])

            tokens = self._init_fake_text_no_pii[index].split(" ")

            # position to fill in the PII value; len(tokens) means "append".
            pii_position = choose(range(len(tokens)+1))

            tokens.insert(pii_position, pii_value)

            self.pii_with_text[index] = " ".join(tokens)
            self.PII[index] = pii_value

    def create_pii_text_train(self, n_text = 10):
        """
        A method to create the training text randomly mixed with fake PII. This
        method creates n_text texts and mixes each of them with every kind of PII,
        which leads to (num_of_PII)*(n_text) rows with PII. The n_text original
        texts are then appended with the label "None" and the PII value "None".

        Returns:
            (pii_labels, pii_with_text, PII): three lists of equal length.
        """
        warning_text = "Please create fake profiles first with .create_fake_profile method."
        assert self.fake_profiles is not None, warning_text

        self._n_text = n_text
        # initialize the variables used for inserting PII values
        self._init_pii_gen_train()

        # randomly insert PII values into the paragraphs.
        self._random_pii_insert()

        # append the texts without PII together with their "None" labels.
        self.pii_with_text.extend(self._fake_text_no_pii)
        self.pii_labels.extend(self._none_pii_labels)

        return self.pii_labels, self.pii_with_text, self.PII

    def _init_pii_gen_test(self):
        # Prepare the state used by _random_pii_insert for the test set,
        # where every row gets its own freshly generated paragraph.
        self._init_label_lists()

        total_num_pii_text = (1+len(self._fake_labels))*(self._n_text)
        # generate the fake text with no pii
        self._init_fake_text_no_pii = [self.faker.paragraph() for _ in range(total_num_pii_text)]

        # the output texts start as the plain paragraphs (same list object);
        # the last n_text of them are never touched by _random_pii_insert.
        self.pii_with_text = self._init_fake_text_no_pii
        # initialize the PII with all "None" strings
        self.PII = ["None" for _ in range(total_num_pii_text)]

    def create_pii_text_test(self, n_text = 10):
        """
        A method to create the testing text randomly mixed with fake PII.
        This method creates a text and mixes it with one type of PII.

        In the training text, a normal text is reused to insert different PIIs into
        it. In the testing text, a normal text is not intentionally reused to insert
        different PIIs.

        Returns:
            (pii_labels, pii_with_text, PII): three lists of equal length; the
            last n_text rows carry no PII and are labelled "None".
        """
        warning_text = "Please create fake profiles first with .create_fake_profile method."
        assert self.fake_profiles is not None, warning_text

        self._n_text = n_text

        # initialize the variables used for inserting PII values
        self._init_pii_gen_test()

        # randomly insert PII values into the paragraphs.
        self._random_pii_insert()
        # add the none labels
        self.pii_labels.extend(self._none_pii_labels)

        return self.pii_labels, self.pii_with_text, self.PII
        
    

In [2]:
# Build 10 fake profiles, then derive a training set and a test set
# (10 texts each) from the same profiles.
fake_ = Fake_PII()
fake_.create_fake_profile(n_profile = 10)
train_labels, train_text, train_PII = fake_.create_pii_text_train(10)
test_labels, test_text, test_PII = fake_.create_pii_text_test(10)

100%|██████████| 70/70 [00:00<00:00, 35023.41it/s]
100%|██████████| 70/70 [00:00<00:00, 37010.12it/s]


In [3]:
# Save the training set to a CSV file whose name carries the current timestamp.
train_text_with_pii = pd.DataFrame({"Text":train_text, "Labels":train_labels, "PII":train_PII})
train_file_name = "train_text_with_pii_" + convert_datetime_underscore(datetime.now()) + ".csv"
train_text_with_pii.to_csv(train_file_name,index=False)

# Save the test set the same way. "Text" must hold the paragraphs and "Labels"
# the PII types, so that both files share the same column meaning.
test_text_with_pii = pd.DataFrame({"Text":test_text, "Labels":test_labels, "PII":test_PII})
test_file_name = "test_text_with_pii_" + convert_datetime_underscore(datetime.now()) + ".csv"
test_text_with_pii.to_csv(test_file_name,index=False)



In [4]:
# Display the generated training texts (a notebook shows the value of a cell's last expression).
train_text

['Maybe site begin pass relationship natural style. Suite 927 Full actually arrive house someone per understand.',
 'Have specific top already under part avoid. 25468 Matthew Village Theresaland, AR 06964 Sit state south film east.',
 'Ahead bank several nothing product chair. Per probably eat 6958 Julie Harbors Apt. 486 take kid measure.',
 'Instead many appear beautiful now. Within candidate 6958 Julie Harbors Apt. 486 great alone task general ever. Indeed like bar.',
 'Analysis our policy produce 56342 William Isle direction morning. Trade energy newspaper price. Too he five daughter.',
 'Card point particularly table imagine exactly. Office 7047 Raymond Stream Suite 912 Port Stephen, RI 57413 identify civil sign rich form wish. Stay type to under somebody include individual ability.',
 'When feeling hair fund bill. Let card team meeting already start. 7047 Raymond Stream Suite 912 Port Stephen, RI 57413 Impact wear single start respond.',
 'Subject but study language toward. Indust

In [5]:
# Display the generated test texts (a notebook shows the value of a cell's last expression).
test_text

['Friend his those run probably seek our. Trial same century conference matter 6958 Julie Harbors Apt. 486 economy fight. Yard others something.',
 'Sometimes myself only detail own. Occur his Suite 772 stock peace. From including down attorney special relationship.',
 'Everybody machine through simple will. Live wonder former identify. 3576 Davis Extension Suite 359',
 'Fact close 56342 William Isle get machine election section. Go authority attention.',
 'Stock feeling life mean. Certain 7047 Raymond Stream Suite 912 Port Stephen, RI 57413 various research administration upon another matter.',
 'Apt. 893 Behavior medical plan full. Scene dinner its know her.',
 'None health practice she old especially. Continue 3576 Davis Extension Suite 359 evening base bad. Clearly cost read.',
 'Away happen shake region. Standard middle indeed answer. Method organization become red feel. 3268 Teresa Plaza East Cassandra, HI 22259',
 'Off upon blue west 6958 Julie Harbors Apt. 486 great capital con