The goal of this notebook is to create some fake PII data and merge it with some blog or news for training. 

The generated PII type is SSN, Name (first or last name), address (street address or full address), email (personal or company), phone_number, credit card number, birthday.   

In [1]:
from faker import Faker
from numpy.random import choice as choose
from numpy.random import seed as setseed

from tqdm import tqdm

from datetime import datetime

import pandas as pd



def sep_change(text, init_sep, after_sep):
    """
    A function to separate the input text into strings by the init_sep variable
    
    and combine the strings again by the after_sep variable.
    """
    splitted_strings = text.split(init_sep)
    
    combined_strings = after_sep.join(splitted_strings)
    
    return combined_strings

def convert_datetime_underscore(data):
    """
    A Helper function to convert all separators in datetime.now() into underscore. 
    """
    now_string = str(data)
    
    now_string = sep_change(now_string, init_sep = "-", after_sep = "_")
    
    now_string = sep_change(now_string, init_sep = " ", after_sep = "_")
    
    now_string = sep_change(now_string, init_sep = ":", after_sep = "_")
    
    now_string = sep_change(now_string, init_sep = ".", after_sep = "_")
    
    return now_string
    
def _random_sep_change(data, init_sep = "-", after_sep = " ", percentage = 0.5 , seed = 7):
    """
    A function to randomly change the SSN data's separator. 
    
    The input data is a list.
    """
    setseed(seed)
    # generate the index for replacing separator.
    replacing_indexes = choose(range(len(data)), int(len(data)*percentage))
    
    for each_replacing_index in replacing_indexes:
        # change the ssn data's separator from init_sep to after_sep
        data[each_replacing_index] = sep_change(data[each_replacing_index], init_sep, after_sep)
        
    return data
        

    
    

class Fake_PII():
    '''
    A class to generate a number of fake profiles. 
    '''
    def __init__(self, n_profile,fake_profiles = None, seed = 7,\
                pii_text = None, pii_labels = None):

            
        assert isinstance(n_profile, int), "Please enter an integer\
        for the number of generated profiles."
        
        # initialize the Faker from faker package for fake data generation.
        try:
            self.faker = Faker()
        except ImportError as error:
            print(error.__class__.__name__ + ": " + error.message)
            

        self.n_profile = n_profile
        self.pii_text = pii_text
        self.pii_labels = pii_labels
        self.fake_profiles = fake_profiles
        self.seed = seed 
 
        
    def create_fake_profile(self, verbose = False, ssn_sep_change = True):
        
        fake_profiles = dict()
        # use faker package to generate either a full/last name/first name.
        fake_profiles["Name"] = [choose([self.faker.name(),\
                                         self.faker.last_name(),
                                         self.faker.first_name()])\
                                 for _ in range(self.n_profile)]
        # use faker to generate either a full/secondary/street address
        fake_profiles["Address"] = [choose([self.faker.address(),\
                                            self.faker.street_address(),\
                                            self.faker.secondary_address()])\
                                   for _ in range(self.n_profile)]
        
        fake_profiles["SSN"] = [self.faker.ssn() for _ in range(self.n_profile)]
        
        fake_profiles["Email"] = [self.faker.email() for _ in range(self.n_profile)]
                                 
        fake_profiles["Plates"] = [self.faker.license_plate()\
                                   for _ in range(self.n_profile)]
                                        
        
        fake_profiles["CreditCardNumber"] = [self.faker.credit_card_number()\
                                             for _ in range(self.n_profile)]
                                     
        
        fake_profiles["Phone_number"] = [self.faker.phone_number()\
                                         for _ in range(self.n_profile)]
        
        # change the separator in SSN data.
        if ssn_sep_change:
            fake_profiles["SSN"] = _random_sep_change(fake_profiles["SSN"])
                                        
        # change the separator in Address data from "/n" to " "
        fake_profiles['Address'] = [sep_change(each_address,init_sep = "/n" , after_sep = " ")\
                                   for each_address in fake_profiles['Address'] ]
                                              
        
        
        self.fake_profiles = fake_profiles
        
        if verbose:
            return self.fake_profiles 
        
    def _init_pii_gen(self):
        
        # generate the all possible PII implemented in the create_fake_profile methods
        self._fake_labels = list(self.fake_profiles.keys())
        # generate the None labels 
        self._none_pii_labels = ["None" for _ in range(self._n_text)]
        # generate the pii labels
        self.pii_labels = sorted(self._fake_labels*self._n_text)
        # generate the test with no pii
        self._fake_text_no_pii = [self.faker.paragraph() for _ in range(self._n_text)]
        # mutiply the no pii text with the number of PII types
        self._init_fake_text_no_pii = self._fake_text_no_pii*len(self._fake_labels)
        # initialize the text mixed with PII with all "None" strings. 
        self.pii_text = ["None" for _ in range(self._n_text*len(self._fake_labels))]
        
    
    def create_pii_text(self, n_text = 10):
        """
        A method to create text randomly mixed with fake PII
        """
        warning_text = "Please create fake profiles first with .create_fake_profile method."
        assert self.fake_profiles is not None, warning_text
        
        self._n_text = n_text
        # initialized a few variables for inserting PII values 
        self._init_pii_gen()
        
        # insert PII into the text
        for index, PII in enumerate(tqdm(self.pii_labels)):
            # choose a PII value from the dictionary according to the PII type.
            PII_value = choose(self.fake_profiles[PII])
            
            orignal_fake_text = self._init_fake_text_no_pii[index]
            
            tokenized_fake_text = orignal_fake_text.split(" ")
            
            # generate the position to fill in the PII value
            PII_position = choose(range(len(tokenized_fake_text)+1))
            
            tokenized_fake_text.insert(PII_position, PII_value)
            
            one_text_mixed_with_PII = " ".join(tokenized_fake_text)
            
            self.pii_text[index] = one_text_mixed_with_PII
        
        
        self.pii_text.extend(self._fake_text_no_pii)
        self.pii_labels.extend(self._none_pii_labels)
        
   
        
        return self.pii_labels, self.pii_text
    

In [2]:
fake_ = Fake_PII(100000)

In [3]:
fake_.create_fake_profile()

In [None]:
labels, text = fake_.create_pii_text(n_text = 100000)

 57%|█████▋    | 401748/700000 [1:24:35<38:19, 129.68it/s] 

In [None]:
len(labels) 

In [None]:
len(text)

In [None]:
text_with_pii = pd.DataFrame({"Text":text, "Labels":labels})

In [None]:
file_name = "text_with_pii_" + convert_datetime_underscore(datetime.now()) + ".csv"
text_with_pii.to_csv(file_name,index=False)