## Imports and Basic Setup

In [2]:
from faker import Faker
import json
from decimal import Decimal
from datetime import date
import random
from shortuuid import uuid

# Default global Faker object. Modify the locale list to add other countries.
fake = Faker(locale=['en-US'])

#### Custom JSON Encoder to write complex data types to file

In [3]:
class SimpleJSONEncoder(json.JSONEncoder):
    """
    JSON encoder that additionally handles Decimal and date values.

    Decimal values are emitted as their string form; date values (and
    datetime values, which subclass date) are emitted as "YYYY-MM-DD"
    strings. Every other type is handed to the stock JSONEncoder.
    """

    def default(self, value):
        """
        Return a JSON-serializable replacement for `value`
        """
        if isinstance(value, Decimal):
            return str(value)
        if isinstance(value, date):
            return value.strftime("%Y-%m-%d")
        # not one of ours: defer to the base implementation
        return super().default(value)

## Simple Profile Generator

In [None]:

class SimplePersonGenerator(object):
    """
    Generates profiles for the simple (flat) person JSON file
    """

    def __init__(self, locale=['en-US']):
        self.profiles = []
        self._fake = Faker(locale=locale)

    def _new_person(self):
        """
        Build a single random person profile as a dict
        """
        fake = self._fake
        uid = uuid()
        # 'M' and 'F' are listed twice so 'NB' is drawn less often
        gender = random.choice(['M', 'M', 'F', 'F', 'NB'])
        # look up only the one name method that matches the drawn gender
        name_method = {'M': 'name_male', 'F': 'name_female'}.get(gender, 'name_nonbinary')
        name = getattr(fake, name_method)()
        return {
            "uid": uid,
            "name": name,
            "gender": gender,
            # e-mail is derived from the name: lower-cased, spaces turned into dots
            "email": name.lower().replace(' ', '.') + "@gmail.com",
            "birthdate": fake.date_of_birth(),
            "salary": round(fake.pyfloat(min_value=70000.00, max_value=168000.00), 2),
            "credit_score": fake.pyint(min_value=500, max_value=800),
            "active": fake.pybool(),
        }

    def generate(self, num=1):
        """
        Generate `num` random person profiles and remember them on the instance.

        Returns the single profile dict when num == 1, otherwise the list of
        profiles created by this call.
        """
        people = [self._new_person() for _ in range(num)]
        # keep everything generated so far for a later to_json() call
        self.profiles.extend(people)
        if num == 1:
            return people[0]
        return people

    def to_json(self, json_file, cls=SimpleJSONEncoder):
        """
        Write every stored profile to `json_file`, one JSON document per line
        """
        for profile in self.profiles:
            # the encoder class takes care of non-native values such as dates
            json_file.write(json.dumps(profile, cls=cls) + "\n")

# build ten simple profiles and write them out as JSON rows
g = SimplePersonGenerator()
g.generate(10)
# the target directory is not created here; open() will fail if ./data is absent
with open("./data/profiles_simple.json", "w") as json_file:
    g.to_json(json_file)

print("done!")

## Complex Profile Generator

- Produce profiles for more complex hierarchical JSON objects
- Introduce inconsistent data including nulls and missing fields

In [None]:
class ComplexPersonGenerator(object):
    """
    Generates complex people profiles:
     - including hierarchical data types: complex fields with sub-fields
     - including inconsistent data: missing fields and values
    """

    def __init__(self, locale=['en-US']):
        self._fake = Faker(locale=locale)
        self.profiles = []
        self.locale = locale

    def generate(self, num=1):
        """
        Generate `num` random person profiles and remember them on the instance.

        About 1 in 20 profiles get a null "uid" and about 1 in 30 have no
        "email" key at all. Each profile carries one or two credit cards.

        Returns the single profile dict when num == 1, otherwise the list of
        profiles created by this call.
        """
        fake = self._fake
        bag = []
        for _ in range(num):
            person = {}
            # decisions to throw bad fields
            missing_uid = random.randint(1, 20) == 1
            # the lower bound must be 1: with randint(2, 30) the draw could
            # never hit 1, so the email field was never actually dropped
            missing_email = random.randint(1, 30) == 1
            # generate data
            person["uid"] = None if missing_uid else uuid()
            gender = random.choice(['M', 'F'])
            if gender == 'M':
                person["name"] = fake.name_male()
            else:
                person["name"] = fake.name_female()
            person["gender"] = gender
            if not missing_email:
                person["email"] = person["name"].lower().replace(' ', '.') + "@gmail.com"
            person["birthdate"] = fake.date_of_birth()
            # optional extra fields (currently disabled)
            # person["job"] = fake.job()
            # person["ssn"] = fake.ssn()
            person["address"] = fake.address()
            # keep only the first two items returned by local_latlng
            person["geo_location"] = fake.local_latlng(country_code='US')[:2]
            person["credit_cards"] = []
            # add credit cards: one card three times out of four, otherwise two
            for _card_index in range(random.choice((1, 1, 1, 2))):
                card = {}
                card["card_type"] = fake.credit_card_provider()
                card["card_number"] = fake.credit_card_number()
                card["exp_date"] = fake.credit_card_expire()
                card["cvc"] = fake.credit_card_security_code()
                person["credit_cards"].append(card)
            # add the person to the bag of people generated
            bag.append(person)
        # add our generated bag to the class profiles
        self.profiles.extend(bag)
        # return either the person or bag of people
        return bag[0] if num == 1 else bag

    def to_json(self, json_file, cls=SimpleJSONEncoder):
        """
        Write all generated profiles to a file as JSON rows
        """
        for person in self.profiles:
            # convert to json string using special encoder for writing datetime formats
            json_str = json.dumps(person, cls=cls)
            # write the json line to file
            json_file.write(f"{json_str}\n")


# build thirty complex profiles and write them out as JSON rows
g = ComplexPersonGenerator()
g.generate(30)
# the target directory is not created here; open() will fail if ./data is absent
with open("./data/profiles_complex.json", "w") as json_file:
    g.to_json(json_file)

## Simple Car Registration

Generate:
- car registration file
- address, date fields

In [31]:
import json
from faker import Faker
from faker_vehicle import VehicleProvider


class SimpleVehicleRegistrationGenerator(object):
    """
    Generates flat vehicle registration records
    """

    def __init__(self, locale=['en_US']):
        faker = Faker(locale=locale)
        # the vehicle provider supplies vehicle_object()
        faker.add_provider(VehicleProvider)
        self._fake = faker
        self.cars = []

    def _new_car(self):
        """
        Build a single random registration record as a dict
        """
        fake = self._fake
        plate = fake.license_plate()
        vehicle = fake.vehicle_object()
        return {
            "license_plate": plate,
            "make_model": ", ".join((vehicle["Make"], vehicle["Model"])),
            "year": vehicle["Year"],
            "color": fake.color_name(),
            # registration happened some time within the last five years
            "registered_date": fake.date_between(start_date="-5y", end_date="today"),
            "registered_name": fake.name(),
            "registered_address": fake.address(),
        }

    def generate(self, num=1):
        """
        Generate `num` registration records and remember them on the instance.

        Returns the single record when num == 1, otherwise the list of
        records created by this call.
        """
        fresh = [self._new_car() for _ in range(num)]
        # add the new records to the cars
        self.cars.extend(fresh)
        if num == 1:
            return fresh[0]
        return fresh

    def to_json(self, json_file, cls=SimpleJSONEncoder):
        """
        Write every stored record to `json_file`, one JSON document per line
        """
        for record in self.cars:
            json_file.write(json.dumps(record, cls=cls) + "\n")


# build ten simple vehicle registrations
g = SimpleVehicleRegistrationGenerator()
g.generate(10)
# write to file (one JSON row per car); ./data must already exist
with open("./data/vehicles_simple.json", "w") as json_file:
    g.to_json(json_file)

## Complex Car Registration Generator

In [14]:
import json
from faker import Faker
from faker_vehicle import VehicleProvider
import random
from datetime import timedelta

def add_cent_to_dollar(dollar_value:int) -> float:
    """
    Attach a random cents part (00-99) to a whole-dollar amount
    """
    random_cents = random.randint(0, 99)
    # go through text so the float is parsed from an exact two-decimal literal
    amount_text = "{}.{:02d}".format(dollar_value, random_cents)
    return float(amount_text)


class ComplexVehicleRegistrationGenerator(object):
    """
    Generates vehicle registration records with a nested sales history and
    deliberately inconsistent data: null fields, very old registration dates,
    truncated or missing addresses and gaps in the ownership chain.
    """

    # upper bound, in days, for the gap between two consecutive sales
    # (the original literal was 356*4, presumably a typo for four years)
    _MAX_SALE_GAP_DAYS = 365 * 4

    def __init__(self, locale=['en_US']):
        fake = Faker(locale=locale)             # faker class
        fake.add_provider(VehicleProvider)      # add vehicle provider
        self._fake = fake
        self.cars = []

    @staticmethod
    def _chance(percent):
        """
        Return True for roughly `percent` out of every 100 calls
        """
        return random.randint(1, 100) <= percent

    def _sales_history(self, registered_name):
        """
        Build between zero and three sale records, newest first.

        The newest sale's new owner is `registered_name` (or a random name
        when that is None). Each older sale is dated earlier and priced higher
        than the one after it.
        """
        fake = self._fake
        num_sales = random.randint(0, 3)
        sales = []
        price = add_cent_to_dollar(random.randint(10000, 40000))
        last_date = fake.date_between(start_date="-3y")
        last_name = registered_name if registered_name is not None else fake.name()
        for i in range(num_sales):
            sale = {}
            sale["new_owner"] = last_name
            last_name = fake.name()
            sale["previous_owner"] = last_name
            # create a gap in sales record for 2% of rows
            if self._chance(2):
                last_name = fake.name()
            # round the running total, not just the increment, so float
            # addition artifacts (e.g. 12000.300000000001) never reach the file
            price = round(price + i * add_cent_to_dollar(random.randint(1000, 3000)), 2)
            last_date -= timedelta(random.randint(100, self._MAX_SALE_GAP_DAYS))
            sale["sale_price"] = price
            sale["sale_date"] = last_date
            sales.append(sale)
        return sales

    def generate(self, num=1):
        """
        Generate `num` registration records and remember them on the instance.

        Returns the single record when num == 1, otherwise the list of
        records created by this call.
        """
        fake = self._fake
        bag = []
        # generate N=num of cars
        for _ in range(num):
            car = {}
            car["license_plate"] = fake.license_plate()
            # license_plate=None for 5% of rows
            if self._chance(5):
                car["license_plate"] = None
            vehicle = fake.vehicle_object()
            car["make_model"] = vehicle["Make"] + ", " + vehicle["Model"]
            car["year"] = vehicle["Year"]
            car["color"] = fake.safe_color_name()
            # color = None for 5% of rows
            if self._chance(5):
                car["color"] = None
            car["registered_date"] = fake.date_between(start_date="-5y", end_date="today")
            # push the registration date back 10+ yrs on 5% of rows
            if self._chance(5):
                car["registered_date"] = fake.date_between(start_date="-30y", end_date="-10y")
            car["registered_name"] = fake.name()
            # name=None for 5% of rows
            if self._chance(5):
                car["registered_name"] = None
            car["registered_address"] = fake.address()
            # remove the second line of address for 5% of rows
            if self._chance(5):
                car["registered_address"] = str(car["registered_address"]).rpartition('\n')[0]
            # delete address for 5% of rows
            if self._chance(5):
                del car["registered_address"]
            # generate sales records
            car["sales_record"] = self._sales_history(car["registered_name"])
            bag.append(car)
        # add the bag to the cars
        self.cars.extend(bag)
        return bag[0] if num == 1 else bag

    def to_json(self, json_file, cls=SimpleJSONEncoder):
        """
        Write all generated records to a file as JSON rows
        """
        for car in self.cars:
            row = json.dumps(car, cls=cls)
            json_file.write(f"{row}\n")


# build one thousand complex vehicle registrations
g = ComplexVehicleRegistrationGenerator()
g.generate(1000)
# write to file
# NOTE(review): this cell writes to "../data/" while the other cells in this
# notebook write to "./data/" — confirm which directory is intended
with open("../data/vehicles_complex.json", "w") as json_file:
    g.to_json(json_file)