# A Quick Guide to Generating Fake Data with Pandas
https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/

In [1]:
#!pip install faker

In [1]:
import numpy as np
import pandas as pd
from faker.providers.person.en import Provider
from faker import Faker

ModuleNotFoundError: No module named 'faker'

In [3]:
# Seed Faker so the sample values below are repeatable, then create the
# generator instance that the rest of the notebook refers to as `fake`.
Faker.seed(0)
fake = Faker()
# Print five sample e-mail addresses, one per line.
print(*(fake.email() for _ in range(5)), sep="\n")

achang@example.org
tammy76@example.com
nhoward@example.net
juancampos@example.net
vanessa89@example.org


In [4]:
def random_names(name_type, size):
    """
    Sample person names at random (with replacement) from Faker's name lists.

    name_type: name of the Provider attribute to draw from, either
        "first_names" or "last_names".
    size: output size, forwarded unchanged to np.random.choice.

    Returns the result of np.random.choice (an ndarray when size is an int).
    """
    return np.random.choice(getattr(Provider, name_type), size=size)

In [5]:
def random_fullnames(size):
    """Return a list of `size` full names produced by the module-level `fake`."""
    full_names = []
    for _ in range(size):
        full_names.append(fake.name())
    return full_names

In [6]:
def random_genders(size, p=None):
    """
    Generate n-length ndarray of genders.

    size: output size, forwarded to np.random.choice.
    p: optional probabilities for ("M", "F", "O", ""), in that order.
        May be any sequence or ndarray accepted by np.random.choice.
        When None or empty, the defaults (0.49, 0.49, 0.01, 0.01) are used.

    Returns the array drawn by np.random.choice.
    """
    # Test explicitly for "not provided": `not p` raises ValueError for a
    # multi-element numpy array because its truth value is ambiguous.
    if p is None or len(p) == 0:
        # default probabilities
        p = (0.49, 0.49, 0.01, 0.01)
    gender = ("M", "F", "O", "")
    return np.random.choice(gender, size=size, p=p)

In [7]:
def random_dates(start, end, size):
    """
    Draw `size` random dates at whole-day resolution between start and end.

    start, end: timestamps exposing a nanosecond `.value`
        (e.g. the result of pd.to_datetime on a date string).
    size: number of dates to draw, forwarded to np.random.randint.

    The day numbers come from np.random.randint(start_day, end_day, size),
    so the upper bound follows randint's rules for `high`.
    Adapted from: https://stackoverflow.com/a/50668285
    """
    # `.value` is a Unix timestamp in nanoseconds; floor-dividing by the
    # number of nanoseconds in a day turns it into a day count.
    ns_per_day = 24 * 60 * 60 * 10**9
    first_day, last_day = (stamp.value // ns_per_day for stamp in (start, end))
    day_numbers = np.random.randint(first_day, last_day, size)
    return pd.to_datetime(day_numbers, unit="D")

In [8]:
# Number of records to write to the CSV. 100 keeps the example small, but
# much larger datasets are also generated relatively quickly.
size = 100
df = pd.DataFrame({
    'First': random_names('first_names', size),
    'Last': random_names('last_names', size),
    'Gender': random_genders(size),
    'Birthdate': random_dates(
        start=pd.to_datetime('1940-01-01'),
        end=pd.to_datetime('2008-01-01'),
        size=size,
    ),
    # Alternative: generate dates directly with faker instead of numpy.
    'xdate': [fake.date_between(start_date='-30y', end_date='today') for _ in range(size)],
})
df.index.name = 'id'
df.to_csv('fake-file.csv')

In [9]:
# Show the frame that was just written to fake-file.csv (rendered truncated).
df

Unnamed: 0_level_0,First,Last,Gender,Birthdate,xdate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Andre,Bergstrom,F,1944-11-24,2002-05-01
1,Ottie,Olson,M,1985-01-06,1995-02-20
2,Indiana,Rempel,M,1978-02-14,2016-08-11
3,Mariah,VonRueden,M,1964-09-19,1994-04-15
4,Mayme,Buckridge,M,1974-10-27,2020-09-18
...,...,...,...,...,...
95,Author,Pfeffer,F,1990-02-19,2007-12-29
96,Paulina,Kuhic,F,1949-09-18,1994-02-16
97,Arkie,O'Conner,M,1985-01-25,1994-11-01
98,Isreal,Lynch,F,2002-06-15,2014-11-14


In [10]:
# Ask Faker for a whole profile in one call; the output below is a dict with
# fields such as job, address and birthdate.
fake.profile()

{'job': 'Conservator, furniture',
 'company': 'Mclean Inc',
 'ssn': '717-70-6411',
 'residence': '339 Riley Mission Suite 515\nSouth Brendamouth, ID 32356',
 'current_location': (Decimal('-85.6489225'), Decimal('-34.487601')),
 'blood_group': 'A-',
 'website': ['https://www.simmons-brown.com/', 'http://www.walters.com/'],
 'username': 'myersmitchell',
 'name': 'Chelsea Greer',
 'sex': 'F',
 'address': 'Unit 0903 Box 2173\nDPO AP 08507',
 'mail': 'stephenschristine@yahoo.com',
 'birthdate': datetime.date(1915, 4, 22)}

In [24]:
from tqdm import tqdm
# Caution: generating 100K rows takes some time (~2-3 minutes); this cell
# only generates 10K.
# Rebind `fake` to a new Faker instance; make_people reads this module-level name.
fake = Faker()
def make_people(num):
    """
    Build `num` fake person records as a list of dicts.

    Each record has the keys id, name, email, address, city, dateTime and
    randomInt. Ids start at 1000; progress is reported through tqdm.
    """
    people = []
    for offset in tqdm(range(num)):
        person = {
            'id': offset + 1000,
            'name': fake.name(),
            'email': fake.email(),
            'address': fake.address(),
            'city': fake.city(),
            'dateTime': fake.date_between(start_date='-30y', end_date='today'),
            'randomInt': np.random.randint(0, 10000),
        }
        people.append(person)
    return people
from pathlib import Path

# Build the frame in one go from the list of generated records.
df = pd.DataFrame(make_people(10*1000))
# DataFrame.to_csv does not create missing parent directories, so make sure
# ./data/raw exists before writing (otherwise a fresh checkout fails here).
people_csv = Path('./data/raw/fake_people.csv')
people_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(people_csv, index=False) # do not write index col

100%|██████████| 10000/10000 [00:14<00:00, 669.49it/s]


In [25]:
# Preview the first rows of the people frame built in the previous cell.
df.head()

Unnamed: 0,id,name,email,address,city,dateTime,randomInt
0,1000,David Farrell,bairdmichelle@example.com,"560 Edward Glens Suite 325\nLake Dylanmouth, S...",Colemanburgh,2005-05-20,1494
1,1001,Daniel Scott,michellemccormick@example.net,"672 Denise Glen\nPort Kimberlyland, ME 03370",Port Jody,1996-05-23,8577
2,1002,Heather Thompson,wjames@example.com,"8537 Juan Mountains\nPort Richard, WA 72950",South Stephanieshire,2019-01-17,4966
3,1003,Dr. Scott Morgan DDS,uburch@example.com,"66801 Cunningham Circle\nLake Joshua, VA 33126",Danielberg,1997-09-16,5824
4,1004,Jennifer Torres,randall51@example.net,"PSC 0139, Box 0456\nAPO AE 09875",Lake Jessica,2007-02-11,6989


inspired by https://towardsdatascience.com/generation-of-large-csv-data-using-python-faker-8cfcbedca7a7

Here we do not build a DataFrame in memory; each row is written straight to the CSV file. However, it still takes about 16 minutes for 500K records.

In [26]:
from tqdm import tqdm
import csv
def datagenerate(records, headers, path="People_data.csv"):
    """
    Stream fake person rows straight to a CSV file, one row at a time.

    records: number of rows to write; ids run from 1000 upwards.
    headers: CSV field names passed to csv.DictWriter. Each row is written
        with the keys id, name, email, address, city, dateTime and randomInt,
        so headers is expected to contain all of them.
    path: output file; defaults to "People_data.csv" in the working directory.

    Returns None. Progress is reported through tqdm.
    """
    fake = Faker('en_US')
    # The csv module requires files to be opened with newline=''; without it
    # embedded newlines (fake.address() contains "\n") and line endings on
    # some platforms are not written correctly.
    with open(path, 'wt', newline='') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=headers)
        writer.writeheader()
        for i in tqdm(range(records)):
            writer.writerow({
                'id': i + 1000,
                'name': fake.name(),
                'email': fake.email(),
                'address': fake.address(),
                'city': fake.city(),
                'dateTime': fake.date_between(start_date='-30y', end_date='today'),
                'randomInt': np.random.randint(0, 10000),
            })
            
# Write 500k rows; the column order in the file follows `headers`.
records = 500 * 1000
headers = [
    "id",
    "name",
    "email",
    "address",
    "city",
    "dateTime",
    "randomInt",
]
datagenerate(records=records, headers=headers)

100%|██████████| 500000/500000 [17:23<00:00, 479.01it/s] 
100%|██████████| 500000/500000 [19:15<00:00, 432.90it/s] 
100%|██████████| 500000/500000 [18:28<00:00, 450.87it/s]  
100%|██████████| 500000/500000 [15:36<00:00, 533.68it/s]
 67%|██████▋   | 332621/500000 [10:13<05:08, 542.13it/s]


KeyboardInterrupt: 