In [5]:
#using multi-processing, 16 procs, direct csv.writerow()
#TODO make threadsafe

import os
import multiprocessing as mp
from contextlib import closing
import csv
import time
import collections

from faker import Faker
from faker.providers import ssn
from faker.providers import phone_number
from faker.providers import address
from faker.providers import geo
from faker.providers import profile
from faker.providers import internet
from faker.providers import lorem
from faker.providers import credit_card
from faker.providers import user_agent
from faker.providers import misc

f = Faker('en-US')

f.add_provider(ssn)
f.add_provider(phone_number)
f.add_provider(address)
f.add_provider(geo)
f.add_provider(profile)
f.add_provider(internet)
f.add_provider(lorem)
f.add_provider(credit_card)
f.add_provider(user_agent)
f.add_provider(misc)

filepath = 'donor.csv'

# get available procs
procs_allowed = os.cpu_count()
print('allowed processors for mp: ', procs_allowed)

# Define an output queue
output = mp.Queue()

def fake_donor(pos,output):
    name = f.name()
    address = f.street_address()
    geo = list(f.local_latlng(country_code='US', coords_only=False))
    geo_dict= dict({'lat': geo[0], 'lon': geo[1], 'city': geo[2], 'country':geo[3],'timezone':geo[4]})
    #state = f.state_abbr(include_territories=True)
    #zipcode = f.zipcode_in_state(state_abbr=state)
    phone = f.phone_number()

    credit_card_number = f.credit_card_number(card_type=None)
    credit_card_provider = f.credit_card_provider(card_type=None)
    credit_card_expire = f.credit_card_expire(start='now', end='+10y', date_format='%m/%y')
    credit_card_security_code = f.credit_card_security_code(card_type=None)

    ssn=f.ssn(taxpayer_identification_number_type='SSN')
    ip = f.ipv4_private()
    user_agent = f.user_agent()
    #comment = f.text()

    profile = f.profile(fields=None, sex=None)
    profile = dict(profile)

    blood_group = profile['blood_group']
    birthdate = profile['birthdate']
    gender = profile['sex']
    user_name = profile['username']
    email = profile['mail']
    occupation = profile['job']
    employer = profile['company']

    row = collections.OrderedDict({
            #'row_num': i+1,
            'donor_uuid': f.uuid4(str),
            'bleed_uuid': f.uuid4(str),
            'appt_uuid': f.uuid4(str),
            'interaction_uuid': f.uuid4(str),
            'name': name,
            'address': address,

            'city':geo_dict['city'],
            'country':geo_dict['country'],
            'timezone':geo_dict['timezone'],
            'lat':geo_dict['lat'],
            'lon':geo_dict['lon'],
            'phone': phone,

            'credit_card_number': credit_card_number,
            'credit_card_provider': credit_card_provider,
            'credit_card_expire': credit_card_expire,
            'credit_card_security_code':credit_card_security_code,

            'ssn': ssn,
            'occupation': occupation,
            'employer': employer,
            'gender': gender,
            'birthdate': birthdate,
            'blood_group': blood_group,
            'user_name': user_name,
            'email': email,
            'ip': ip,
            'user_agent': user_agent
        })

    output.put((pos,row))
    
def multi_process(procs=1):    
    
    # Setup a list of processes that we want to run
    processes = [mp.Process(target=fake_donor, args=(x,output)) for x in range(procs)]

    # Run processes
    for p in processes:
        p.start()

    # Exit the completed processes
    for p in processes:
        p.join()

    # Get process results from the output queue
    results = [output.get() for p in processes]
    return results

def process_csv(rows,procs): 
    result = multi_process(1)
    header = list(next(r[1] for r in result).keys())
    print(header)
    print()
    
    with open(filepath,'w') as outfile:

        w = csv.DictWriter(outfile, header)
        w.writeheader()

        for i in range(round(rows/procs)):
            t0 = time.time()
            result = multi_process(procs)
            for row in result:
                w.writerow(row[1])
            print('%s %s %s %s' % ('\r ms: ',round((time.time() - t0)*1000,1),'rows: ', (i*procs)+procs+1), end ="", flush=True)

tjob = time.time() 

process_csv(1000000,16)

print()
print('Done!')
print(round((time.time() - tjob)/60,1), " minutes elapsed time")

allowed processors for mp:  16
['donor_uuid', 'bleed_uuid', 'appt_uuid', 'interaction_uuid', 'name', 'address', 'city', 'country', 'timezone', 'lat', 'lon', 'phone', 'credit_card_number', 'credit_card_provider', 'credit_card_expire', 'credit_card_security_code', 'ssn', 'occupation', 'employer', 'gender', 'birthdate', 'blood_group', 'user_name', 'email', 'ip', 'user_agent']

 ms:  53.1 rows:  1000001
Done!
58.3  minutes elapsed time
