In [7]:
# Generate CI Data with the Python Faker library

In [8]:
# These are the CSV columns:

# "CustomerId",
# "Address",
# "State",
# "StateProvince",
# "PreferredStore",
# "RewardsPoints",
# "StreetAddress",
# "NameCombined",
# "CountryCombined",
# "CityCombined",
# "PostalCodeCombined",
# "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID",
# "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate",
# "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID",
# "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate"

# As a JSON document, the data looks like this:
# {
#     "CustomerId": "341a24fd6823408a8feabef2f6fdf0cd",
#     "Address": null,
#     "State": "Louisiana",
#     "StateProvince": "Louisiana",
#     "PreferredStore": "698AF567-9BC5-494C-BB68-F585438F1A65",
#     "RewardsPoints": "662",
#     "StreetAddress": null,
#     "NameCombined": "LonWennam",
#     "CountryCombined": "UnitedStates",
#     "CityCombined": "NewOrleans",
#     "PostalCodeCombined": "70160",
#     "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID": "0163EBA0-F00F-4476-AA68-2FBF7CECAA0B",
#     "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate": "0163EBA0-F00F-4476-AA68-2FBF7CECAA0B",
#     "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID": "0E96FAA7-5DAA-4C9F-94D6-9D277C45FBAA",
#     "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate": "0E96FAA7-5DAA-4C9F-94D6-9D277C45FBAA"
# }

In [9]:
# Create an instance of Faker, and import python libraries

import os
import random
import uuid

import pandas as pd
from faker import Faker

fake = Faker()


In [10]:
# Common functions used below

def column_names():
    """Return the ordered list of CSV column names for the customer file.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.
    """
    # Plain customer attributes, in output order.
    names = [
        "CustomerId",
        "Address",
        "State",
        "StateProvince",
        "PreferredStore",
        "RewardsPoints",
        "StreetAddress",
        "NameCombined",
        "CountryCombined",
        "CityCombined",
        "PostalCodeCombined",
    ]

    # Each source-system ID column is followed by its "_Alternate" twin.
    source_id_columns = (
        "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID",
        "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID",
    )
    for id_column in source_id_columns:
        names.append(id_column)
        names.append(id_column + "_Alternate")

    return names

def header_row():
    """Return the CSV header line: the column names joined by commas, newline-terminated."""
    joined_names = ",".join(column_names())
    return "{}\n".format(joined_names)

def data_row():
    """Build one CSV data line of randomized customer values.

    One value is generated per column and emitted in the order given by
    column_names(). Any column this function has no generator for is
    written as '?'.

    Returns:
        str: the comma-joined values terminated by a newline. Values are
        joined verbatim (no CSV quoting), so a generated value that contains
        a comma would shift the columns of that row.
    """
    # Values shared by more than one column are generated once up front.
    state = fake.state()
    retail_id = str(uuid.uuid4()).upper()
    survey_id = str(uuid.uuid4()).upper()

    # Keyed by column name so the output cannot drift out of alignment if
    # column_names() is reordered.
    generated = {
        'CustomerId': uuid.uuid4().hex,  # 32 lowercase hex chars, no dashes
        # NOTE(review): filled with a city name; the sample record in this
        # notebook shows Address as null - confirm the intended content.
        'Address': fake.city(),
        'State': state,
        'StateProvince': state,  # 'StateProvince' is the same value as 'State'
        'PreferredStore': str(uuid.uuid4()).upper(),
        'RewardsPoints': str(random.randint(0, 10000)),
        'StreetAddress': fake.street_address(),
        'NameCombined': fake.name().replace(' ', ''),
        'CountryCombined': 'UnitedStates',  # hardcode for now, do we need Canada & Mexico?
        # Spaces removed like NameCombined (sample value: "NewOrleans").
        'CityCombined': fake.city().replace(' ', ''),
        'PostalCodeCombined': fake.postcode(),  # TODO - postal code in state
        # Each "_Alternate" column repeats its primary ID, as in the sample record.
        'RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID': retail_id,
        'RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate': retail_id,
        'RetailDemoData_SurveySystem_SurveyContact_SurveyContactID': survey_id,
        'RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate': survey_id,
    }

    values = [generated.get(name, '?') for name in column_names()]
    return ",".join(values) + "\n"


In [11]:
# Create the randomized data.
# Set desired_row_count to the number of data rows wanted (the header row is extra).
# Output is written to file 'tmp/customer_data.csv'.

desired_row_count = 1000
outfile = 'tmp/customer_data.csv'

# Create the output directory if needed so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(outfile) or '.', exist_ok=True)

with open(outfile, "w", newline="\n", encoding="utf-8") as out:
    # Write the header once, then exactly desired_row_count data rows.
    # (Previously the header used up one loop iteration, so the file ended up
    # one data row short.)
    out.write(header_row())
    for _ in range(desired_row_count):
        out.write(data_row())

print('file written: {}'.format(outfile))
        
    

file written: tmp/customer_data.csv


In [12]:
# Read the CSV file into a Pandas Dataframe and display it

# Postal codes are identifiers, not numbers: read them as strings so that
# leading zeros (e.g. "02134") are not lost to integer type inference.
df = pd.read_csv('tmp/customer_data.csv', dtype={'PostalCodeCombined': str})

top = df.head(8)  # select the first 8 rows
print(top)

                         CustomerId           Address         State  \
0  3319abafc4d24491826b6cae72c9dc53  Stevensonborough        Alaska   
1  a14dcbedcfb44ad699c779aad56b67ee           Hallton    California   
2  1c76cb9fa0c4475aa90f16d8e9922e3c   East Robertfurt    New Jersey   
3  9020cbc825a5420e9a248bf37cb3860d      New Michelle    California   
4  0bc3cda8b18a4e61a6f0afadb546eabc       East Cheryl      Kentucky   
5  becba43abf4044818d1694e992e4b257          Ryanfurt         Texas   
6  6dbce48bc2d946aeb38f84b073a9a5eb       West Edward  Pennsylvania   
7  1791e61767a34bfbab203d6b06264842         Olsonfort          Iowa   

  StateProvince                        PreferredStore  RewardsPoints  \
0        Alaska  6C73673D-4188-4B4F-977F-0B01A962DDFD           9724   
1    California  061B9DE5-8C26-40FF-AE37-623D14BB313C           9510   
2    New Jersey  2CF32203-192A-489F-ABAE-56831319CDA2           9291   
3    California  118B0EC3-CD19-4957-A242-995D5A802E15           6144   
