In [136]:
# Generate CI Data with the Python Faker library

In [137]:
# These are the CSV columns:

# "CustomerId",
# "Address",
# "State",
# "StateProvince",
# "PreferredStore",
# "RewardsPoints",
# "StreetAddress",
# "NameCombined",
# "CountryCombined",
# "CityCombined",
# "PostalCodeCombined",
# "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID",
# "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate",
# "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID",
# "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate"

# As a JSON document, the data looks like this:
# {
#     "CustomerId": "341a24fd6823408a8feabef2f6fdf0cd",
#     "Address": null,
#     "State": "Louisiana",
#     "StateProvince": "Louisiana",
#     "PreferredStore": "698AF567-9BC5-494C-BB68-F585438F1A65",
#     "RewardsPoints": "662",
#     "StreetAddress": null,
#     "NameCombined": "LonWennam",
#     "CountryCombined": "UnitedStates",
#     "CityCombined": "NewOrleans",
#     "PostalCodeCombined": "70160",
#     "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID": "0163EBA0-F00F-4476-AA68-2FBF7CECAA0B",
#     "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate": "0163EBA0-F00F-4476-AA68-2FBF7CECAA0B",
#     "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID": "0E96FAA7-5DAA-4C9F-94D6-9D277C45FBAA",
#     "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate": "0E96FAA7-5DAA-4C9F-94D6-9D277C45FBAA"
# }

In [138]:
# Create an instance of Faker, and import python libraries

import os
import random
import uuid

import pandas as pd
from faker import Faker

fake = Faker()


In [139]:
# Common functions used below

def column_names():
    """Return the ordered list of CSV column names.

    A new list is built on every call, so callers may mutate the result
    without affecting later calls.
    """
    # Descriptive customer attributes, in output order.
    profile_columns = [
        "CustomerId",
        "Address",
        "State",
        "StateProvince",
        "PreferredStore",
        "RewardsPoints",
        "StreetAddress",
        "NameCombined",
        "CountryCombined",
        "CityCombined",
        "PostalCodeCombined",
    ]
    # Source-system identifier columns; each has an "_Alternate" twin.
    source_id_columns = [
        "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID",
        "RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate",
        "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID",
        "RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate",
    ]
    return profile_columns + source_id_columns

def header_row():
    """Return the CSV header line: the column names joined by commas, ending in a newline."""
    names = column_names()
    return "{}\n".format(",".join(names))

def data_row():
    """Return one randomized CSV data line, comma-joined and newline-terminated.

    One value is generated per known column and the values are emitted in
    column_names() order, so the line always has exactly one field per
    column. A column with no generator is written as '?'.

    The generated values follow the sample row documented at the top of the
    notebook:
      * State and StateProvince carry the same state name.
      * CountryCombined is the fixed country, CityCombined is a city name.
      * "Combined" values have their spaces removed.
      * Each "_Alternate" ID column repeats the value of its primary ID.

    Values are joined with plain commas and are not CSV-quoted.
    """
    def new_guid():
        # Upper-case, hyphenated GUID string used by the ID columns.
        return str(uuid.uuid4()).upper()

    state = fake.state()
    retail_id = new_guid()
    survey_id = new_guid()

    generated = {
        # 32 lower-case hex characters, no hyphens.
        'CustomerId': uuid.uuid4().hex,
        # NOTE(review): the sample row shows Address as null; a city name is
        # kept here to preserve the existing output - confirm what is wanted.
        'Address': fake.city(),
        'State': state,
        'StateProvince': state,  # same value as 'State'
        'PreferredStore': new_guid(),
        'RewardsPoints': str(random.randint(0, 10000)),
        'StreetAddress': fake.street_address(),
        'NameCombined': fake.name().replace(' ', ''),
        # Hardcoded for now; do we need Canada & Mexico?
        'CountryCombined': 'UnitedStates',
        'CityCombined': fake.city().replace(' ', ''),
        # TODO - postal code in state
        'PostalCodeCombined': fake.postcode(),
        'RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID': retail_id,
        'RetailDemoData_RetailSystem_RetailCustomer_CustomerRetailID_Alternate': retail_id,
        'RetailDemoData_SurveySystem_SurveyContact_SurveyContactID': survey_id,
        'RetailDemoData_SurveySystem_SurveyContact_SurveyContactID_Alternate': survey_id,
    }

    # Look values up by column name so field order can never drift from the
    # header, even if column_names() is reordered or extended.
    return ",".join(generated.get(name, '?') for name in column_names()) + "\n"


In [140]:
# Create the randomized data.
# Set desired_row_count to the number of data rows wanted; the header row is
# written in addition to them. Default value is 10.
# Output is written to file 'tmp/ci_data.csv'.

desired_row_count = 10
outfile = 'tmp/ci_data.csv'

# Create the output directory if needed so open() does not fail on a fresh
# checkout; fall back to '.' when outfile has no directory part.
os.makedirs(os.path.dirname(outfile) or '.', exist_ok=True)

with open(outfile, "w", newline="\n", encoding="utf-8") as out:
    # Header first, then exactly desired_row_count data rows.
    out.write(header_row())
    for _ in range(desired_row_count):
        out.write(data_row())

print('file written: {}'.format(outfile))
        
    

file written: tmp/ci_data.csv


In [141]:
# Load the generated CSV file into a pandas DataFrame and show a preview.

df = pd.read_csv('tmp/ci_data.csv')

# Keep only the first 8 rows for display.
top = df.iloc[:8]
print(top)

                         CustomerId         Address           State  \
0  c9eef07d4ee745f5984e6d1e9157fa08      Jamesburgh        Illinois   
1  af6b712ac11a46e38d270a8a9e80183f      New Joseph      New Jersey   
2  ca85455076db48d5b98a3eb6ba744ad4    Christyville           Idaho   
3  ecb704bd34fd479e84a3180135353f1e     Briannaport         Alabama   
4  6e40dec1b27546b9aeb4a78070b98a3b     West Rachel         Vermont   
5  b0782c207428494ea7909a8390b462ec     Ricardotown  North Carolina   
6  8340a5673d68423da5ee9ccf91ec7de6    Matthewsfort      California   
7  bd8c4e985e79465991a4ac4fb4dfb379  Russellchester      Washington   

    StateProvince                        PreferredStore  RewardsPoints  \
0        Illinois  84535648-871D-4CCD-94BD-682BF1301230           3666   
1      New Jersey  EB045976-FEE6-4CEE-9119-AC7C6A8F1CC5           3666   
2           Idaho  95AFA080-DFA8-41B1-9F97-F33EFE5156D0           7201   
3         Alabama  7B3DC44B-FF84-44FA-A931-DC9C58AF2BEB         