# Create a Dataset with Synthetic Errors
If Source is 'C' and the continent of birth is Asia, errors in Last Name are very likely (e.g. due to mixing up first and last names). Place of birth also influences Citizenship, but Citizenship itself doesn't cause errors.

In [1]:
import numpy as np
import pandas as pd
import random
import string

In [2]:
# Countries that can be sampled, grouped by continent.
COUNTRIES = {
    'North America': ['Canada', 'USA'],
    'Europe': ['France', 'Germany', 'Spain', 'Sweden'],
    'Asia': ['China', 'Japan'],
}
# Continent names, in the insertion order of COUNTRIES.
CONTINENTS = list(COUNTRIES)

# Categorical values sampled for each synthetic record.
SOURCES = list('ABCD')
MARITAL_STATUSES = ['Married', 'Single', 'Divorced', 'Widowed']

In [3]:
# Start from an empty frame that fixes the column order of the synthetic records.
record_columns = [
    'Source',
    'Last Name',
    'CountryOfBirth',
    'ContinentOfBirth',
    'CitizenshipCountry',
    'CitizenshipContinent',
    'NumChildren',
    'MaritalStatus',
]
dataset = pd.DataFrame(columns=record_columns)

In [4]:
# Probability that citizenship is copied from the place of birth.
prob_birth_citizen_match = 0.85

# Collect the records in a plain list and build the frame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
records = []
for _ in range(10000):
    source = random.choice(SOURCES)
    # Last names are random 5-letter lowercase strings.
    name = ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
    num_children = random.randint(0, 4)
    marital_status = random.choice(MARITAL_STATUSES)
    continent_of_birth = random.choice(CONTINENTS)
    country_of_birth = random.choice(COUNTRIES[continent_of_birth])
    if random.uniform(0, 1) > prob_birth_citizen_match:
        # Citizenship is drawn independently of the birthplace
        # (it may still coincide with it by chance).
        citizenship_continent = random.choice(CONTINENTS)
        citizenship_country = random.choice(COUNTRIES[citizenship_continent])
    else:
        citizenship_continent = continent_of_birth
        citizenship_country = country_of_birth
    records.append({
        'Source': source,
        'Last Name': name,
        'CountryOfBirth': country_of_birth,
        'ContinentOfBirth': continent_of_birth,
        'CitizenshipCountry': citizenship_country,
        'CitizenshipContinent': citizenship_continent,
        'NumChildren': num_children,
        'MaritalStatus': marital_status,
    })

# Replace the empty frame with the generated records, keeping its column order.
dataset = pd.DataFrame(records, columns=dataset.columns)

In [5]:
# flag errors
# Probability that the last name is wrong, for each group of records.
prob_error_asia_source_c = 0.97
prob_error_asia_source_other = 0.07
prob_error_other = 0.05

# Row selectors for the three groups, computed once and reused below.
asia_source_c_rows = (dataset['ContinentOfBirth'] == 'Asia') & (dataset['Source'] == 'C')
asia_source_other_rows = (dataset['ContinentOfBirth'] == 'Asia') & (dataset['Source'] != 'C')
other_rows = dataset['ContinentOfBirth'] != 'Asia'

# Despite their names, these arrays are True where the last name is CORRECT:
# a uniform draw is at or above the error probability p with probability 1 - p.
errors_asia_source_c = np.random.rand(int(asia_source_c_rows.sum())) >= prob_error_asia_source_c
errors_asia_source_other = np.random.rand(int(asia_source_other_rows.sum())) >= prob_error_asia_source_other
errors_other = np.random.rand(int(other_rows.sum())) >= prob_error_other

# Store the per-group flags in a single 'Last Name Correct' column.
dataset.loc[asia_source_c_rows, 'Last Name Correct'] = errors_asia_source_c
dataset.loc[asia_source_other_rows, 'Last Name Correct'] = errors_asia_source_other
dataset.loc[other_rows, 'Last Name Correct'] = errors_other

In [6]:
# Sanity check: fraction of rows flagged as having a correct last name
# (i.e. 1 - error rate) in each of the three groups.
born_in_asia = dataset['ContinentOfBirth'] == 'Asia'
from_source_c = dataset['Source'] == 'C'
a = dataset.loc[born_in_asia & from_source_c, 'Last Name Correct']
b = dataset.loc[born_in_asia & ~from_source_c, 'Last Name Correct']
c = dataset.loc[~born_in_asia, 'Last Name Correct']
tuple(flags.sum() / flags.count() for flags in (a, b, c))

(0.030998851894374284, 0.9263410728582866, 0.9520434323631428)

In [7]:
dataset.head()

Unnamed: 0,Source,Last Name,CountryOfBirth,ContinentOfBirth,CitizenshipCountry,CitizenshipContinent,NumChildren,MaritalStatus,Last Name Correct
0,A,wobid,Canada,North America,Canada,North America,4,Single,True
1,A,ikmdt,Canada,North America,Canada,North America,3,Single,True
2,A,dolti,Sweden,Europe,Sweden,Europe,1,Married,True
3,C,ofixw,Spain,Europe,Spain,Europe,4,Married,True
4,A,hunpb,Canada,North America,Canada,North America,2,Single,True


In [8]:
# Create Data X-Ray input
# One entry per feature, in the order
# Source:BirthPlace:Citizenship:NumChildren:MaritalStatus:
# Presumably the number of hierarchy levels per feature (birthplace and
# citizenship are encoded as continent + country) — confirm against Data X-Ray.
levels_per_feature = (1, 2, 2, 1, 1)
feature_vector = 'a:' * len(levels_per_feature)
structure_vector = '1:' * len(levels_per_feature)
max_dims = ''.join('{}:'.format(levels) for levels in levels_per_feature)
# NOTE(review): despite the name, this is the fraction of rows whose last name is
# flagged correct (the mean of 'Last Name Correct'), not the fraction in error —
# confirm which of the two Data X-Ray expects in the header row.
error_rate = dataset['Last Name Correct'].mean()
cost = 0.0

In [9]:
# Header row: the feature vector, a tab, then ';'-terminated fields.
header_fields = [
    max_dims,
    str(error_rate),
    str(cost),
    'false',
    feature_vector,
    structure_vector,
    str(len(dataset)),
    '0',
]
top_row = feature_vector + '\t' + ';'.join(header_fields) + ';'
top_row

'a:a:a:a:a:\t1:2:2:1:1:;0.8654;0.0;false;a:a:a:a:a:;1:1:1:1:1:;10000;0;'

In [10]:
# Two-level features are encoded as 'a_<continent>_<country>_:'.
birth_place = dataset['ContinentOfBirth'] + '_' + dataset['CountryOfBirth']
dataset['BirthString'] = 'a_' + birth_place + '_:'
citizenship_place = dataset['CitizenshipContinent'] + '_' + dataset['CitizenshipCountry']
dataset['CitizenshipString'] = 'a_' + citizenship_place + '_:'

# Single-level features are encoded as 'a_<value>_:'.
for col in ('Source', 'NumChildren', 'MaritalStatus'):
    value_as_text = dataset[col].astype(str)
    dataset[col + 'String'] = 'a_' + value_as_text + '_:'

# Concatenate the per-feature strings in the order of the feature vector.
ordered_parts = ['SourceString', 'BirthString', 'CitizenshipString', 'NumChildrenString', 'MaritalStatusString']
combined = dataset[ordered_parts[0]]
for part in ordered_parts[1:]:
    combined = combined + dataset[part]
dataset['input-str'] = combined

In [11]:
# Write the Data X-Ray input file: the header row followed by one
# '<row number>%<Last Name Correct>%<feature string>=' entry per record,
# all written back to back with no newline in between.
# NOTE: the ./data directory must already exist.
with open('./data/synthetic-input.txt', 'w') as f:
    f.write(top_row)
    # Walk the two columns together instead of doing a positional .iloc lookup
    # per row, and stream the entries through writelines rather than building
    # a throwaway list of write() return values.
    f.writelines(
        '{}%{}%{}='.format(i, is_correct, features)
        for i, (is_correct, features) in enumerate(
            zip(dataset['Last Name Correct'], dataset['input-str'])
        )
    )