# Synthetic survey data generator

In [1]:
!pip3 install names
!pip3 install rstr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

# for random names generation
import names

# for random strings generation
import rstr

In [3]:
# Notebook-wide defaults.
# NO_OF_RECORDS is the default row count of create_simulated_data();
# FILENAME is the stem of the CSV file name written by the export cell.
NO_OF_RECORDS: int = 500
FILENAME: str = "simulated_data"

In [4]:
def generate_nric():
    """Return one random NRIC-style identifier string.

    The value is produced by ``rstr.xeger`` from a pattern describing one
    letter out of S, T, F or G, followed by seven digits and one uppercase
    letter (e.g. ``T8270500P``). The trailing letter is drawn like any other
    character of the pattern; no check letter is computed here.
    """
    nric_pattern = r'^[STFG]\d{7}[A-Z]$'
    return rstr.xeger(nric_pattern)

In [5]:
"""
Sample survey data
|-----------|-----------------|---------------|--------|-----------|
| NRIC      | researcher_name | sector        | salary | degree    |
|-----------|-----------------|---------------|--------|-----------|
| T8270500P | Ted Osborn      | Agricultural  | 6000   | Masters   |
| S6754779S | Edith Ward      | Aerospace     | 5000   | Bachelors |
| G4976879R | Eric Heard      | Engineering   | 7000   | Masters   |
| S4190526H | Franklin Brod   | Healthcare    | 9000   | PhD       |
|-----------|-----------------|---------------|--------|-----------|

Types:
NRIC: string 
researcher_name: string 
sector: string (categorical)
salary: int 
degree: string (categorical)
"""

# Allowed values of the categorical columns ("sector" and "degree").
SECTOR_TYPES = [
    "Agricultural",
    "Aerospace",
    "Engineering",
    "Healthcare",
]
QUALIFICATION_TYPES = [
    "PhD",
    "Masters",
    "Bachelors",
    "Non-degree",
    "Post-grad",
]

# Salary bounds as [low, high]. create_simulated_data() passes them to
# np.random.randint, which excludes the upper bound, so 15000 is never drawn.
SALARY_RANGE = [3000, 15000]

# NATIONALITY_TYPES = ["SC", "PR", "FC"]

def create_simulated_data(no_of_records=NO_OF_RECORDS):
    """Build a DataFrame of randomly generated survey records.

    Parameters
    ----------
    no_of_records : int, optional
        Number of rows to generate. Defaults to ``NO_OF_RECORDS``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``NRIC``, ``researcher_name``,
        ``sector``, ``salary`` and ``degree``, filled in that order.
    """
    salary_low = SALARY_RANGE[0]
    salary_high = SALARY_RANGE[1]
    row_indices = range(no_of_records)

    frame = pd.DataFrame()

    # Identifier and name are generated one value per row.
    frame["NRIC"] = [generate_nric() for _ in row_indices]
    frame["researcher_name"] = [names.get_full_name() for _ in row_indices]

    # Sector is sampled with replacement from the allowed categories.
    frame["sector"] = np.random.choice(SECTOR_TYPES, size=no_of_records)

    # Integer salaries; np.random.randint excludes the upper bound, so
    # salary_high itself is never produced.
    frame["salary"] = np.random.randint(salary_low, salary_high, size=no_of_records)
    # Alternative kept for reference: draw salaries from a gamma distribution
    # https://en.wikipedia.org/wiki/Gamma_distribution
    # frame["salary"] = np.round(np.random.gamma(10, 300, size=no_of_records), 2)

    # Degree is sampled with replacement from the allowed categories.
    frame["degree"] = np.random.choice(QUALIFICATION_TYPES, size=no_of_records)

    return frame

In [6]:
# Generate the dataset and write it to data/<FILENAME>_<no_of_records>.csv.
no_of_records = 10000
simulated_data = create_simulated_data(no_of_records)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first (a no-op when it is already there).
output_path = Path("data") / f"{FILENAME}_{no_of_records}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
simulated_data.to_csv(output_path, mode='w', index=False)

In [7]:
# Preview the generated frame; the rendered output below shows only the
# first and last rows, with the middle elided.
simulated_data

Unnamed: 0,NRIC,researcher_name,sector,salary,degree
0,S1406168V,Roger Williams,Healthcare,10261,Masters
1,G4938110D,Georgette Westberry,Agricultural,6283,PhD
2,F5310001V,Gladys Limon,Aerospace,10019,Non-degree
3,T3432307R,Robert Thomas,Aerospace,12947,Non-degree
4,S6047094E,Lori Portes,Healthcare,9177,Post-grad
...,...,...,...,...,...
9995,F5202261O,Thelma Stadler,Agricultural,5250,Masters
9996,G3600769S,Delbert Riggs,Agricultural,8189,Post-grad
9997,G0611770M,Margaret Colapietro,Agricultural,13801,PhD
9998,T1516839U,Vickie Dighton,Agricultural,10575,Non-degree
