## Synthetic Attributes
This notebook reads a sample GATIS dataset and fills empty attributes with synthetic data based on random sampling. This can be done to generate any tier of data. The intention is to demonstrate all of the GATIS fields and to test the validator.

At the moment, new geometries are not generated, just attributes.

### Categorical Weights Dict
This dictionary allows for setting custom weights for any categorical (enumerated or recommended values) GATIS attribute. The keys are attribute names; the values are lists of tuples, where the first element of each tuple is a recommended/enumerated string for the attribute and the second is a number greater than or equal to one. Use the GATIS dictionary files to look up the attribute names and values that you want to add.

Start by setting all values equal to one, then if you want one value to appear more frequently than another, increase the value of that number.

In [None]:

# Relative sampling weights for categorical GATIS attributes.
#
# Each key is an attribute name; each value is a list of
# (listed value, weight) pairs. Weights are relative, so they do not
# need to add up to one: a larger weight means the value is drawn more
# often, and equal weights mean the values are equally likely.
categorical_weights_dict = {
    "surface_issue": [
        ("yes", 10),
        ("no", 5),
        ("cracking", 1),
        ("scaling", 1),
        ("spalling", 1),
        ("overgrowth", 1),
        ("uneven/displacement", 1),
        ("frequent water pooling", 1),
        ("heaving", 1),
        ("missing bricks / stones", 1),
        ("grates / utility covers / other surface impediments", 1),
        ("potholes / holes", 1),
        ("slickness", 1),
        ("uneven joints", 1),
        ("markings worn / missing", 1),
        ("detectable warning surface damage", 1),
        ("other", 1),
    ],
}

def synthetic_data_listed_values(feature_df,feature_type,spec,attribute_name,weights_dict=None):
    '''
    Fill one categorical attribute of feature_df with randomly sampled listed values.

    Only rows whose f'{feature_type}_type' value is a feature type for which the
    attribute's 'presence' entry contains "recommended" or "required" are filled.
    feature_df is modified in place and nothing is returned.

    If weights_dict has an entry for attribute_name that covers every listed value,
    those weights are used (matched to the listed values by name, so the order of
    the weight tuples does not matter and extra names are ignored). Otherwise each
    listed value has an equal probability of being chosen and a message is printed.

    Parameters:
        feature_df: DataFrame with a f'{feature_type}_type' column.
        feature_type: prefix of the type column, e.g. "edge" for 'edge_type'.
        spec: dict with an 'attributes' list; each attribute is a dict with
            'name', 'presence' and 'listed_values' entries.
        attribute_name: name of the attribute (and DataFrame column) to fill.
        weights_dict: optional mapping of attribute name to a list of
            (listed value, weight) tuples. Defaults to categorical_weights_dict.

    Raises:
        IndexError: if attribute_name is not present in spec['attributes'].
    '''
    if weights_dict is None:
        # fall back to the notebook-level weights defined alongside this function
        weights_dict = categorical_weights_dict

    attribute = next((x for x in spec['attributes'] if x['name'] == attribute_name), None)
    if attribute is None:
        # same exception type as indexing an empty match list, with a usable message
        raise IndexError(f"Attribute {attribute_name!r} not found in spec['attributes']")

    # feature types that the attribute is recommended or required for
    _types = [
        key for key, item in attribute['presence'].items()
        if "recommended" in item or "required" in item
    ]
    cond_index = feature_df[feature_df[f'{feature_type}_type'].isin(_types)].index

    sample_list = attribute['listed_values']
    weight_pairs = weights_dict.get(attribute_name)

    # weights stays None (uniform sampling) unless every listed value has a weight
    weights = None
    if weight_pairs is None:
        print("No weights specified for",attribute_name)
    else:
        weight_by_name = dict(weight_pairs)
        if all(value in weight_by_name for value in sample_list):
            # align the weights with the order of the listed values
            weights = [weight_by_name[value] for value in sample_list]
        else:
            print("Key mismatch for",attribute_name,"in weights_dict")

    # fixed seed for repeatability; a local generator is used so the global
    # random state is left untouched
    rng = random.Random("gatis")
    random_list = rng.choices(sample_list,weights=weights,k=len(cond_index))

    feature_df.loc[cond_index,attribute_name] = pd.Series(random_list,index=cond_index)

In [None]:
# for record in edges_spec['attributes']:
#     if gatis_edges[record['name']].notna().any():
#         # if we've already infilled some values then don't re-infill them
#         continue
#     # for listed values
#     if record.get("listed_values") is not None:
#         synthetic_data_listed_values(gatis_edges,"edge",edges_spec,record.get("name"))
#     # for numeric values (tomorrow)

In [None]:
# # gatis_edges.to_file("test.geojson")
# #NOTE don't use GeoPandas to_file because there are a lot of null values;
# #we want to drop null values for each record so that the resulting geojson isn't as large
# with open("test_edges.geojson","w") as f:
#     f.write(gatis_edges.to_json(na="drop",indent=2))

In [None]:
# sidewalks_df['incline'] = 5.0 #missing
# sidewalks_df['cross_slope'] = 2.1 #missing
# sidewalks_df['pedestrian_lane'] = False #missing
# sidewalks_df['ada_compliance'] = np.random.choice(['yes', 'no', 'unknown'], size = len(sidewalks_df)) #missing / may be in assessment type
# sidewalks_df.loc[sidewalks_df['ada_compliance'] == 'yes', 'ada_compliance_standard'] = 'PROWAG' #missing
# sidewalks_df.loc[sidewalks_df['ada_compliance'] == 'yes', 'ada_compliance_date'] = '1998-08-14' #missing
# sidewalks_df['detectable_warning'] = np.random.choice(['yes', 'no', 'unknown'], size = len(sidewalks_df)) #missing
# sidewalks_df['from_node'] = np.random.randint(low = 10_000_000, high = 100_000_000, size = len(sidewalks_df)) #missing
# sidewalks_df['to_node'] = np.random.randint(low = 10_000_000, high = 100_000_000, size = len(sidewalks_df)) #missing
# sidewalks_df['min_width'] = np.random.randint(low = 30, high = 70, size = len(sidewalks_df)) #missing
# sidewalks_df['bridge'] = np.random.choice([True, False], size = len(sidewalks_df)) #missing
# sidewalks_df['surface_issue'] = np.random.choice(['yes', 'no', 'cracking', 'scaling', 'spalling', 'overgrowth', 
#             'uneven/displacement', 'frequent water pooling', 'heaving', 'missing bricks/stones', 
#             'grates/utility covers/other surface impediments', 'potholes/holes', 'slickness', 'uneven joints', 
#             'markings worn/missing', 'detectable warning surface damage', 'other'], p = [0.01, 0.84, 0.01, 0.01, 
#             0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], size = len(sidewalks_df)) #missing
# sidewalks_df['cross_slope_max'] = 2.1 #missing
# sidewalks_df['incline_max'] = 5.0 #missing
# sidewalks_df['impediment'] = np.random.choice(['yes', 'no', 'horizontal overgrowth', 'vertical overgrowth', 
#             'fixed vertical obstruction', 'solid fixed object', 'flexible fixed object', 'protrusion', 
#             'turning space missing or issue', 'detectable warning not aligned with crossing', 'push button not working', 
#             'other'], p = [0.01, 0.89, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], size = len(sidewalks_df)) #missing
# sidewalks_df['visual_markings'] = np.random.choice(['yes', 'no', 'dashed lines', 'zebra', 'continental', 'ladder', 'other'],
#             p = [0.94, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01], size = len(sidewalks_df))#missing
# ##FIGURE OUT ENUM LISTS, MAKE SURE DATE FIELDS ARE DATE TYPE