In [3]:
import os
import glob

import pandas as pd

In [4]:
# Root folder; the notebook reads from its tags/ subfolder and writes to weights/.
DATA_DIR = '../data'
# Lower and upper bounds of the weight scale used by get_weights.
WEIGHT_MIN, WEIGHT_MAX = 1, 100

In [16]:
def get_weights(group):
    """Min-max scale a group's ``Count`` column onto [WEIGHT_MIN, WEIGHT_MAX].

    The smallest count in the group maps to ``WEIGHT_MIN`` and the largest to
    ``WEIGHT_MAX``; counts in between are interpolated linearly.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group. Must contain a numeric ``Count`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with an added ``Weight`` column. The input frame
        is not modified.

    Notes
    -----
    When the counts have no spread (a single row, all counts equal, or no
    usable counts at all) every row receives ``WEIGHT_MAX``. In a group that
    does have a spread, a row whose ``Count`` is missing gets a NaN weight
    instead of forcing the whole group to ``WEIGHT_MAX``.
    """
    lowest = group['Count'].min()
    highest = group['Count'].max()
    spread = highest - lowest

    # Work on a copy so the caller's frame (or groupby slice) is left untouched.
    weighted = group.copy()

    # `not spread > 0` is also true when spread is NaN (empty / all-missing
    # counts), so the degenerate cases never reach the division below.
    if not spread > 0:
        weighted['Weight'] = WEIGHT_MAX
    else:
        # Scale by the real width of the target range rather than assuming
        # WEIGHT_MIN == 1, so the largest count always lands on WEIGHT_MAX.
        weighted['Weight'] = (
            WEIGHT_MIN + (group['Count'] - lowest) * (WEIGHT_MAX - WEIGHT_MIN) / spread
        )
    return weighted

In [21]:
# Tags that receive a weight; every other tag only appears in tag-counts.csv.
WEIGHTED_TAGS = [
    "('highway', 'footway')",
    "('footway', 'sidewalk')",
    "('crossing', 'marked')",
    "('crossing', 'traffic_signals')",
]

# --- Load every per-file tag count table -----------------------------------
# Sorted so the read order does not depend on the filesystem's listing order.
csv_files = sorted(glob.glob(os.path.join(DATA_DIR, 'tags', '*.csv')))
if not csv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        f"No tag CSV files found in {os.path.join(DATA_DIR, 'tags')}"
    )

all_dfs = [pd.read_csv(csv_file) for csv_file in csv_files]

# --- Total count per tag across all files ----------------------------------
tag_counts = (
    pd.concat(all_dfs, ignore_index=True)
    .groupby('Tag', as_index=False)['Count']
    .sum()
)
# Split the "('key', 'value')" tag text into separate Key / Value columns.
# Tags that do not match this exact pattern get NaN in both columns.
tag_counts[['Key', 'Value']] = tag_counts['Tag'].str.extract(r"\('([^']*)', '([^']*)'\)")
tag_counts = tag_counts.sort_values(by='Count', ascending=False)

output_dir = f"{DATA_DIR}/weights"
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)

tag_counts.to_csv(f"{output_dir}/tag-counts.csv", index=False)

# --- Weights for the selected tags, scaled within each Key -----------------
selected_tags = tag_counts[tag_counts['Tag'].isin(WEIGHTED_TAGS)]

# Iterate the groups explicitly rather than using groupby(...).apply(...):
# each group then keeps its 'Key' column on every pandas version, so the
# column is still present in weights.csv. Each group is copied so that
# get_weights never writes into a slice of selected_tags.
weighted_groups = [
    get_weights(key_rows.copy())
    for _, key_rows in selected_tags.groupby('Key')
]
if weighted_groups:
    combined_df = pd.concat(weighted_groups)
else:
    # None of the selected tags occur in the data: still emit the header row.
    combined_df = selected_tags.assign(Weight=pd.Series(dtype='float64'))

combined_df.to_csv(f"{output_dir}/weights.csv", index=False)