# Data Overview

## Team 48

| Name          | Student ID |
|---------------|------------|
| Yifei ZHANG   | 1174267    |
| Yibo HUANG    | 1380231    |
| Hanzhang SUN  | 1379790    |
| Liyang CHEN   | 1135879    |
| Yueyang WU    | 1345511    |

In [None]:
import json

def print_metadata_details(metadata):
    """Print every top-level metadata entry, expanding dict values one level.

    Each entry starts with a ``Key: <name>`` line. A dict value is listed as
    indented ``subkey: subvalue`` lines; any other value is printed on a single
    ``Value: <value>`` line. A blank line is printed after each entry.
    """
    for name in metadata:
        entry = metadata[name]
        print(f"Key: {name}")
        if not isinstance(entry, dict):
            print(f"Value: {entry}")
        else:
            #only the first nesting level is expanded; deeper dicts print as-is
            for field in entry:
                print(f"  {field}: {entry[field]}")
        print()  #blank line separates consecutive entries



In [None]:
import json
import pandas as pd

#load the meta data
#JSON is UTF-8 by specification, so decode explicitly instead of relying on the platform default
with open('PM2003-2007-meta.json', 'r', encoding='utf-8') as file:
    metadata = json.load(file)

#check the details of the data
print_metadata_details(metadata)



In [None]:
import json
import pandas as pd

#load the meta data
#JSON is UTF-8 by specification, so decode explicitly instead of relying on the platform default
with open('PM2008-2012-meta.json', 'r', encoding='utf-8') as file:
    metadata = json.load(file)

#check the details of the data
print_metadata_details(metadata)

In [None]:
import json
import pandas as pd

#load the meta data
#JSON is UTF-8 by specification, so decode explicitly instead of relying on the platform default
with open('PM2010-2014-meta.json', 'r', encoding='utf-8') as file:
    metadata = json.load(file)

#check the details of the data
print_metadata_details(metadata)

In [None]:
import json
import pandas as pd

#load the meta data
#JSON is UTF-8 by specification, so decode explicitly instead of relying on the platform default
with open('PM2011-2015-meta.json', 'r', encoding='utf-8') as file:
    metadata = json.load(file)

#check the details of the data
print_metadata_details(metadata)

In [None]:
import json
import pandas as pd

#load the meta data
#JSON is UTF-8 by specification, so decode explicitly instead of relying on the platform default
with open('PM2014-2018-meta.json', 'r', encoding='utf-8') as file:
    metadata = json.load(file)

#check the details of the data
print_metadata_details(metadata)

In [38]:
import json

#load the 2008-2012 metadata into `data` for manual inspection in the following cells
#JSON is UTF-8 by specification, so decode explicitly instead of relying on the platform default
with open('PM2008-2012-meta.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [39]:
#list the top-level keys of the loaded metadata
print(data.keys())

dict_keys(['organisation', 'name', 'title', 'legal', 'referenceSystemIdentifier', 'geoLevel', 'key', 'keyRegex', 'availability', 'geomField', 'keyword', 'theme', 'temporalExtent', 'type', '_id', 'bbox', 'abstract', 'selectedAttributes', 'filter'])


In [40]:
#walk filter -> feature with empty-dict fallbacks, so a missing level prints None instead of raising
feature = data.get('filter', {}).get('feature', {})
print(feature.get('key'))

area_code


In [41]:
#show the whole 'filter' entry
print(data['filter'])

{'filterType': 'tabular', 'feature': {'key': 'area_code', 'geoLevel': 'lga2011', 'geoField': 'ignored', 'year': '2006', 'featureBbox': [140.961681984, -39.159189527500004, 149.976679008, -33.9806475865], 'featureType': 'ste', 'featureInstance': '2', 'featureName': 'Victoria', 'referenceSystemIdentifier': 'urn:x-ogc:def:crs:EPSG:4283', 'prefix': '', 'keyTransform': None}, 'states': ['2']}


In [42]:
#direct indexing: unlike the .get() chain, this raises KeyError if 'filter' or 'feature' is missing
feature = data['filter']['feature']
print(feature)

{'key': 'area_code', 'geoLevel': 'lga2011', 'geoField': 'ignored', 'year': '2006', 'featureBbox': [140.961681984, -39.159189527500004, 149.976679008, -33.9806475865], 'featureType': 'ste', 'featureInstance': '2', 'featureName': 'Victoria', 'referenceSystemIdentifier': 'urn:x-ogc:def:crs:EPSG:4283', 'prefix': '', 'keyTransform': None}


# Check the Data Structure

In [None]:
import json
import os

def check_json_structure(file_paths):
    """Summarise the key layout of several JSON files.

    For each path, the file is parsed and two sets are collected: the
    top-level keys, and the keys of every top-level value that is itself a
    dict (one nesting level only, merged across all such values).

    Returns a dict mapping each path to a ``(keys, subkeys)`` tuple of sets.
    """
    structures = {}

    for path in file_paths:
        with open(path, 'r') as handle:
            content = json.load(handle)

        top_level = set(content.keys())
        #union of the keys of all dict-valued top-level entries
        nested = {
            inner_key
            for value in content.values()
            if isinstance(value, dict)
            for inner_key in value.keys()
        }

        structures[path] = (top_level, nested)

    return structures


In [None]:
#retrieve all files' paths from the folder
#NOTE(review): absolute, machine-specific path; adjust when running on another machine
meta_data_path = '/Users/yueyangwu/Desktop/sorted_newData(Disease)/PM_meta/'
#os.listdir returns entries in arbitrary order, so sort them for a reproducible report
file_paths = sorted(
    os.path.join(meta_data_path, file)
    for file in os.listdir(meta_data_path)
    if file.endswith('.json')
)

#check the structure
structure_dict = check_json_structure(file_paths)

#output: one section per file with its top-level keys and first-level subkeys
for file_path, structure in structure_dict.items():
    print(f"File: {file_path}")
    print(f"Keys: {structure[0]}")
    print(f"Subkeys: {structure[1]}")
    print()


# Combine the PM_meta Files into a New DataFrame

In [None]:
import pandas as pd
import os
import json

In [None]:

def load_json_to_df(filepath):
    """Load one JSON file into a single-row DataFrame with flattened columns.

    Nested dictionaries are flattened recursively: every non-dict leaf becomes
    one column whose name is the chain of keys joined with ``_`` (for example
    ``{"filter": {"feature": {"key": ...}}}`` gives ``filter_feature_key``).
    Non-dict values, including lists, are stored unchanged in their cell.

    Args:
        filepath: Path to a JSON file whose top-level value is an object.

    Returns:
        A pandas DataFrame holding one row for the file, one column per leaf.
    """
    #JSON is UTF-8 by specification, so do not depend on the platform default
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    #all leaves go into ONE record; appending a separate one-key record per
    #leaf would give one row per key with NaN in every other column
    flat_record = {}

    def flatten_dict(d, parent_key=''):
        for k, v in d.items():
            new_key = parent_key + '_' + k if parent_key else k
            if isinstance(v, dict):
                flatten_dict(v, new_key)
            else:
                flat_record[new_key] = v

    flatten_dict(data)
    return pd.DataFrame([flat_record])

In [None]:
#set the path to the meta file
#NOTE(review): absolute, machine-specific path; adjust when running on another machine
meta_data_path = '/Users/yueyangwu/Desktop/sorted_newData(Disease)/PM_meta/'

#retrieve all files' paths from the folder
#os.listdir returns entries in arbitrary order; sorting keeps the row order of the saved file reproducible
file_paths = sorted(
    os.path.join(meta_data_path, file)
    for file in os.listdir(meta_data_path)
    if file.endswith('.json')
)

#load all jsons and make them to be dataframes
df_list = [load_json_to_df(file_path) for file_path in file_paths]

#merge all dataframes (pd.concat raises ValueError if no .json file was found)
combined_df = pd.concat(df_list, ignore_index=True)

#save all merged dataframe as json
combined_df.to_json('PM_meta_combined_0.json', orient='records')

In [None]:
#list the column names of the combined frame
print(combined_df.columns)

In [None]:
#distinct values found in the 'keyRegex' column
print(combined_df['keyRegex'].unique())

# Transform the DataFrame

In [None]:
def _json_default(value):
    """Convert numpy scalars (e.g. int64 values coming out of pandas) to built-in types.

    json.dump calls this only for objects it cannot serialize itself.
    """
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


#read merged dataFrame
combined_df = pd.read_json('PM_meta_combined_0.json', orient='records')

#NOTE(review): the columns 'year', 'disease', 'ASR', 'SR' and 'count' used below do not
#appear among the metadata keys printed earlier in this notebook (the flattened year
#would be 'filter_feature_year'); this cell presumably expects a different input frame
#and raises KeyError on the metadata frame - TODO confirm the intended source file

#transfer combined_df as a format of nested dictionary list
nested_data = []
for lga_code, lga_group in combined_df.groupby('filter_feature_key'):
    lga_dict = {'lga_code': lga_code, 'lga_name': f'LGA {lga_code}', 'years': []}
    for year, year_group in lga_group.groupby('year'):
        year_dict = {'year': year, 'diseases': []}
        for disease, disease_group in year_group.groupby('disease'):
            #only the first row of each (lga, year, disease) group is used
            disease_dict = {
                'disease_name': disease,
                'asr': disease_group['ASR'].iloc[0],
                'sr': disease_group['SR'].iloc[0],
                'number': disease_group['count'].iloc[0]
            }
            year_dict['diseases'].append(disease_dict)
        lga_dict['years'].append(year_dict)
    nested_data.append(lga_dict)

#write the nested dictionary list to json format and save
#group keys and .iloc[0] values can be numpy integers, which json cannot encode
#without the default= converter
with open('pm_meta_combined_1.json', 'w', encoding='utf-8') as json_file:
    json.dump(nested_data, json_file, indent=2, default=_json_default)