## <b>Metadata Extraction <b>

In [None]:
import os
import hashlib
import exifread
import json
import time
from tqdm import tqdm

# Define the list of standard metadata fields
STANDARD_METADATA_FIELDS = [
    'Image Width', 'Image Length', 'Image Make', 'Image Model', 'Orientation',
    'Image DateTime', 'Image XResolution', 'Image YResolution', 'Image ResolutionUnit', 'Software',
    'Artist', 'Image YCbCrPositioning', 'EXIF ExposureTime', 'FNumber', 'ExposureProgram',
    'ISO', 'EXIF ExifVersion', 'EXIF DateTimeOriginal', 'EXIF DateTimeDigitized',
    'ShutterSpeedValue', 'ApertureValue', 'BrightnessValue', 'ExposureBiasValue',
    'MaxApertureValue', 'MeteringMode', 'LightSource', 'EXIF Flash', 'FocalLength',
    'EXIF FlashPixVersion', 'EXIF ColorSpace', 'PixelXDimension', 'PixelYDimension', 'RelatedSoundFile',
    'FocalPlaneXResolution', 'FocalPlaneYResolution', 'FocalPlaneResolutionUnit',
    'SensingMethod', 'FileSource', 'SceneType', 'DigitalZoomRatio', 'FocalLengthIn35mmFilm', 'GainControl',
    'Contrast', 'Saturation', 'Sharpness', 'DeviceSettingDescription', 'SubjectDistanceRange', 'ImageUniqueID',
    'GPSInfo', 'Image ExifOffset', 'EXIF ISOSpeedRatings', 'EXIF ComponentsConfiguration', 'EXIF ExifImageWidth',
    'EXIF ExifImageLength', 'EXIF ExposureMode', 'EXIF WhiteBalance', 'EXIF SceneCaptureType'
]

# Function to calculate the SHA-256 hash of a file
def sha256sum(filename):
    h = hashlib.sha256()
    with open(filename, 'rb') as file:
        while True:
            chunk = file.read(h.block_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()

# Function to get image metadata
def get_image_metadata(filename):
    with open(filename, 'rb') as file:
        exif_tags = exifread.process_file(file, details=False)
    
    # Convert tags to a serializable format
    metadata = {}
    for tag, value in exif_tags.items():
        # Convert the tag value to a string if it's not serializable
        try:
            json.dumps(value)
            metadata[tag] = value
        except TypeError:
            metadata[tag] = str(value)

    return metadata

# Function to check if a file is hidden
def is_hidden(filepath):
    return filepath.startswith('.')

# Function to check if a file is an image
def is_image_file(filename):
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    return os.path.splitext(filename)[1].lower() in image_extensions

# Main function to generate the report and save data to a JSON file
def generate_file_report(root_directory, output_file):
    data = []
    file_id = 1
    prefix_to_remove = '/' # dataset path

    start_time = time.time()  # Start time

    for root, dirs, files in os.walk(root_directory):
        for file in tqdm(files, desc="Processing images"):
            file_path = os.path.join(root, file)
            if is_hidden(file) or not is_image_file(file):
                continue
            file_hash = sha256sum(file_path)
            metadata = get_image_metadata(file_path)
            # Remove the prefix from the path
            file_path_rel = file_path.replace(prefix_to_remove, '', 1)
            row = {'ID': file_id, 'SHA256': file_hash, 'Path': file_path_rel, 'Metadata': metadata}
            data.append(row)
            file_id += 1

    end_time = time.time()  # End time
    elapsed_time = end_time - start_time

    with open(output_file, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    print(f"Process completed in {elapsed_time:.2f} seconds")

# path where the dataset is located and where the .json will be stored
if __name__ == '__main__':
    generate_file_report('/', '/') 


## <b>Detector Merge with Metadata<b>

Merges two JSON files based on image IDs and paths.

In [None]:
# Path to the JSON files
ordered_json_path = '/home/fbaquero/tesis/ordered_json_final.json'
dragon_metadata_path = '/home/fbaquero/tesis/dragonMetaData.json'
output_path = '/home/fbaquero/tesis/mergedDragon.json'

# Function to read a JSON file
def read_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Function to save a JSON file
def write_json(data, file_path):
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)

# Read the JSON files
ordered_data = read_json(ordered_json_path)
dragon_data = read_json(dragon_metadata_path)

# Create a dictionary for quick access to dragonData based on Path
dragon_dict = {}
for item in dragon_data:
    full_path = os.path.join('/media/databases/tiputini', item['Path'])
    dragon_dict[full_path] = item

# Perform the merge based on img_id and Path
merged_data = []
for item in ordered_data['annotations']:
    img_id = item['img_id']
    if img_id in dragon_dict:
        merged_item = {**item, **dragon_dict[img_id]}
        merged_data.append(merged_item)
    else:
        merged_data.append(item)

# Save the merged file
write_json(merged_data, output_path)

print(f"The merged file has been saved to {output_path}")

The code merges metadata and detection information from a JSON file into a new structured format.

In [12]:
# Load data from the JSON file
with open('mergedDragon.json', 'r') as file: # Open the json resulting from the megadetector merge with metadata 
    data = json.load(file)

merged_data = []

# Iterate over each object in the data list
for item in data:
    # Create the merged item using the provided format
    merged_item = {
        "metadata": {
            "image": item['Metadata'],
            "other": {
                "location": {},
                "label_location": {}
            }
        },
        "id_sha_256": item['SHA256'],
        "paths": [item['Path']],
        "detectors": {
            "megadetectorV5": {
                "researcher": "Oscar Cajamarca, Jenner Baquero",
                "output": {
                    "file": item['img_id'],
                    "detections": [{
                        "category": item['category'],
                        "confidence": item['confidence'],
                        "bbox": item['bbox'],
                        "detection_categories": {
                            "0": "animal",
                            "1": "person",
                            "2": "vehicle"
                        },
                        "info": {
                            "detection_completion_time": "2024-05-08 20:00:00",
                            "format_version": "1.3",
                            "detector": "md_v5a.0.0.pt",
                            "detector_metadata": {
                                "megadetector_version": "v5a.0.0",
                                "typical_detection_threshold": 0.2,
                                "conservative_detection_threshold": 0.05
                            }
                        }
                    }]
                }
            },
            "another_detector": {
                "researcher": {},
                "output": {}
            },
            "ground_truth": {
                "researcher": {},
                "output": {}
            },
        },
        "segmentation_masks": {}
    }
    
    # Add the merged item to the list of merged data
    merged_data.append(merged_item)

# Save the merged data to a new JSON file
with open('Dragon_merge_v1.json', 'w') as file: # save the resulting json 
    json.dump(merged_data, file, indent=4)

Merges paths in a JSON file based on SHA-256 identifiers.

In [None]:
from collections import defaultdict

# Path to the original JSON file
input_file_path = '/home/Dragon_merge_v1.json'
# Path to the resulting JSON file
output_file_path = '/home/Dragonpathsmerged.json'

# Read the original JSON file
with open(input_file_path, 'r') as file:
    data = json.load(file)

# Dictionary to store combined paths by SHA
combined_data = defaultdict(lambda: {'paths': [], 'detectors': {}, 'segmentation_masks': {}, 'metadata': {}})

# Process each entry in the original JSON
for entry in data:
    sha = entry['id_sha_256']
    # Extend the paths list for each SHA identifier
    combined_data[sha]['paths'].extend(entry['paths'])
    # Merge detector and segmentation mask data
    combined_data[sha]['detectors'] = entry['detectors']
    combined_data[sha]['segmentation_masks'] = entry['segmentation_masks']
    # Combine metadata
    combined_data[sha]['metadata'] = entry['metadata']

# Convert the dictionary to a list of dictionaries for saving in JSON
result = []
for sha, details in combined_data.items():
    result.append({
        'id_sha_256': sha,
        'paths': details['paths'],
        'detectors': details['detectors'],
        'segmentation_masks': details['segmentation_masks'],
        'metadata': details['metadata']
    })

# Save the result to a new JSON file
with open(output_file_path, 'w') as file:
    json.dump(result, file, indent=4)

print(f'Combined file saved to {output_file_path}')


## <b>Review of json<b> 

In [1]:
import pandas as pd

with open('Dragonpathsmerged.json', 'r') as file:
    data = json.load(file)

In [3]:
df = pd.read_json('Dragonpathsmerged.json')

df

Unnamed: 0,id_sha_256,paths,detectors,segmentation_masks,metadata
0,38cfdcc6f01fb3a5543d1230c12161e554ce3c42b328d0...,[original_db/Pictures 2004.2008/Processed Imag...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},"{'image': {'Image Make': 'G6.1 ', 'Image M..."
1,7ed0d9e3a078a3cd5602b234af3dd79a7dca60f6d20a02...,[original_db/Pictures 2004.2008/Processed Imag...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},"{'image': {'Image Make': 'G6.1 ', 'Image M..."
2,2693a8ac41561c6688dcfb6efd39c147c0e4851073ab51...,[original_db/Pictures 2004.2008/Processed Imag...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},"{'image': {'Image Make': 'G6.1 ', 'Image M..."
3,adddc64a964950c26e117e9613d7270b26a5f460861b95...,[original_db/Pictures 2004.2008/Processed Imag...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},"{'image': {'Image Make': 'G6.1 ', 'Image M..."
4,f7f6d0840c927b353051330b268d63434cf318e18d6878...,[original_db/Pictures 2004.2008/Processed Imag...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},"{'image': {'Image Make': 'G6.1 ', 'Image M..."
...,...,...,...,...,...
126102,63b259a47c770f62e34c2f54159e669e9942a711a9d22b...,[original_db/Pictures Study On and Off trails ...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},{'image': {'Image ImageDescription': ' ...
126103,a43a9d2348b24852945c56b2574bdf6702b8ec1af1cd37...,[original_db/Pictures Study On and Off trails ...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},{'image': {'Image ImageDescription': ' ...
126104,577a258d69b03f2b6962b6899750ae4c3416061f4ceb72...,[original_db/Pictures Study On and Off trails ...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},{'image': {'Image ImageDescription': ' ...
126105,ccc08732c77ad9eea6802dc3f61c8950d41628eed02890...,[original_db/Pictures Study On and Off trails ...,{'megadetectorV5': {'researcher': 'Oscar Cajam...,{},{'image': {'Image ImageDescription': ' ...


In [8]:
df.shape

(126107, 5)

json structure

In [9]:
def pretty_print_structure(obj, indent=0, prefix=""):
    # Unicode characters for tree structure
    line = "│   "
    branch = "├── "
    last_branch = "└── "

    # Determine branch type (continuing or last)
    if indent == 0:
        next_prefix = ""
    else:
        next_prefix = prefix + line if prefix.endswith(line) else prefix

    # If the object is a DataFrame
    if isinstance(obj, pd.DataFrame):
        print(f"{prefix}DataFrame {obj.shape}")
        last_index = len(obj.columns) - 1
        for i, column in enumerate(obj.columns):
            new_prefix = prefix + (last_branch if i == last_index else branch)
            print(f"{new_prefix}Column '{column}':")
            pretty_print_structure(obj[column], indent + 1, next_prefix + (line if i != last_index else "    "))

    # If the object is a Series
    elif isinstance(obj, pd.Series):
        print(f"{prefix}Series {obj.name} - {obj.dtype}")
        unique_types = obj.apply(type).unique()  # Check types in this column
        if len(unique_types) == 1 and unique_types[0] in [int, float, str]:
            print(f"{prefix + branch}All elements of type: {unique_types[0]}")
        else:
            last_index = len(obj) - 1
            for i, val in enumerate(obj):
                new_prefix = prefix + (last_branch if i == last_index else branch)
                pretty_print_structure(val, indent + 1, next_prefix + (line if i != last_index else "    "))

    # If the object is a list
    elif isinstance(obj, list):
        print(f"{prefix}List: Length {len(obj)}")
        last_index = len(obj) - 1
        for i, item in enumerate(obj):
            new_prefix = prefix + (last_branch if i == last_index else branch)
            pretty_print_structure(item, indent + 1, next_prefix + (line if i != last_index else "    "))

    # If the object is a dictionary
    elif isinstance(obj, dict):
        print(f"{prefix}Dictionary with {len(obj)} keys")
        last_index = len(list(obj.keys())) - 1
        for i, (key, value) in enumerate(obj.items()):
            new_prefix = prefix + (last_branch if i == last_index else branch)
            print(f"{new_prefix}Key '{key}':")
            pretty_print_structure(value, indent + 1, next_prefix + (line if i != last_index else "    "))

    # Other data types (e.g., int, float, str)
    else:
        print(f"{prefix}{type(obj).__name__}: {obj}")

In [12]:
pretty_print_structure(df.loc[0])

Series 0 - object
│   str: 38cfdcc6f01fb3a5543d1230c12161e554ce3c42b328d036b36536ed883e3ca6
│   List: Length 2
│   │   │   str: original_db/Pictures 2004.2008/Processed Images/Ocelot Paper Data/Ocelots 2010-2011/013L-2P/P150-A-10-06-04-0145-Leopar.JPG
│   │       str: original_db/Pictures 2009-2011/P 150/P150-A/P150-A-10-07-10/P150-A-10-06-04-0145-Leopar.JPG
│   Dictionary with 3 keys
│   ├── Key 'megadetectorV5':
│   │   │   Dictionary with 2 keys
│   │   │   ├── Key 'researcher':
│   │   │   │   │   str: Oscar Cajamarca, Jenner Baquero
│   │   │   └── Key 'output':
│   │   │   │       Dictionary with 2 keys
│   │   │   │       ├── Key 'file':
│   │   │   │       │   str: /media/databases/tiputini/original_db/Pictures 2009-2011/P 150/P150-A/P150-A-10-07-10/P150-A-10-06-04-0145-Leopar.JPG
│   │   │   │       └── Key 'detections':
│   │   │   │           List: Length 1
│   │   │   │               Dictionary with 5 keys
│   │   │   │               ├── Key 'category':
│   │   │   │       

## **Modifications**

In [None]:
import json

# Diccionario con coordenadas para diferentes patrones de paths
coordinates_dict = {
    "H750": {"latitude": "0.63242S", "longitude": "76.14507W"},
    "H1650": {"latitude": "0.62563S", "longitude": "76.14572W"},
    "H3000": {"latitude": "0.62616S", "longitude": "76.15614W"},
    "M1400": {"latitude": "0.6194S", "longitude": "76.16307W"},
    "M2200": {"latitude": "0.62245S", "longitude": "76.16823W"},
    "M3350": {"latitude": "0.62782S", "longitude": "76.16973W"},
    "M4250": {"latitude": "0.63426S", "longitude": "76.16458W"},
    "P150": {"latitude": "0.64107S", "longitude": "76.14355W"},
    "P1150": {"latitude": "0.64533S", "longitude": "76.13853W"},
    "P2450": {"latitude": "0.64986S", "longitude": "76.14212W"},
    "Pu275": {"latitude": "0.63233S", "longitude": "76.1558W"},
    "Pu 275": {"latitude": "0.63233S", "longitude": "76.1558W"},
    "H2-200": {"latitude": "0.6333S", "longitude": "76.14209W"},
    "H2-400": {"latitude": "0.63316S", "longitude": "76.14022W"},
    "H2-600": {"latitude": "0.63312S", "longitude": "76.1384W"},
    "H2-800": {"latitude": "0.6331S", "longitude": "76.13669W"},
    "H4-200": {"latitude": "0.63151S", "longitude": "76.14203W"},
    "H4-400": {"latitude": "0.63143S", "longitude": "76.14027W"},
    "H4-600": {"latitude": "0.63136S", "longitude": "76.13839W"},
    "H4-800": {"latitude": "0.63131S", "longitude": "76.13675W"},
    "H6-200": {"latitude": "0.6297S", "longitude": "76.14197W"},
    "H6-400": {"latitude": "0.62965S", "longitude": "76.14018W"},
    "H6-600": {"latitude": "0.62954S", "longitude": "76.13842W"},
    "H6-800": {"latitude": "0.62944S", "longitude": "76.13665W"},
    "H8-200": {"latitude": "0.62778S", "longitude": "76.14207W"},
    "H8-400": {"latitude": "0.62783S", "longitude": "76.1404W"},
    "H8-600": {"latitude": "0.62775S", "longitude": "76.13861W"},
    "H8-800": {"latitude": "0.62774S", "longitude": "76.13681W"},
    "P-1-200": {"latitude": "0.62254S", "longitude": "76.15947W"},
    "P-1-400": {"latitude": "0.62262S", "longitude": "76.16031W"},
    "P-1-600": {"latitude": "0.6228S", "longitude": "76.16267W"},
    "P-1-800": {"latitude": "0.62284S", "longitude": "76.16467W"},
    "P1-200": {"latitude": "0.62455S", "longitude": "76.15943W"},
    "P1-400": {"latitude": "0.6246S", "longitude": "76.16118W"},
    "P1-600": {"latitude": "0.62476S", "longitude": "76.16291W"},
    "P1-800": {"latitude": "0.62469S", "longitude": "76.16443W"},
    "P3-200": {"latitude": "0.62463S", "longitude": "76.15939W"},
    "P3-400": {"latitude": "0.62632S", "longitude": "76.16024W"},
    "P3-600": {"latitude": "0.62654S", "longitude": "76.16265W"},
    "P3-800": {"latitude": "0.62664S", "longitude": "76.16454W"},
    "P5-200": {"latitude": "0.62812S", "longitude": "76.15895W"},
    "P5-400": {"latitude": "0.62829S", "longitude": "76.16074W"},
    "P5-600": {"latitude": "0.62816S", "longitude": "76.16268W"},
    "P5-800": {"latitude": "0.62838S", "longitude": "76.16445W"},
    "Chorongo": {"latitude": "0.63453S", "longitude": "76.148W"}
}

# Diccionario con información para notes_location
notes_location_dict = {
    "H750": ["H750", "Trail cameras"],
    "H1650": ["H1650", "Trail cameras"],
    "H3000": ["H3000", "Trail cameras"],
    "M1400": ["M1400", "Trail cameras"],
    "M2200": ["M2200", "Trail cameras"],
    "M3350": ["M3350", "Trail cameras"],
    "M4250": ["M4250", "Trail cameras"],
    "P150": ["P150", "Trail cameras"],
    "P1150": ["P1150", "Trail cameras"],
    "P2450": ["P2450", "Trail cameras"],
    "Pu275": ["Pu275", "Trail cameras"],
    "Pu 275": ["Pu 275", "Trail cameras"],
    "H2-200": ["H2-200", "Plot cameras"],
    "H2-400": ["H2-400", "Plot cameras"],
    "H2-600": ["H2-600", "Plot cameras"],
    "H2-800": ["H2-800", "Plot cameras"],
    "H4-200": ["H4-200", "Plot cameras"],
    "H4-400": ["H4-400", "Plot cameras"],
    "H4-600": ["H4-600", "Plot cameras"],
    "H4-800": ["H4-800", "Plot cameras"],
    "H6-200": ["H6-200", "Plot cameras"],
    "H6-400": ["H6-400", "Plot cameras"],
    "H6-600": ["H6-600", "Plot cameras"],
    "H6-800": ["H6-800", "Plot cameras"],
    "H8-200": ["H8-200", "Plot cameras"],
    "H8-400": ["H8-400", "Plot cameras"],
    "H8-600": ["H8-600", "Plot cameras"],
    "H8-800": ["H8-800", "Plot cameras"],
    "P-1-200": ["P-1-200", "Plot cameras"],
    "P-1-400": ["P-1-400", "Plot cameras"],
    "P-1-600": ["P-1-600", "Plot cameras"],
    "P-1-800": ["P-1-800", "Plot cameras"],
    "P1-200": ["P1-200", "Plot cameras"],
    "P1-400": ["P1-400", "Plot cameras"],
    "P1-600": ["P1-600", "Plot cameras"],
    "P1-800": ["P1-800", "Plot cameras"],
    "P3-200": ["P3-200", "Plot cameras"],
    "P3-400": ["P3-400", "Plot cameras"],
    "P3-600": ["P3-600", "Plot cameras"],
    "P3-800": ["P3-800", "Plot cameras"],
    "P5-200": ["P5-200", "Plot cameras"],
    "P5-400": ["P5-400", "Plot cameras"],
    "P5-600": ["P5-600", "Plot cameras"],
    "P5-800": ["P5-800", "Plot cameras"],
    "Chorongo": ["Chorongo", "Saladeros"]
}

# Función para extraer coordenadas del path
def get_coordinates(path):
    for key in coordinates_dict:
        if key in path:
            return coordinates_dict[key]
    return {}

# Función para obtener notes_location del path
def get_notes_location(path):
    for key in notes_location_dict:
        if key in path:
            return notes_location_dict[key]
    return []

# Cargar el archivo JSON
with open('/home/ocajamarca/python/anaconda3/project/Megadetector/megadetector_metadata.json', 'r') as file:
    data = json.load(file)

# Modificar cada entrada en el JSON
for entry in data:
    # Cambiar label_location por notes_location y actualizar el valor
    if 'label_location' in entry['metadata']['other']:
        entry['metadata']['other'].pop('label_location')
        entry['metadata']['other']['notes_location (salar, bosque, rio)'] = ""
    
    # Añadir coordenadas a location y notes_location si algún path contiene un patrón conocido
    for path in entry['paths']:
        coordinates = get_coordinates(path)
        notes_location = get_notes_location(path)
        if coordinates:
            entry['metadata']['other']['location'] = coordinates
        if notes_location:
            entry['metadata']['other']['notes_location (salar, bosque, rio)'] = f"{notes_location}"
            break

# Guardar el archivo JSON modificado
with open('/home/ocajamarca/python/anaconda3/project/Tests/data_modified_2.json', 'w') as file:
    json.dump(data, file, indent=4)

print("Archivo JSON modificado y guardado como 'data_modified_2.json'")


In [None]:
import json

with open('/home/ocajamarca/python/anaconda3/project/Megadetector/Dragonpathsmerged.json', 'r') as file:
    data = json.load(file)
    
names = set()

for item in data:
    for path in item.get('paths', []):
        if 'H2-200' in path:
            start_idx = path.find("H2-200")
            end_idx = path.find("/", start_idx)
            if end_idx == -1:
                end_idx = len(path)
            animal_name = path[start_idx:end_idx]
            names.add(animal_name)

names = sorted(names)

for i, name in enumerate(names, 1):
    print(f"{i}. {name}")
