In [3]:
import requests
from bs4 import BeautifulSoup
import json
import scipy.io

In [4]:


def load_synsets(file_path):
    """Build an ``ILSVRC2012_ID -> words`` lookup from a MATLAB metadata file.

    Parameters
    ----------
    file_path : str
        Path to a ``.mat`` file that contains a ``'synsets'`` struct array
        with (at least) the fields ``ILSVRC2012_ID`` and ``words``.

    Returns
    -------
    dict
        ``{ilsvrc_id: words}`` for every record whose ``ILSVRC2012_ID`` is
        at most 1000. Keys are plain Python ``int`` and values plain ``str``
        (not NumPy scalars), so the result can be printed, compared and
        JSON-serialised without surprises.
    """
    # Load the MATLAB file
    mat = scipy.io.loadmat(file_path)
    # Access the 'synsets' struct array (adjust the key if the file is laid out differently)
    synsets = mat['synsets']

    old_id_words_mapping = {}
    # ravel() visits every record regardless of whether the 2-D struct array
    # came back as a column or a row.
    for synset in synsets.ravel():
        # Each field wraps its value in a small array; unwrap it and convert
        # to a native Python type before using it as a key / value.
        ilsvrc_id = int(synset['ILSVRC2012_ID'][0][0])
        if ilsvrc_id <= 1000:
            old_id_words_mapping[ilsvrc_id] = str(synset['words'][0])
    return old_id_words_mapping
# 'meta.mat' is resolved relative to the current working directory.
old_id_words_mapping = load_synsets(file_path='meta.mat')
old_id_words_mapping

{1: 'kit fox, Vulpes macrotis',
 2: 'English setter',
 3: 'Siberian husky',
 4: 'Australian terrier',
 5: 'English springer, English springer spaniel',
 6: 'grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus',
 7: 'lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens',
 8: 'Egyptian cat',
 9: 'ibex, Capra ibex',
 10: 'Persian cat',
 11: 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor',
 12: 'gazelle',
 13: 'porcupine, hedgehog',
 14: 'sea lion',
 15: 'malamute, malemute, Alaskan malamute',
 16: 'badger',
 17: 'Great Dane',
 18: 'Walker hound, Walker foxhound',
 19: 'Welsh springer spaniel',
 20: 'whippet',
 21: 'Scottish deerhound, deerhound',
 22: 'killer whale, killer, orca, grampus, sea wolf, Orcinus orca',
 23: 'mink',
 24: 'African elephant, Loxodonta africana',
 25: 'Weimaraner',
 26: 'soft-coated wheaten terrier',
 27: 'Dandie Dinmont, Dandie Dinmont terrier',
 28: 'red wolf, maned wolf, Canis rufus, Canis nig

In [5]:
def new_word_to_id(url, table_selector, output_file, timeout=30):
    """Scrape an HTML table of ``id | label`` rows into a ``{label: id}`` dict.

    Parameters
    ----------
    url : str
        Page to download.
    table_selector : str
        CSS selector; the first element it matches is used as the table.
    output_file : str
        Not used by this function; kept so existing calls keep working.
        NOTE(review): presumably intended as a JSON dump target - confirm.
    timeout : float, optional
        Seconds ``requests`` waits for the server. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict or None
        ``{label_text: integer_id}`` built from every row that has at least
        two ``<td>`` cells and an integer in the first one. ``None`` (after
        printing a message) when the response status is not 200 or the
        selector matches nothing.

    Notes
    -----
    Labels are the dictionary keys, so when the same label text occurs on
    several rows only the last id survives. Such collisions are printed so
    they can be resolved by hand downstream.
    """
    # Fetch the webpage
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to fetch the page")
        return

    # Parse the HTML content and locate the table via the CSS selector
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.select_one(table_selector)

    if not table:
        print("Table not found")
        return

    # Extract data from the table rows
    data = {}
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) < 2:
            # Rows without an id cell and a label cell carry no mapping.
            continue
        try:
            old_id = int(columns[0].text.strip())
        except ValueError:
            # Skip rows whose first cell is not a number (e.g. a header row
            # written with <td>) instead of aborting the whole scrape.
            continue
        word = columns[1].text.strip()
        if word in data and data[word] != old_id:
            print(f"Duplicate label {word!r}: id {data[word]} replaced by {old_id}")
        data[word] = old_id

    return data
# Page that lists the new class ids next to their labels, plus scrape settings.
url = 'https://deeplearning.cms.waikato.ac.nz/user-guide/class-maps/IMAGENET/'
table_selector = 'table'  # bare tag selector; tighten it if the page layout requires
output_file = 'new_word_to_id_mapping.json'
new_word_ids = new_word_to_id(
    url=url,
    table_selector=table_selector,
    output_file=output_file,
)
new_word_ids

{'tench, Tinca tinca': 0,
 'goldfish, Carassius auratus': 1,
 "great white shark, white shark, man-eater, man-eating shark, Carcharodon caharias',": 2,
 'tiger shark, Galeocerdo cuvieri': 3,
 'hammerhead, hammerhead shark': 4,
 'electric ray, crampfish, numbfish, torpedo': 5,
 'stingray': 6,
 'cock': 7,
 'hen': 8,
 'ostrich, Struthio camelus': 9,
 'brambling, Fringilla montifringilla': 10,
 'goldfinch, Carduelis carduelis': 11,
 'house finch, linnet, Carpodacus mexicanus': 12,
 'junco, snowbird': 13,
 'indigo bunting, indigo finch, indigo bird, Passerina cyanea': 14,
 'robin, American robin, Turdus migratorius': 15,
 'bulbul': 16,
 'jay': 17,
 'magpie': 18,
 'chickadee': 19,
 'water ouzel, dipper': 20,
 'kite': 21,
 'bald eagle, American eagle, Haliaeetus leucocephalus': 22,
 'vulture': 23,
 'great grey owl, great gray owl, Strix nebulosa': 24,
 'European fire salamander, Salamandra salamandra': 25,
 'common newt, Triturus vulgaris': 26,
 'eft': 27,
 'spotted salamander, Ambystoma macu

In [6]:
import difflib

# Inputs: old id -> label text, and label text -> new id
dict1 = old_id_words_mapping
dict2 = new_word_ids

# Result: old id -> new id (None when no sufficiently similar label exists)
new_dict = {}
# Old ids left without a match, reported below so they are not discovered
# only later as a failed lookup on None.
unmatched_ids = []

# Materialise the candidate labels once instead of on every iteration.
candidate_labels = list(dict2)

# Process each item in dict1
for key, value in dict1.items():
    if value in dict2:
        # An identical label is always the best possible fuzzy match
        # (similarity 1.0), so a direct lookup gives the same answer without
        # scanning every candidate.
        new_dict[key] = dict2[value]
        continue

    # Fall back to the closest label; adjust cutoff as needed for your data
    closest_match = difflib.get_close_matches(value, candidate_labels, n=1, cutoff=0.8)
    if closest_match:
        # Use the matched label to get the corresponding new id from dict2
        new_dict[key] = dict2[closest_match[0]]
    else:
        new_dict[key] = None
        unmatched_ids.append(key)

if unmatched_ids:
    print("No close match found for old ids:", unmatched_ids)

# Manual disambiguation: two classes share the label "crane", so label
# matching cannot tell them apart. Old id 429 is taken to be the bird, which
# is new id 134.
new_dict[429] = 134
print(new_dict)

{1: 278, 2: 212, 3: 250, 4: 193, 5: 217, 6: 147, 7: 387, 8: 285, 9: 350, 10: 283, 11: 286, 12: 353, 13: 334, 14: 150, 15: 249, 16: 362, 17: 246, 18: 166, 19: 218, 20: 172, 21: 177, 22: 148, 23: 357, 24: 386, 25: 178, 26: 202, 27: 194, 28: 271, 29: 229, 30: 290, 31: 175, 32: 163, 33: 191, 34: 276, 35: 299, 36: 197, 37: 380, 38: 364, 39: 339, 40: 359, 41: 251, 42: 165, 43: 157, 44: 361, 45: 179, 46: 268, 47: 233, 48: 356, 49: 266, 50: 264, 51: 225, 52: 349, 53: 335, 54: 375, 55: 282, 56: 204, 57: 352, 58: 272, 59: 187, 60: 256, 61: 294, 62: 277, 63: 174, 64: 234, 65: 351, 66: 176, 67: 280, 68: 223, 69: 154, 70: 262, 71: 203, 72: 190, 73: 370, 74: 298, 75: 384, 76: 292, 77: 170, 78: 342, 79: 241, 80: 340, 81: 348, 82: 245, 83: 365, 84: 253, 85: 288, 86: 239, 87: 153, 88: 185, 89: 158, 90: 211, 91: 192, 92: 382, 93: 224, 94: 216, 95: 284, 96: 367, 97: 228, 98: 160, 99: 152, 100: 376, 101: 338, 102: 270, 103: 296, 104: 366, 105: 169, 106: 265, 107: 183, 108: 345, 109: 199, 110: 244, 111: 38

In [7]:
# Sanity check: every old id should map to a distinct new id.
seen_values = set()
duplicates = []

for new_id in new_dict.values():
    if new_id not in seen_values:
        # First time this new id shows up.
        seen_values.add(new_id)
        continue
    # Every repeat occurrence is recorded.
    duplicates.append(new_id)

if not duplicates:
    print("No duplicates found.")
else:
    print("Duplicate values found:", duplicates)

# Note: a duplicate reported here before the manual override was legitimate.
# Two classes carry exactly the same label, "crane":
#   - crane (bird) is new id 134
#   - crane        is new id 517
# The affected old ids are 429 and 545; the bird one must map to 134,
# hence new_dict[429] is set to 134.

No duplicates found.


In [8]:
# Build a lookup from "position in the alphabetically sorted list of the
# numbers 1..1000 (compared as text)" to the number found at that position.

# The numbers 1..1000 rendered as strings
digits = list(map(str, range(1, 1001)))

# String comparison puts '10', '100' and '1000' right after '1'
sorted_digits = sorted(digits)

# index in the sorted list -> integer value stored at that index
mapping = dict(enumerate(map(int, sorted_digits)))

# Show the mapping
mapping


{0: 1,
 1: 10,
 2: 100,
 3: 1000,
 4: 101,
 5: 102,
 6: 103,
 7: 104,
 8: 105,
 9: 106,
 10: 107,
 11: 108,
 12: 109,
 13: 11,
 14: 110,
 15: 111,
 16: 112,
 17: 113,
 18: 114,
 19: 115,
 20: 116,
 21: 117,
 22: 118,
 23: 119,
 24: 12,
 25: 120,
 26: 121,
 27: 122,
 28: 123,
 29: 124,
 30: 125,
 31: 126,
 32: 127,
 33: 128,
 34: 129,
 35: 13,
 36: 130,
 37: 131,
 38: 132,
 39: 133,
 40: 134,
 41: 135,
 42: 136,
 43: 137,
 44: 138,
 45: 139,
 46: 14,
 47: 140,
 48: 141,
 49: 142,
 50: 143,
 51: 144,
 52: 145,
 53: 146,
 54: 147,
 55: 148,
 56: 149,
 57: 15,
 58: 150,
 59: 151,
 60: 152,
 61: 153,
 62: 154,
 63: 155,
 64: 156,
 65: 157,
 66: 158,
 67: 159,
 68: 16,
 69: 160,
 70: 161,
 71: 162,
 72: 163,
 73: 164,
 74: 165,
 75: 166,
 76: 167,
 77: 168,
 78: 169,
 79: 17,
 80: 170,
 81: 171,
 82: 172,
 83: 173,
 84: 174,
 85: 175,
 86: 176,
 87: 177,
 88: 178,
 89: 179,
 90: 18,
 91: 180,
 92: 181,
 93: 182,
 94: 183,
 95: 184,
 96: 185,
 97: 186,
 98: 187,
 99: 188,
 100: 189,
 101: 19,

In [13]:
# old id -> "U<n>", where n is looked up in `mapping` using the matched new id
final_dict = {old_id: f"U{mapping[new_id]}" for old_id, new_id in new_dict.items()}

In [14]:
# Display the old-id -> "U<n>" mapping for a visual check.
final_dict

{1: 'U349',
 2: 'U29',
 3: 'U323',
 4: 'U272',
 5: 'U294',
 6: 'U230',
 7: 'U447',
 8: 'U355',
 9: 'U413',
 10: 'U353',
 11: 'U356',
 12: 'U416',
 13: 'U4',
 14: 'U233',
 15: 'U322',
 16: 'U424',
 17: 'U32',
 18: 'U248',
 19: 'U295',
 20: 'U253',
 21: 'U258',
 22: 'U231',
 23: 'U42',
 24: 'U446',
 25: 'U259',
 26: 'U280',
 27: 'U273',
 28: 'U342',
 29: 'U304',
 30: 'U36',
 31: 'U256',
 32: 'U245',
 33: 'U270',
 34: 'U347',
 35: 'U368',
 36: 'U276',
 37: 'U440',
 38: 'U426',
 39: 'U403',
 40: 'U421',
 41: 'U324',
 42: 'U247',
 43: 'U24',
 44: 'U423',
 45: 'U26',
 46: 'U34',
 47: 'U308',
 48: 'U419',
 49: 'U338',
 50: 'U336',
 51: 'U300',
 52: 'U412',
 53: 'U40',
 54: 'U436',
 55: 'U352',
 56: 'U282',
 57: 'U415',
 58: 'U343',
 59: 'U267',
 60: 'U329',
 61: 'U363',
 62: 'U348',
 63: 'U255',
 64: 'U309',
 65: 'U414',
 66: 'U257',
 67: 'U350',
 68: 'U3',
 69: 'U237',
 70: 'U334',
 71: 'U281',
 72: 'U27',
 73: 'U431',
 74: 'U367',
 75: 'U444',
 76: 'U361',
 77: 'U251',
 78: 'U406',
 79: 'U3

In [16]:
import os

# Directory that contains the numerically named folders.
# NOTE(review): the trailing 'xxxxxxxx' looks like a deliberate guard so the
# cell cannot be re-run by accident ("don't run again") - confirm before editing.
folder_path = '/projectnb/textconv/distill/mdistiller/data/imagenet/val/xxxxxxxx' #don't run again plx

# Your mapping dictionary {current_folder_name: new_folder_name}
mapping_dict = final_dict

# Pass 1: work out every rename without touching the filesystem.
planned_renames = []
for folder in os.listdir(folder_path):
    # Only entries whose name is an integer present in the mapping are renamed
    if folder.isdigit() and int(folder) in mapping_dict:
        current_folder = os.path.join(folder_path, folder)
        new_folder = os.path.join(folder_path, mapping_dict[int(folder)])
        planned_renames.append((current_folder, new_folder))

# Pass 2: validate the whole plan. Renaming is done in place, so discovering
# a clash half-way through would leave the directory partly renamed.
seen_targets = set()
problems = []
for current_folder, new_folder in planned_renames:
    if new_folder in seen_targets:
        problems.append(f"{new_folder} is the target of more than one folder")
    elif os.path.exists(new_folder):
        problems.append(f"{new_folder} already exists")
    seen_targets.add(new_folder)

if problems:
    raise RuntimeError("Nothing was renamed:\n" + "\n".join(problems))

# Pass 3: perform the renames.
for current_folder, new_folder in planned_renames:
    os.rename(current_folder, new_folder)
    print(f"Renamed {current_folder} to {new_folder}")

print("Folder renaming complete.")


Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/705 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U480
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/338 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U676
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/630 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U377
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/263 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U837
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/334 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U892
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/747 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U813
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/69 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U237
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/va

Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/288 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U820
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/997 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U79
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/51 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U300
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/125 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U285
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/580 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U516
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/993 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U939
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/609 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U168
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val

Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/713 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U549
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/346 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U590
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/417 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U185
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/36 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U276
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/342 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U536
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/755 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U71
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/826 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U722
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val

Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/99 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U235
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/192 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U290
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/972 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U584
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/676 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U830
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/100 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U437
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/513 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U716
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/217 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U191
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/va

Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/434 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U224
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/847 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U471
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/79 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U315
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/772 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U91
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/843 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U735
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/476 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U143
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val/889 to /projectnb/textconv/distill/mdistiller/data/imagenet/val/U999
Renamed /projectnb/textconv/distill/mdistiller/data/imagenet/val