In [1]:
# Mount Google Drive under /content/drive so the project files are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/1016/Project/Typicality_project/

/content/drive/MyDrive/1016/Project/Typicality_project


In [3]:
import numpy as np
import pandas as pd
import torch
from PIL import Image
from torchvision import transforms
import os
import matplotlib.pyplot as plt
import ast
from functools import partial

# First attempt: Use WordNet similarity score to find the probability corresponding to the closest label 

In [4]:
from nltk.corpus import wordnet

In [5]:
import nltk
# Fetch the WordNet corpus needed by nltk.corpus.wordnet; nltk reports
# "already up-to-date" when the package is present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load the raw Overfeat results: tab-separated, header on the first line,
# first column used as the index.
df_old = pd.read_csv(
    './Overfeat/overfeat_results_original.txt',
    sep='\t',
    header=0,
    index_col=0,
)

In [7]:
# Replace the positional column names "0".."3" with descriptive ones.
column_names = {"0": "Type", "1": "dir", "2": "name", "3": "score"}
df_old = df_old.rename(columns=column_names)

In [8]:
# Load the ImageNet-label -> WordNet-synset table and transpose it so that
# 'label' and 'id' are columns and each entry is a row.
ltw = pd.read_json('imagenet_label_to_wordnet_synset.json').transpose()

In [9]:
# Look up the WordNet id of a readable label in imagenet_label_to_wordnet_synset.json.
# NOTE: despite its name, this definition maps readable label -> WordNet id; the
# notebook later redefines the same name with the opposite direction.
def convertIdToReadable(id_label):
    """Return the WordNet id of the first ``ltw`` row whose ``label`` equals ``id_label``.

    Parameters
    ----------
    id_label : str
        Readable ImageNet label to look up in the ``label`` column of ``ltw``.

    Returns
    -------
    The matching value from the ``id`` column, or ``None`` when no row matches.
    """
    # Boolean-mask lookup: independent of how ltw is indexed and avoids a
    # Python-level scan with label-based indexing on every call.
    matches = ltw.loc[ltw['label'] == id_label, 'id']
    return matches.iloc[0] if len(matches) else None

In [10]:
# Attach the WordNet id that corresponds to each predicted label name.
df_old['id'] = df_old['name'].map(convertIdToReadable)

In [None]:
# Group the predictions per image:
#   (Type, dir) -> [{label name: score}, {wordnet id: score}]
result_dict = {}
for _, record in df_old.iterrows():
    image_key = (record['Type'], record['dir'])
    # setdefault creates the pair of dicts the first time an image is seen.
    scores_by_name, scores_by_id = result_dict.setdefault(image_key, [{}, {}])
    scores_by_name[record['name']] = record['score']
    scores_by_id[record['id']] = record['score']
result_dict[('Banana',1)]

In [12]:
df = pd.DataFrame(columns = ['type','dir','id_labels','readable_labels'])

In [13]:
# Build one row per image. The rows are collected first and the frame is created in
# a single call: DataFrame.append was removed in pandas 2.0 and growing a frame row
# by row is quadratic. The comprehension also avoids shadowing the builtins
# `type` and `dir`.
records = [
    {
        'type': image_type,
        'dir': image_dir,
        'id_labels': scores_by_id,          # {wordnet id: score}
        'readable_labels': scores_by_name,  # {label name: score}
    }
    for (image_type, image_dir), (scores_by_name, scores_by_id) in result_dict.items()
]
df = pd.DataFrame(records, columns=['type', 'dir', 'id_labels', 'readable_labels'])

In [14]:
df.to_csv('./Overfeat/overfeat_scores_wordnet_id.csv')

In [15]:
df.head()

Unnamed: 0,type,dir,id_labels,readable_labels
0,Plane,1,"{'02690373-n': 0.794895, '04592741-n': 0.20368...","{'airliner': 0.794895, 'wing': 0.2036869999999..."
1,Plane,10,"{'02690373-n': 0.8531709999999999, '04592741-n...","{'airliner': 0.8531709999999999, 'wing': 0.114..."
2,Plane,11,"{'02690373-n': 0.48173999999999995, '04592741-...","{'airliner': 0.48173999999999995, 'wing': 0.14..."
3,Plane,12,"{'04592741-n': 0.318848, '03874293-n': 0.16218...","{'wing': 0.318848, 'paddlewheel, paddle wheel'..."
4,Plane,13,"{'04552348-n': 0.145282, '04065272-n': 0.14077...","{'warplane, military plane': 0.145282, 'recrea..."


Input dataframe columns: `type`, `dir`, `id_labels`, `readable_labels` (see alexnet.ipynb for how the WordNet ids are obtained for the corresponding labels)

In [None]:
# Print every candidate synset and its definition for each type, so that the
# synset with the correct sense can be selected by hand.
type_names = np.unique(df['type'].values)
for t in type_names:
    # 'Beach' is looked up as 'seashore'; every other type uses its lowercased name.
    query = 'seashore' if t == 'Beach' else t.lower()
    print(f'{t}: ')
    for candidate in wordnet.synsets(query):
        print(candidate)
        print(candidate.definition())
    print()

In [17]:
# Hand-picked WordNet synset for each type, listed in the same order as type_names
# (np.unique returns the type names sorted).
type_synsets = ['banana.n.02','seashore.n.01','car.n.01','church.n.02','beacon.n.03','mountain.n.01','coffee_mug.n.01','airplane.n.01']
# The pairing is purely positional, so fail loudly if the number of types changes
# instead of silently dropping synsets or raising an opaque IndexError.
if len(type_names) != len(type_synsets):
    raise ValueError(
        f'expected {len(type_synsets)} types to match type_synsets, got {len(type_names)}'
    )
type_final = dict(zip(type_names, type_synsets))

In [None]:
type_final

In [19]:
# read labels 
with open("./imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# read labels to wordnet synsets
ltw = pd.read_json('./imagenet_label_to_wordnet_synset.json').T

In [20]:
# compute the similarity score between the synset of each top label and the type synset
def getSyns(similarity, closest, row, threshold=0.5):
    """Score every predicted label of one image against its type and pick the closest.

    Parameters
    ----------
    similarity : list
        Output list; receives one dict ``{wordnet id: path similarity}`` per call.
    closest : list
        Output list; receives one single-item dict ``{wordnet id: path similarity}``
        per call, describing the selected label.
    row : mapping
        Must provide ``row['type']`` (a key of ``type_final``) and
        ``row['id_labels']`` (the string form of a dict keyed by WordNet ids such
        as ``'02690373-n'``).
    threshold : float, optional
        Minimum similarity for a label to be accepted immediately. Defaults to 0.5.

    Selection rule: the first label (in dict order) whose similarity is at least
    ``threshold``; otherwise the label with the highest similarity, ties broken in
    favour of the earliest label.

    Raises
    ------
    ValueError
        If the row contains no usable label id.
    """
    # type name
    category = row['type']

    # predicted labels' ids in wordnet
    labels = ast.literal_eval(row['id_labels'])

    # The type synset is the same for every label, so resolve it once.
    cat_syns = wordnet.synset(type_final[category])

    sim = dict()
    chosen = None
    for k in labels:
        if not k:
            continue  # skip empty / missing ids
        parts = k.split('-')
        pos, synid = parts[1], int(parts[0])  # part of speech, numeric offset
        k_syns = wordnet.synset_from_pos_and_offset(pos, synid)  # label synset
        s = k_syns.path_similarity(cat_syns)
        sim[k] = s

        # remember the first label that reaches the threshold
        # (path_similarity may return None, which must not be compared)
        if chosen is None and s is not None and s >= threshold:
            chosen = k

    # fall back to the label with the highest similarity if none reaches the threshold
    if chosen is None:
        if not sim:
            raise ValueError(f'no usable label ids in row of type {category!r}')
        # Rank over `sim` itself so skipped keys cannot shift the index; max()
        # returns the first maximal key, and a None score ranks lowest.
        chosen = max(sim, key=lambda label_id: -1.0 if sim[label_id] is None else sim[label_id])

    closest.append({chosen: sim[chosen]})
    similarity.append(sim)

In [21]:
# get the readable label for a WordNet id from imagenet_label_to_wordnet_synset.json
# NOTE: this redefines (and shadows) the earlier function of the same name, which
# mapped in the opposite direction (label -> id).
def convertIdToReadable(id_label):
    """Return the readable label of the first ``ltw`` row whose ``id`` equals ``id_label``.

    Parameters
    ----------
    id_label : str
        WordNet id (for example ``'02690373-n'``) to look up in the ``id`` column
        of ``ltw``.

    Returns
    -------
    The matching value from the ``label`` column, or ``None`` when no row matches.
    """
    # Boolean-mask lookup: independent of how ltw is indexed and avoids a
    # Python-level scan with label-based indexing on every call.
    matches = ltw.loc[ltw['id'] == id_label, 'label']
    return matches.iloc[0] if len(matches) else None

In [22]:
df = pd.read_csv('Overfeat/overfeat_scores_wordnet_id.csv',index_col=0)

In [23]:
# getSyns fills these two lists in place, one entry per row of df.
similarity = []
closest = []
getSynsPartial = partial(getSyns, similarity, closest)
df.apply(getSynsPartial, axis=1)

# Every entry of `closest` is a single-item dict {wordnet id: similarity score}.
closest_pairs = [next(iter(item.items())) for item in closest]

df['similarity_score'] = similarity
df['closest_id_label'] = [label_id for label_id, _ in closest_pairs]
df['closest_similarity_score'] = [score for _, score in closest_pairs]
df['closest_readable_label'] = df['closest_id_label'].apply(convertIdToReadable)
# Score stored in id_labels for the selected label.
df['closest_prob'] = [
    ast.literal_eval(labels)[label_id]
    for labels, label_id in zip(df['id_labels'], df['closest_id_label'])
]

In [None]:
df.to_csv('./Overfeat/overfeat_scores_wordnet_id_closest_label.csv')

Potential issues: 
1. The top-10 cutoff and the 0.5 threshold are arbitrary choices - e.g. see `df.loc[4]`, where the top label is `cliff` (similarity score 0.25) but `alp` is the chosen label.
2. When none of the top 10 labels' similarity scores pass the 0.5 threshold, the label with the maximum similarity score is chosen, and ties are broken by taking the first such label, i.e. the one with the highest probability among the labels sharing that maximum score - e.g. see `df.loc[9]`.


# Second attempt: Map all labels using similarity score

In [48]:
# Load the ImageNet class names, one per line, stripped of surrounding whitespace.
with open("./imagenet_classes.txt", "r") as f:
    categories = [line.strip() for line in f]
# Reload the label -> WordNet-synset table, transposed so each entry is a row.
ltw = pd.read_json('./imagenet_label_to_wordnet_synset.json').transpose()

In [49]:
# Add one column per type to ltw, holding the path similarity between that type's
# synset and the synset of every ImageNet label.
for t in type_final:
    # The type synset is identical for every label, so look it up once per type
    # instead of once per row.
    type_synset = wordnet.synset(type_final[t])
    sim = []
    # Iterate the id column directly rather than materialising a full row per label.
    for k in ltw['id']:
        parts = k.split('-')
        pos, synid = parts[1], int(parts[0]) # part of speech (all noun in this case), id
        k_syns = wordnet.synset_from_pos_and_offset(pos, synid) # label synset
        sim.append(k_syns.path_similarity(type_synset))
    ltw[t] = sim

In [50]:
# For each type, collect the ltw row indices whose similarity to the type passes the
# threshold (0.3 for 'Plane', 0.5 for every other type), most similar first.
type_labels = dict()
for t in type_final:
    threshold = 0.3 if t == 'Plane' else 0.5
    passing = ltw[ltw[t] >= threshold].sort_values(t, ascending=False)
    temp = np.array(passing.index)
    if temp.size == 1:
        # exactly one match: store the scalar row index
        type_labels[t] = temp[0]
        print(f"{t}: {ltw.loc[type_labels[t]]['label']}")
        print(f"{t}: {ltw.loc[type_labels[t]][t]}")
    else:
        # zero or several matches: keep the whole array of row indices
        type_labels[t] = temp
        print(f"{t}: {ltw.loc[type_labels[t]]['label'].values}")
        print(f"{t}: {ltw.loc[type_labels[t]][t].values}")
    print()

Banana: banana
Banana: 1.0

Beach: seashore, coast, seacoast, sea-coast
Beach: 1.0

Car: ['ambulance'
 'beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon'
 'cab, hack, taxi, taxicab' 'convertible' 'jeep, landrover'
 'limousine, limo' 'minivan' 'Model T' 'racer, race car, racing car'
 'sports car, sport car']
Car: [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]

Church: church, church building
Church: 1.0

Lighthouse: beacon, lighthouse, beacon light, pharos
Lighthouse: 1.0

Mountain: ['alp' 'volcano']
Mountain: [0.5 0.5]

Mug: coffee mug
Mug: 1.0

Plane: ['airliner' 'warplane, military plane']
Plane: [0.5        0.33333333]



Select one label that's closest to our definition of the categories

# Final label selection: select one label that most closely matches our definitions of the categories

In [51]:
# Manually pin a single ltw row index for the types that matched several labels.
# 404 is the `airliner` row (02690373-n). 436 and 970 are presumably one of the
# Car / Mountain candidates listed by the threshold search - TODO confirm which.
type_labels.update({'Car': 436, 'Mountain': 970, 'Plane': 404})

In [52]:
# Rows of ltw chosen for each type, restricted to the first three columns.
# A single .loc call plus .copy() yields an independent frame, so adding a column
# below cannot trigger SettingWithCopyWarning.
selected = ltw.loc[list(type_labels.values()), ltw.columns[:3]].copy()
# Materialise the keys as a list rather than assigning a dict view.
selected['type'] = list(type_labels.keys())

In [53]:
df = pd.read_csv('Overfeat/overfeat_scores_wordnet_id.csv',index_col=0)

In [54]:
df.head()

Unnamed: 0,type,dir,id_labels,readable_labels
0,Plane,1,"{'02690373-n': 0.794895, '04592741-n': 0.20368...","{'airliner': 0.794895, 'wing': 0.2036869999999..."
1,Plane,10,"{'02690373-n': 0.8531709999999999, '04592741-n...","{'airliner': 0.8531709999999999, 'wing': 0.114..."
2,Plane,11,"{'02690373-n': 0.48173999999999995, '04592741-...","{'airliner': 0.48173999999999995, 'wing': 0.14..."
3,Plane,12,"{'04592741-n': 0.318848, '03874293-n': 0.16218...","{'wing': 0.318848, 'paddlewheel, paddle wheel'..."
4,Plane,13,"{'04552348-n': 0.145282, '04065272-n': 0.14077...","{'warplane, military plane': 0.145282, 'recrea..."


In [55]:
def getScoreForMappedLabels(matched_label,matched_prob,row):
    """Record the score an image received for the label mapped to its type.

    Parameters
    ----------
    matched_label : list
        Output list; receives the WordNet id mapped to the row's type.
    matched_prob : list
        Output list; receives the score stored for that id in ``row['id_labels']``,
        or ``0`` when the id is not among the row's labels.
    row : mapping
        Must provide ``row['type']`` (a key of ``type_labels``) and
        ``row['id_labels']`` (the string form of a dict keyed by WordNet ids).
    """
    scores_by_id = ast.literal_eval(row['id_labels'])
    # WordNet id of the ltw row selected for this type
    mapped_label = ltw.loc[type_labels[row['type']]]['id']
    matched_label.append(mapped_label)
    # a direct dict lookup replaces the manual scan over the keys
    matched_prob.append(scores_by_id.get(mapped_label, 0))

In [56]:
# getScoreForMappedLabels appends one entry per row of df to each of these lists.
matched_label = []
matched_prob = []
getScoreForMappedLabelsPartial = partial(
    getScoreForMappedLabels, matched_label, matched_prob
)
df.apply(getScoreForMappedLabelsPartial, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
123    None
124    None
125    None
126    None
127    None
Length: 128, dtype: object

In [57]:
df['matched_label'] = matched_label
df['matched_prob'] = matched_prob
# Build the id -> readable-label lookup once instead of filtering ltw for every row.
# drop_duplicates keeps the first row per id, matching the previous `.values[0]` choice.
id_to_label = ltw.drop_duplicates(subset='id').set_index('id')['label']
df['matched_readable'] = df['matched_label'].map(id_to_label)

In [58]:
df

Unnamed: 0,type,dir,id_labels,readable_labels,matched_label,matched_prob,matched_readable
0,Plane,1,"{'02690373-n': 0.794895, '04592741-n': 0.20368...","{'airliner': 0.794895, 'wing': 0.2036869999999...",02690373-n,0.794895,airliner
1,Plane,10,"{'02690373-n': 0.8531709999999999, '04592741-n...","{'airliner': 0.8531709999999999, 'wing': 0.114...",02690373-n,0.853171,airliner
2,Plane,11,"{'02690373-n': 0.48173999999999995, '04592741-...","{'airliner': 0.48173999999999995, 'wing': 0.14...",02690373-n,0.481740,airliner
3,Plane,12,"{'04592741-n': 0.318848, '03874293-n': 0.16218...","{'wing': 0.318848, 'paddlewheel, paddle wheel'...",02690373-n,0.113196,airliner
4,Plane,13,"{'04552348-n': 0.145282, '04065272-n': 0.14077...","{'warplane, military plane': 0.145282, 'recrea...",02690373-n,0.057365,airliner
...,...,...,...,...,...,...,...
123,Banana,5,"{'07753592-n': 0.364579, '07930864-n': 0.12127...","{'banana': 0.364579, 'cup': 0.121272, 'coil, s...",07753592-n,0.364579,banana
124,Banana,6,"{'07753592-n': 0.9627370000000001, '07717410-n...","{'banana': 0.9627370000000001, 'acorn squash':...",07753592-n,0.962737,banana
125,Banana,7,"{'07753592-n': 1.0, '03255030-n': 0.0, '078311...","{'banana': 1.0, 'dumbbell': 0.0, 'carbonara': ...",07753592-n,1.000000,banana
126,Banana,8,"{'07753592-n': 0.878092, '03680355-n': 0.03566...","{'banana': 0.878092, 'Loafer': 0.0356656, 'clo...",07753592-n,0.878092,banana


In [107]:
df.to_csv('overfeat_scores_final.csv')