In [None]:
import os
import os.path as osp
import pandas as pd
from glob import glob

In [None]:
# Directory holding the processed CSV files; the relative path is resolved
# against the current working directory, so the notebook must be run from its
# own folder for '../collected_data/processed/' to point at the right place.
data_dir = osp.abspath('../collected_data/processed/')

# Full set of processed entries; the first CSV column is used as the index.
full_path = osp.join(data_dir, 'full_processed_collected_data.csv')
full_df = pd.read_csv(full_path, index_col=0)

print(f'{len(full_df)} entries in full df')

# "Valid" set of processed entries, read the same way as the full set.
# NOTE(review): presumably a filtered subset of the full file -- confirm
# against the processing step that writes these CSVs.
valid_path = osp.join(data_dir, 'valid_processed_collected_data.csv')
valid_df = pd.read_csv(valid_path, index_col=0)

print(f'{len(valid_df)} entries in valid df')

In [None]:
# Keep only the valid entries whose `comments` field is not missing.
valid_with_comments_df = valid_df[valid_df['comments'].notna()]
print(f'{len(valid_with_comments_df)} entries with comments in valid df')

# Export a reduced view (identifier, annotations, head noun, comment) of the
# commented entries next to the other processed files.
columns = [
    'item_identifyer',
    'raw_annotation',
    'clean_annotation',
    'head_noun',
    'comments',
]
comments_path = osp.join(data_dir, 'valid_commented_collected_data.csv')
valid_with_comments_df.loc[:, columns].to_csv(comments_path)

In [None]:
# Count the rows per `tangram_id`, then tally how many ids share each row count
# (index of the result = rows per id, value = number of ids with that many rows).
valid_df.groupby('tangram_id').size().value_counts()

In [None]:
# Count the rows per `scene`, then tally how many scenes share each row count
# (index of the result = rows per scene, value = number of scenes with that many rows).
valid_df.groupby('scene').size().value_counts()

In [None]:
# Count the rows per `item_id`, then tally how many items share each row count
# (index of the result = rows per item, value = number of items with that many rows).
valid_df.groupby('item_id').size().value_counts()

In [None]:
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset

def get_first_head_noun(name):
    """Return the first '/'-separated alternative of `name`, normalised.

    Everything from the first '/' onwards is discarded; the remaining text is
    stripped of surrounding whitespace and lower-cased. A string without any
    '/' is normalised as a whole.
    """
    first_alternative, _, _ = name.partition('/')
    return first_alternative.strip().lower()

def get_possible_synsets(name):
    """Look up `name` in WordNet and return the synsets whose part of speech is 'n'.

    The synsets are returned as a list, in the order `wn.synsets` yields them;
    the list is empty when no such synset is found.
    """
    noun_synsets = []
    for candidate in wn.synsets(name):
        if candidate.pos() == 'n':
            noun_synsets.append(candidate)
    return noun_synsets

# Work on an explicit copy of the selected columns: the frame is assigned into
# several times below, and doing that on a bare selection of `valid_df`
# triggers pandas' SettingWithCopyWarning.
synset_df = valid_df[['item_identifyer', 'tangram', 'scene', 'raw_annotation', 'clean_annotation', 'head_noun', 'comments', 'image_url']].copy()

# reduce each head noun to its first '/'-separated alternative (stripped, lower-cased)
synset_df['head_noun'] = synset_df['head_noun'].map(get_first_head_noun)

# get synsets
synset_df['possible_synsets'] = synset_df['head_noun'].map(get_possible_synsets)
# the first candidate is taken as the selected synset; '' marks "no noun synset found"
synset_df['selected_synset'] = synset_df['possible_synsets'].map(lambda x: x[0] if len(x) > 0 else '')
synset_df['synset_definition'] = synset_df['selected_synset'].map(lambda x: x.definition() if isinstance(x, Synset) else '')

# convert synsets to strings (the '' placeholder is passed through unchanged)
synset_df['possible_synsets'] = synset_df['possible_synsets'].map(lambda l_s: [s.name() for s in l_s])
synset_df['selected_synset'] = synset_df['selected_synset'].map(lambda x: x.name() if isinstance(x, Synset) else x)
# sort, then renumber the rows 0..n-1; the old index is dropped right away since
# it is not part of the final column selection below
synset_df = synset_df.sort_values(by=['tangram', 'scene', 'head_noun']).reset_index(drop=True)

# add an empty column for corrected head nouns, then reorder columns
# ('tangram' and 'scene' were only needed for sorting and are left out here)
synset_df['corrected_head_noun'] = ''
synset_df = synset_df[[
    'item_identifyer', 'raw_annotation', 'clean_annotation', 'head_noun', 'corrected_head_noun', 'selected_synset', 'synset_definition', 'possible_synsets', 'comments', 'image_url'
]]


synset_df.head()

In [None]:
# Cut the frame at its midpoint; with an odd number of rows the second part
# holds one row more than the first.
half = synset_df.shape[0] // 2
synset_df_0, synset_df_1 = synset_df.iloc[:half], synset_df.iloc[half:]

# Sanity check: stacking both parts again must reproduce the original frame.
assert synset_df.equals(
    pd.concat([synset_df_0, synset_df_1])
)

# Write each part to its own CSV, numbered by its position in the split.
for i, df in enumerate((synset_df_0, synset_df_1)):
    file_name = f'valid_processed_synsets_{i}_auto.csv'
    file_path = osp.join(data_dir, file_name)

    print(f'write df to {file_path}')
    df.to_csv(file_path)