In [1]:
import itertools
import os
import numpy as np
import pandas as pd



# Get the root_path for this jupyter notebook repo.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))

path_files_locus_index = os.path.join(
    repo_path, 'files', 'tell-dor', 'tell-dor-area-g-locus-image-index.csv'
)

# Path to the Tell Dor file metadata CSV
path_files = os.path.join(
    repo_path, 'files', 'tell-dor', 'tell-dor-files.csv'
)
# Path to the Tell Dor locus metadata CSV 
path_loci = os.path.join(
    repo_path, 'files', 'tell-dor', 'tell-dor-loci.csv'
)
# Output path for associations between the files and the loci.
path_files_contexts = os.path.join(
    repo_path, 'files', 'tell-dor', 'tell-dor-files-contexts.csv'
)


# Read the file - locus index supplied by the Tell Dor team.
fl_df = pd.read_csv(path_files_locus_index)

# Read the file metadata CSV into dataframe f_df.
f_df = pd.read_csv(path_files)

# Read the locus (and wall) CSV into dataframe l_df.
l_df = pd.read_csv(path_loci)

fl_df['Locus_Wall'] = fl_df['Locus_Wall'].astype(str) 
fl_df['Locus ID'] = np.nan
for i, row in fl_df.iterrows():
    wall_id = 'Wall ' + row['Locus_Wall']
    locus_id = 'Locus ' + row['Locus_Wall']
    print('Look for {} or {}'.format(wall_id, locus_id))
    id_indx = ((l_df['Locus ID']==wall_id)|(l_df['Locus ID']==locus_id))
    if l_df[id_indx].empty:
        continue
    up_indx = (fl_df['Locus_Wall'] == row['Locus_Wall'])
    fl_df.loc[up_indx, 'Locus ID'] = l_df[id_indx]['Locus ID'].iloc[0]
    print('Update {} with {}'.format(row['Locus_Wall'], l_df[id_indx]['Locus ID'].iloc[0]))

fl_df.to_csv(path_files_locus_index, index=False)



# Set up a dict for File and Locus (and Wall) associations.
file_locus_data = {
    'File ID':[], 
    'Locus ID': [],
}

# Set up a dict for File and Area associations.
# NOTE: An "Area" is an aggregation of multiple squares in the locus/wall
# datafile. Eric grouped these to make search / browsing easier. They
# don't really have any purpose or value for interpretation.
file_square_data = {
    'File ID':[], 
    'Area': [],
}


def add_to_file_context_data(
    file_ids, 
    context_ids,  
    data,
    context_id_col='Locus ID'
):
    """Adds records of file and context associations to a data dict"""
    if not isinstance(context_ids, list):
        context_ids = [context_ids]
    # Get the cross product of all the file_ids and the
    # context_ids
    crossprod = list(itertools.product(file_ids, context_ids))
    data['File ID'] += [c[0] for c in crossprod]
    data[context_id_col] += [c[1] for c in crossprod]
    return data


Look for Wall 18839 or Locus 18839
Look for Wall 18229 or Locus 18229
Update 18229 with Wall 18229
Look for Wall 18308 or Locus 18308
Update 18308 with Locus 18308
Look for Wall 9729 or Locus 9729
Update 9729 with Wall 9729
Look for Wall 9262 or Locus 9262
Update 9262 with Wall 9262
Look for Wall 9684 or Locus 9684
Update 9684 with Wall 9684
Look for Wall 9626 or Locus 9626
Update 9626 with Wall 9626
Look for Wall 9278 or Locus 9278
Update 9278 with Wall 9278
Look for Wall 9704 or Locus 9704
Update 9704 with Wall 9704
Look for Wall 9857 or Locus 9857
Update 9857 with Locus 9857
Look for Wall 18047 or Locus 18047
Update 18047 with Locus 18047
Look for Wall 18033 or Locus 18033
Update 18033 with Locus 18033
Look for Wall 18297 or Locus 18297
Update 18297 with Locus 18297
Look for Wall 18298 or Locus 18298
Update 18298 with Locus 18298
Look for Wall 18275 or Locus 18275
Update 18275 with Locus 18275
Look for Wall 18243 or Locus 18243
Update 18243 with Locus 18243
Look for Wall 9762 or Loc

Update 9262 with Wall 9262
Look for Wall 18048 or Locus 18048
Update 18048 with Wall 18048
Look for Wall 18035 or Locus 18035
Update 18035 with Locus 18035
Look for Wall 9140 or Locus 9140
Look for Wall 9953 or Locus 9953
Update 9953 with Locus 9953
Look for Wall 9728 or Locus 9728
Update 9728 with Wall 9728
Look for Wall 9626 or Locus 9626
Update 9626 with Wall 9626
Look for Wall 9704 or Locus 9704
Update 9704 with Wall 9704
Look for Wall 9834 or Locus 9834
Update 9834 with Locus 9834
Look for Wall 9728 or Locus 9728
Update 9728 with Wall 9728
Look for Wall 9824 or Locus 9824
Update 9824 with Locus 9824
Look for Wall 9727 or Locus 9727
Update 9727 with Locus 9727
Look for Wall 9736 or Locus 9736
Update 9736 with Locus 9736
Look for Wall 9204 or Locus 9204
Update 9204 with Locus 9204
Look for Wall 18042 or Locus 18042
Update 18042 with Locus 18042
Look for Wall 9727 or Locus 9727
Update 9727 with Locus 9727
Look for Wall 18042 or Locus 18042
Update 18042 with Locus 18042
Look for Wall 

Update 9211 with Wall 9211
Look for Wall 18516 or Locus 18516
Update 18516 with Wall 18516
Look for Wall 18515 or Locus 18515
Update 18515 with Wall 18515
Look for Wall 9823 or Locus 9823
Update 9823 with Locus 9823
Look for Wall 9262 or Locus 9262
Update 9262 with Wall 9262
Look for Wall 9275 or Locus 9275
Look for Wall 9262 or Locus 9262
Update 9262 with Wall 9262
Look for Wall 18021 or Locus 18021
Update 18021 with Locus 18021
Look for Wall 9275 or Locus 9275
Look for Wall 9140 or Locus 9140
Look for Wall 18067 or Locus 18067
Update 18067 with Locus 18067
Look for Wall 9275 or Locus 9275
Look for Wall 9262 or Locus 9262
Update 9262 with Wall 9262
Look for Wall 18252 or Locus 18252
Update 18252 with Locus 18252
Look for Wall 9262 or Locus 9262
Update 9262 with Wall 9262
Look for Wall 9140 or Locus 9140
Look for Wall 9140 or Locus 9140
Look for Wall 18263 or Locus 18263
Update 18263 with Locus 18263
Look for Wall 18271 or Locus 18271
Update 18271 with Locus 18271
Look for Wall 18511 o

In [2]:
f_df.head(3)

Unnamed: 0,File ID,Illustration,Chp,Part Number,Caption,DB ID,FileName,FileType,Error in print
0,Figure 1.1,Figure,1.0,1,Fig. 1.1. Map of Tel Dor showing Area G in rel...,d09Z1-1001,d09Z1-1001.tif,tif,
1,Figure 1.2,Figure,1.0,2,Fig. 1.2. A reconstruction of the Roman street...,d09Z1-1002,d09Z1-1002.tif,tif,
2,Figure 1.3,Figure,1.0,3,Fig. 1.3. Area G during the first season in 19...,p08Z3-1365,p08Z3-1365.tif,tif,


In [3]:
l_df.head(3)

Unnamed: 0,Region,Site,Area,Note,Locus ID,Original Sort Order,Locus/Wall,Number,Square,Phase,Contextual Integrity (I) Code,Contextual Integrity (I),Phasing of Contents (PoC),Comments,Context,Chapter,Status
0,Israel,Tel Dor,AI–AK:31–32,"To facilitate navigation, Open Context editors...",Locus 9000,1,Locus,9000,AI–AK/31–32,1a,--,--,⪰1,Topsoil on top of ashlar pavement of Phase 1a ...,--,"Dor IIIA: 5, 7, 8, 9, 11, 12, 13, 16",Not Final
1,Israel,Tel Dor,AJ:30–34,"To facilitate navigation, Open Context editors...",Locus 9001,2,Locus,9001,AJ/32,1a,--,--,⪰1,Topsoil down to fragment of F9000,--,Dor IIIA: 12,Not Final
2,Israel,Tel Dor,AJ:30–34,"To facilitate navigation, Open Context editors...",Locus 9002,3,Locus,9002,AJ/33,--,n,non-stratified,--,Topsoil,--,Dor IIIA: 13,Not Final


In [4]:

# Find matching Loci (including Wall Loci) by matching their IDs
# with text in the file metadata 'Caption' column.
for locus_wall_id in l_df['Locus ID'].unique().tolist():
    l_w_id = locus_wall_id.replace('Locus ', 'L').replace('Wall ', 'W')
    
    # l_w_mum_id is for locus or wall IDs that are long unlikely to be
    # a false positive, and lack a "L" or "W" in the caption.
    l_w_num_id = l_w_id.replace('L', ' ').replace('W', ' ')
    if len(l_w_num_id) >= 6:
        # Catch cases where the Locus / Wall ID is long like 
        # '18347'.
        l_w_indx = (
            f_df['Caption'].str.contains(l_w_id)
            | f_df['Caption'].str.contains(l_w_num_id)
        )
    else:
        # The locus / wall id is too short to trust without a 
        # "L" or "W" prefix.
        l_w_indx = f_df['Caption'].str.contains(l_w_id)
    
    if f_df[l_w_indx].empty:
        # We didn't find a match, so continue.
        continue
    print('Found: {} for {} as {}'.format(
            len(f_df[l_w_indx]), 
            locus_wall_id,
            l_w_id,
        )
    )
    file_ids = f_df[l_w_indx]['File ID'].unique().tolist()
    file_locus_data = add_to_file_context_data(
        file_ids, 
        locus_wall_id, 
        file_locus_data
    )

# Now make a dataframe of the file - locus associations
file_locus_df = pd.DataFrame(data=file_locus_data)
print('File and Locus Associations (Found: {})'.format(
    len(file_locus_df.index))
)

Found: 3 for Wall 9003 as W9003
Found: 1 for Wall 9015 as W9015
Found: 1 for Wall 9019 as W9019
Found: 2 for Locus 9025 as L9025
Found: 1 for Wall 9041 as W9041
Found: 1 for Wall 9047 as W9047
Found: 2 for Locus 9048 as L9048
Found: 3 for Wall 9058 as W9058
Found: 5 for Wall 9065 as W9065
Found: 14 for Wall 9066 as W9066
Found: 3 for Wall 9096 as W9096


  return func(self, *args, **kwargs)


Found: 4 for Wall 9147 as W9147
Found: 2 for Wall 9162 as W9162
Found: 3 for Locus 9168 as L9168
Found: 2 for Wall 9180 as W9180
Found: 1 for Locus 9185 as L9185
Found: 1 for Locus 9202 as L9202
Found: 2 for Locus 9204 as L9204
Found: 21 for Wall 9211 as W9211
Found: 2 for Wall 9212 as W9212
Found: 6 for Wall 9216 as W9216
Found: 2 for Wall 9217 as W9217
Found: 1 for Wall 9243 as W9243
Found: 2 for Locus 9251 as L9251
Found: 1 for Wall 9253 as W9253
Found: 30 for Wall 9262 as W9262
Found: 30 for Wall 9266 as W9266
Found: 1 for Wall 9274 as W9274
Found: 1 for Wall 9275a–b as W9275a–b
Found: 7 for Wall 9278 as W9278
Found: 1 for Wall 9279 as W9279
Found: 10 for Wall 9282 as W9282
Found: 1 for Wall 9290 as W9290
Found: 1 for Locus 9298 as L9298
Found: 1 for Wall 9301b as W9301b
Found: 2 for Locus 9326 as L9326
Found: 1 for Wall 9340 as W9340
Found: 1 for Locus 9346 as L9346
Found: 7 for Wall 9400 as W9400
Found: 4 for Wall 9408a as W9408a
Found: 7 for Wall 9408b as W9408b
Found: 1 for Wal

In [5]:
# Find matching Loci (including Wall Loci) by matching their Squares
# with text in the file metadata 'Caption' column.
l_df_sq = l_df[~l_df['Square'].isnull()]
for square in l_df_sq['Square'].astype(str).unique().tolist():
    sq_indx = f_df['Caption'].str.contains(square)
    if len(square) < 3 or f_df[sq_indx].empty:
        # Not enough characters for secure match.
        continue
    # Get all file_ids that have his square in their captions
    file_ids = f_df[sq_indx]['File ID'].unique().tolist()
    # Get all the locus ids that are associated with this square
    area_ids = l_df[
        l_df['Square']==square
    ]['Area'].unique().tolist()
    print('Found: {} files with square {} and {} areas'.format(
            len(f_df[sq_indx]), 
            square,
            len(area_ids)
        )
    )
    # Now add to the file_locus_data.
    file_square_data = add_to_file_context_data(
        file_ids, 
        area_ids, 
        file_square_data,
        context_id_col='Area'
    )

# Now make a dataframe of the file - area associations
file_area_df = pd.DataFrame(data=file_square_data)
print('File and Area Associations (Found: {})'.format(
    len(file_area_df.index))
)

Found: 22 files with square AJ/32 and 1 areas
Found: 14 files with square AJ/33 and 1 areas
Found: 3 files with square AJ–AK/33 and 1 areas
Found: 22 files with square AI/32 and 1 areas
Found: 1 files with square AJ/31 and 1 areas
Found: 20 files with square AI/31 and 1 areas
Found: 1 files with square AI/31–32 and 1 areas
Found: 1 files with square AI/31–33 and 1 areas
Found: 4 files with square AJ/32–33 and 1 areas
Found: 12 files with square AK/32 and 1 areas
Found: 6 files with square AJ/34 and 1 areas
Found: 26 files with square AI/33 and 1 areas
Found: 9 files with square AJ–AK/32 and 1 areas
Found: 1 files with square AI/34 and 1 areas
Found: 6 files with square AH/33 and 1 areas
Found: 5 files with square AK/33 and 1 areas
Found: 2 files with square AK/34 and 1 areas
Found: 4 files with square AH/34 and 1 areas
Found: 3 files with square AI–AJ/32 and 1 areas
Found: 1 files with square AI–AJ/33 and 1 areas
Found: 8 files with square AH–AI/33 and 1 areas
Found: 1 files with squar

In [6]:
context_df = pd.merge(file_locus_df, file_area_df, on='File ID', how='outer')
context_linked_files = context_df['File ID'].unique().tolist()
print('Found File and Context Associations for {} unique files (total rows: {})'.format(
    len(context_linked_files),
    len(context_df.index))
)


# Get a list of files that do NOT have context associations
no_context_files = f_df[
    ~f_df['File ID'].isin(context_linked_files)
]['File ID'].unique().tolist()

file_site_data = {
    'File ID':[], 
    'Site Area': [],
}
file_site_data = add_to_file_context_data(
    no_context_files, 
    'Area G', 
    file_site_data,
    context_id_col='Site Area'
)
site_df = pd.DataFrame(data=file_site_data)
context_df = pd.concat([context_df, site_df], sort=False)

# Set the column order for nice aesthetics
context_df = context_df[['File ID', 'Site Area', 'Area', 'Locus ID']]
context_df.sort_values(by=['File ID', 'Locus ID', 'Area'], inplace=True)

context_df.to_csv(path_files_contexts, index=False)
context_df.head(3)


Found File and Context Associations for 426 unique files (total rows: 857)


Unnamed: 0,File ID,Site Area,Area,Locus ID
0,Figure 1.1,Area G,,
532,Figure 1.10,,AJ:30–34,Wall 9914
733,Figure 1.11,,AI–AJ:32–34,
