# Archive: Link metadata to photos

In [1]:
## install required packages

# create requirements.txt
! echo "pandas==1.1.4" > requirements.txt
! echo "numpy==1.19.4" >> requirements.txt
! echo "tqdm==4.54.0" >> requirements.txt
#! echo "xlrd==1.2.0" >> requirements.txt

# install requirements
! pip install -r requirements.txt



In [85]:
## import required packages

import os, ast
import pandas as pd
import numpy as np
from time import time
from tqdm import tqdm
from datetime import datetime

tqdm.pandas()

  from pandas import Panel


In [3]:
# Start timer
t0 = time()

## 0. Variables to be modified

In [4]:
# path to photos root
path_photos = 'base photos/photos terrains'

# path to cartes root
path_cartes = 'base photos/cartes'

## 1. Creation of a dictionnary storing each folder code with the matching metadata

In [5]:
def read_all_sheets(path):
    """
    creates a dataframe with all sheets concatenated from an excel file
    """
    xl = pd.ExcelFile(path)
    n_sheets = len(xl.sheet_names)
    for i in range(0, n_sheets):
        if i == 0:
            df = xl.parse(i)
        else:
            df = df.append(xl.parse(i))
    return df

In [6]:
def gather_cartes_info(path):
    """
    creates a list of dataframes,
    each dataframe containing the information of each sites for a specific type
    
    ignores all files that do not work
    """

    cartes_info = []
    files = os.listdir(path)
    
    for file in files:
        try:
            df_file = read_all_sheets(path + '/' + file)
            cartes_info += [df_file]
        except Exception as e:
            print('\nFile discarded as a carte :', file)
            print('Error :', e)
            
    return(cartes_info)

In [7]:
def remove_digits(string):
    """
    removes all digits from a string
    """
    return ''.join([i for i in string if not i.isdigit()])

In [8]:
def get_code_dictionary(list_df_files):
    """
    returns a dictionary, the keys are the code types, the values are the associated dataframes
    """
        
    code_dictionary = {}

    for df in list_df_files:

        first_code = df['Code Folder'].iloc[0]
        code_root = remove_digits(first_code)

        code_dictionary[code_root] = df

    return code_dictionary

In [9]:
cartes_info = gather_cartes_info(path_cartes)


File discarded as a carte : .DS_Store
Error : Unsupported format, or corrupt file: Expected BOF record; found b'\x00\x00\x00\x01Bud1'

File discarded as a carte : zz other
Error : [Errno 21] Is a directory: 'base photos/cartes/zz other'

File discarded as a carte : ~$p Rock art 11-2020 update 10-2021.xlsx
Error : Unsupported format, or corrupt file: Expected BOF record; found b'\x08quen ti'

File discarded as a carte : ~$rc Regular chortens - 10oct21.xlsx
Error : Unsupported format, or corrupt file: Expected BOF record; found b'\x08quen ti'


In [10]:
code_dictionary = get_code_dictionary(cartes_info)

In [11]:
print(code_dictionary.keys())

dict_keys(['bc', 'ca', 'f', 'os', 'p', 'pc', 'rc', 'ss', 't'])


## 2. Creation of a dataframe containing all photo names and their code folders

In [12]:
def get_list_of_files(dir_name):
    # create a list of file and sub directories 
    # names in the given directory 
    list_of_files = os.listdir(dir_name)
    all_files = list()
    # Iterate over all the entries
    for entry in list_of_files:
        # Create full path
        full_path = os.path.join(dir_name, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(full_path):
            all_files = all_files + get_list_of_files(full_path)
        else:
            all_files.append(full_path)
                
    return all_files

In [13]:
def match_code_photos(photos):
    
    # Create empty dataframe
    photos_info = pd.DataFrame(columns = ['photo_name','code_folder', 'path_folder'])
    
    
    # add photo infos to dataframe
    for i in tqdm(range(0,len(photos))):

        # Exclude files with '#' in the path
        if photos[i].find('#') == -1 :

            split_path = photos[i].split('/')

            photo = split_path[-1]

            # Exclude non '.jpg' or '.JPG' files
            if (photo.split('.')[-1] == 'JPG') | (photo.split('.')[-1] == 'jpg'):

                folder_name = split_path[-2]
                code_folder = folder_name.split(' ')[-1]
                path_folder = '/'.join(photos[i].split('/')[:-1])

                # add photo infos to dataframe
                s = pd.Series([photo,code_folder, path_folder],index=['photo_name','code_folder', 'path_folder'])
                photos_info = photos_info.append(s,ignore_index=True)

    print("{} photos were found, {} were discarded because a '#' was found in the path".format(i,i-photos_info.shape[0]))
    print("dataframe contains {} photos before duplicate management".format(photos_info.shape[0]))

    # Aggregate photos in multiple folder (add '+' in code_folder)
    photos_info = photos_info.groupby('photo_name').agg(
                    {'code_folder':(lambda x: '+'.join(x)),
                     'path_folder':(lambda x: list(x))}).reset_index()

    print('dataframe contains {} photos after duplicate management'.format(photos_info.shape[0]))
    
    return photos_info

In [14]:
photos = get_list_of_files(path_photos)

In [15]:
photos_info = match_code_photos(photos)

100%|██████████| 395/395 [00:00<00:00, 456.30it/s]

394 photos were found, 49 were discarded because a '#' was found in the path
dataframe contains 345 photos before duplicate management
dataframe contains 342 photos after duplicate management





## 3. Match photos & site information

In [16]:
def add_metadata(photos_info, code_dictionary):
    photos_info['metadata'] = [[] for _ in range(len(photos_info))]


    for i in tqdm(range(0,len(photos_info))):
        code_folder = photos_info.iloc[i].code_folder

        code_list = code_folder.split('+')

        for code in code_list:    
            code_type = remove_digits(code)
            try :
                df_code = code_dictionary[code_type]
                row_code = df_code[df_code['Code Folder'] == code]
                metadata = row_code[['Code Display',
                                     'Name',
                                     'Type',
                                     'Latitude',
                                     'Longitude',
                                     'Location',
                                     'Region']].values.tolist()[0]
                photos_info.iloc[i,3] += [metadata]

            except Exception as z:
                print('metadata not added for line ',i,'because error with: ', z)
                
    return photos_info

In [17]:
def sentence(metadata):
    
    n_codes = len(metadata)
    
    if n_codes == 0:
        return ""
    
    else:
    
        sites = ""

        for i, metadata_i in enumerate(metadata):
            
            code, name, type_code, latitude, longitude, location, region = metadata_i
            
            site_i = f"{name} (code: {code}, type: {type_code}, coordinates: {longitude}°N {latitude}°E)"
            
            if i != n_codes - 1:
                site_i += ', '
            else:
                site_i += '. '
            
            sites += site_i

        if n_codes == 1:
            intro = "This is a picture of a heritage site in Ladakh. "
            intro_p2 = "The site is: "

        if n_codes > 1:
            intro = "This is a picture of " + str(n_codes) + " heritage sites in Ladakh. "
            intro_p2 = "The sites are: "
        
        location_sentence = f"Location: {location} ({region}). "
        
        sentence = intro + location_sentence + intro_p2 + sites + "More information: ladakharchaeology.com"
        
        return sentence

In [18]:
def add_sentences(photos_info):
    photos_info['sentence'] = photos_info.metadata.progress_apply(lambda x: sentence(x))
    return photos_info

In [19]:
photos_info = add_metadata(photos_info, code_dictionary)

100%|██████████| 342/342 [00:00<00:00, 471.47it/s]


In [20]:
photos_info = add_sentences(photos_info)

100%|██████████| 342/342 [00:00<00:00, 4960.38it/s]


In [21]:
# End timer
t1 = time() - t0
print(t1)

3.2136476039886475


In [22]:
## Save photos_info into a .csv file
#photos_info.to_csv('photos_info.csv')

## 4. Reshape file

In [23]:
dict_authors = {'QD': "Devers,Quentin,0000-0001-8469-0165"}

In [24]:
#photos_info = pd.read_csv('photos_info.csv', index_col=0)
#photos_info['path_folder'] = photos_info['path_folder'].apply(lambda x: ast.literal_eval(x))
#photos_info['metadata'] = photos_info['metadata'].apply(lambda x: ast.literal_eval(x))
photos_info

Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence
0,QD-2009-IMG2643.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
1,QD-2009-IMG2644.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
2,QD-2009-IMG2645.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
3,QD-2009-IMG2646.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
4,QD-2009-IMG2647.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
...,...,...,...,...,...
337,QD-2019-DSC123430.JPG,t047,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t047, Temple Ruin 3, Temple Ruin, 34.038895,...",This is a picture of a heritage site in Ladakh...
338,QD-2019-DSC123431.JPG,t045+t046,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of 2 heritage sites in Ladak...
339,QD-2019-DSC123432.JPG,t045+t046+t047+t048+t049,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of 5 heritage sites in Ladak...
340,QD-2019-DSC123433.JPG,t045+t046+t047+t048+t049,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of 5 heritage sites in Ladak...


In [25]:
photos_info['Status'] = 'pending'

In [26]:
photos_info['Title'] = photos_info['photo_name'].apply(lambda x: x.split('.')[0]) #.split('-')[-1])

In [27]:
photos_info['Path'] = photos_info['path_folder'].apply(lambda x: x[0]) + '/' + photos_info['photo_name']

In [28]:
photos_info['Description'] = photos_info['sentence']

In [29]:
photos_info['Creator'] = photos_info['photo_name'].apply(lambda x: dict_authors[x[0:2]])

In [30]:
photos_info['Year'] = photos_info['photo_name'].apply(lambda x: x.split('.')[0].split('-')[1])

In [31]:
photos_info['Type'] = 'Image' #'http://purl.org/coar/resource_type/c_c513'

In [32]:
photos_info['License'] = 'CC-BY-NC-4.0'

In [33]:
def extract_keywords(row):

    try:
    
        list_keywords = [' '.join(list(reversed(row['Creator'].split(',')[0:2]))), # creator
        row.metadata[0][6], # Region
        row.metadata[0][5], # Location
        row.metadata[0][2]  # Type
                        ]

        code_type = list(list(zip(*row.metadata))[0])
        
        return list_keywords + code_type
    
    except Exception as e:
        print("Error for photo: {} / type: {} / metadata: {} // Error: {}".format(row.photo_name,
                                                                                  row.code_folder,
                                                                                  row.metadata,
                                                                                  e)
             )

        return ['']*4

In [34]:
photos_info[photos_info.apply(lambda row: extract_keywords(row),axis=1).isna()]

Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence,Status,Title,Path,Description,Creator,Year,Type,License


In [35]:
photos_info['Keywords'] = photos_info.apply(lambda row: extract_keywords(row),axis=1)

In [36]:
photos_info['Collections'] = photos_info['Keywords'].apply(lambda x: ','.join(x[3:]))
photos_info['Keywords'] = photos_info['Keywords'].apply(lambda x: ','.join(x))

In [None]:
photos_info['Date scanned'] = str(datetime.now())[0:10]

In [106]:
photos_info = photos_info.filter(['Status', 'Title', 'Path',
                                  'Description', 'Creator', 'Year',
                                  'Keywords', 'Type', 'License',
                                  'Collections', 'Date scanned'])
photos_info

Unnamed: 0,Status,Title,Path,Description,Creator,Year,Keywords,Type,License,Collections
0,pending,QD-2009-IMG2643,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
1,pending,QD-2009-IMG2644,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
2,pending,QD-2009-IMG2645,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
3,pending,QD-2009-IMG2646,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
4,pending,QD-2009-IMG2647,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
...,...,...,...,...,...,...,...,...,...,...
337,pending,QD-2019-DSC123430,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t047"
338,pending,QD-2019-DSC123431,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of 2 heritage sites in Ladak...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045,t046"
339,pending,QD-2019-DSC123432,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of 5 heritage sites in Ladak...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045,t046,t047,t048,t049"
340,pending,QD-2019-DSC123433,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of 5 heritage sites in Ladak...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045,t046,t047,t048,t049"


In [38]:
#photos_info.to_csv('photos_info_nkl.csv', index=False)

## 5. Add columns to follow the upload process

In [67]:
photos_info['Uploaded'] = 0

In [71]:
photos_info['Upload date'] = 0

In [72]:
photos_info['Link'] = ''

## 6. Check if existing file

In [133]:
# find an existing version of photos_info_nkl.csv

name_old_file = 'photos_info_nkl.csv'

try:
    photos_info_old = pd.read_csv(name_old_file)
    print("an existing file has been found, containing {} photos".format(len(photos_info_old)))
except:
    photos_info_old = pd.DataFrame()
    print("/!\ no existing file named '{}' has been found!".format(name_old_file))
    print("If you already uploaded photos, make sure the file hasn't been moved or renamed")

an existing file has been found, containing 342 photos


In [134]:
# isolate new photos

if len(photos_info_old) > 0:
    merged = pd.merge(photos_info_old["Title"], photos_info, on='Title', how='outer', indicator=True)
    photos_info_new = merged[merged._merge == 'left_only']
    
else:
    photos_info_new = photos_info

print("{} new photos have been found".format(len(photos_info_new)))

0 new photos have been found


In [135]:
# append new photos 
photos_info_updated = photos_info_old.append(photos_info_new)

print("The new file contains {} new photos, including {} existing photos and {} new photos"\
      .format(photos_info_updated.shape[0],
              photos_info_old.shape[0],
              photos_info_new.shape[0]))

The new file contains 342 new photos, including 342 existing photos and 0 new photos


In [138]:
photos_info_updated.to_csv('photos_info_nkl.csv', index=False)