# Archive: Link metadata to photos

In [1]:
## install required packages

# create requirements.txt
! echo "pandas==1.1.4" > requirements.txt
! echo "numpy==1.19.4" >> requirements.txt
! echo "tqdm==4.54.0" >> requirements.txt
#! echo "xlrd==1.2.0" >> requirements.txt

# install requirements
! pip install -r requirements.txt



In [2]:
## import required packages

import os, ast
import pandas as pd
import numpy as np
from time import time
from tqdm import tqdm

tqdm.pandas()

  from pandas import Panel


In [3]:
# Start timer
t0 = time()

## 0. Variables to be modified

In [4]:
# path to photos root
path_photos = 'base photos/photos terrains'

# path to cartes root
path_cartes = 'base photos/cartes'

## 1. Creation of a dictionnary storing each folder code with the matching metadata

In [5]:
def read_all_sheets(path):
    """
    creates a dataframe with all sheets concatenated from an excel file
    """
    xl = pd.ExcelFile(path)
    n_sheets = len(xl.sheet_names)
    for i in range(0, n_sheets):
        if i == 0:
            df = xl.parse(i)
        else:
            df = df.append(xl.parse(i))
    return df

In [6]:
def gather_cartes_info(path):
    """
    creates a list of dataframes,
    each dataframe containing the information of each sites for a specific type
    
    ignores all files that do not work
    """

    cartes_info = []
    files = os.listdir(path)
    
    for file in files:
        try:
            df_file = read_all_sheets(path + '/' + file)
            cartes_info += [df_file]
        except Exception as e:
            print('\nFile discarded as a carte :', file)
            print('Error :', e)
            
    return(cartes_info)

In [7]:
def remove_digits(string):
    """
    removes all digits from a string
    """
    return ''.join([i for i in string if not i.isdigit()])

In [8]:
def get_code_dictionary(list_df_files):
    """
    returns a dictionary, the keys are the code types, the values are the associated dataframes
    """
        
    code_dictionary = {}

    for df in list_df_files:

        first_code = df['Code Folder'].iloc[0]
        code_root = remove_digits(first_code)

        code_dictionary[code_root] = df

    return code_dictionary

In [9]:
cartes_info = gather_cartes_info(path_cartes)


File discarded as a carte : .DS_Store
Error : Unsupported format, or corrupt file: Expected BOF record; found b'\x00\x00\x00\x01Bud1'

File discarded as a carte : ca Caves - 10oct21.xlsx
Error : 'ElementTree' object has no attribute 'getiterator'

File discarded as a carte : os Other sites - 10oct21.xlsx
Error : 'ElementTree' object has no attribute 'getiterator'

File discarded as a carte : p Rockart 10-2021.xlsx
Error : 'ElementTree' object has no attribute 'getiterator'

File discarded as a carte : rc Regular chortens - 10oct21.xlsx
Error : 'ElementTree' object has no attribute 'getiterator'

File discarded as a carte : ss Stupa stones - 10oct21.xlsx
Error : 'ElementTree' object has no attribute 'getiterator'

File discarded as a carte : t Temples - 13oct21.xlsx
Error : 'ElementTree' object has no attribute 'getiterator'

File discarded as a carte : zz other
Error : [Errno 21] Is a directory: 'base photos/cartes/zz other'

File discarded as a carte : ~$p Rock art 11-2020 update 10-

In [10]:
code_dictionary = get_code_dictionary(cartes_info)

In [11]:
print(code_dictionary.keys())

dict_keys(['bc', 'ca', 'f', 'pc', 't'])


## 2. Creation of a dataframe containing all photo names and their code folders

In [12]:
def get_list_of_files(dir_name):
    # create a list of file and sub directories 
    # names in the given directory 
    list_of_files = os.listdir(dir_name)
    all_files = list()
    # Iterate over all the entries
    for entry in list_of_files:
        # Create full path
        full_path = os.path.join(dir_name, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(full_path):
            all_files = all_files + get_list_of_files(full_path)
        else:
            all_files.append(full_path)
                
    return all_files

In [13]:
def match_code_photos(photos):
    
    # Create empty dataframe
    photos_info = pd.DataFrame(columns = ['photo_name','code_folder', 'path_folder'])
    
    
    # add photo infos to dataframe
    for i in tqdm(range(0,len(photos))):

        # Exclude files with '#' in the path
        if photos[i].find('#') == -1 :

            split_path = photos[i].split('/')

            photo = split_path[-1]

            # Exclude non '.jpg' or '.JPG' files
            if (photo.split('.')[-1] == 'JPG') | (photo.split('.')[-1] == 'jpg'):

                folder_name = split_path[-2]
                code_folder = folder_name.split(' ')[-1]
                path_folder = '/'.join(photos[i].split('/')[:-1])

                # add photo infos to dataframe
                s = pd.Series([photo,code_folder, path_folder],index=['photo_name','code_folder', 'path_folder'])
                photos_info = photos_info.append(s,ignore_index=True)

    print("{} photos were found, {} were discarded because a '#' was found in the path".format(i,i-photos_info.shape[0]))
    print("dataframe contains {} photos before duplicate management".format(photos_info.shape[0]))

    # Aggregate photos in multiple folder (add '+' in code_folder)
    photos_info = photos_info.groupby('photo_name').agg(
                    {'code_folder':(lambda x: '+'.join(x)),
                     'path_folder':(lambda x: list(x))}).reset_index()

    print('dataframe contains {} photos after duplicate management'.format(photos_info.shape[0]))
    
    return photos_info

In [14]:
photos = get_list_of_files(path_photos)

In [15]:
photos_info = match_code_photos(photos)

100%|██████████| 394/394 [00:00<00:00, 587.44it/s]

393 photos were found, 48 were discarded because a '#' was found in the path
dataframe contains 345 photos before duplicate management
dataframe contains 342 photos after duplicate management





## 3. Match photos & site information

In [16]:
def add_metadata(photos_info, code_dictionary):
    photos_info['metadata'] = [[] for _ in range(len(photos_info))]


    for i in tqdm(range(0,len(photos_info))):
        code_folder = photos_info.iloc[i].code_folder

        code_list = code_folder.split('+')

        for code in code_list:    
            code_type = remove_digits(code)
            try :
                df_code = code_dictionary[code_type]
                row_code = df_code[df_code['Code Folder'] == code]
                metadata = row_code[['Code Display',
                                     'Name',
                                     'Type',
                                     'Latitude',
                                     'Longitude',
                                     'Location',
                                     'Region']].values.tolist()[0]
                photos_info.iloc[i,3] += [metadata]

            except Exception as z:
                print('metadata not added for line ',i,'because error with: ', z)
                
    return photos_info

In [17]:
def sentence(metadata):
    
    n_codes = len(metadata)
    
    if n_codes == 0:
        return ""
    
    else:
    
        sites = ""

        for i, metadata_i in enumerate(metadata):
            
            code, name, type_code, latitude, longitude, location, region = metadata_i
            
            site_i = f"{name} (code: {code}, type: {type_code}, coordinates: {longitude}°N {latitude}°E)"
            
            if i != n_codes - 1:
                site_i += ', '
            else:
                site_i += '. '
            
            sites += site_i

        if n_codes == 1:
            intro = "This is a picture of a heritage site in Ladakh. "
            intro_p2 = "The site is: "

        if n_codes > 1:
            intro = "This is a picture of " + str(n_codes) + " heritage sites in Ladakh. "
            intro_p2 = "The sites are: "
        
        location_sentence = f"Location: {location} ({region}). "
        
        sentence = intro + location_sentence + intro_p2 + sites + "More information: ladakharchaeology.com"
        
        return sentence

In [18]:
def add_sentences(photos_info):
    photos_info['sentence'] = photos_info.metadata.progress_apply(lambda x: sentence(x))
    return photos_info

In [19]:
photos_info = add_metadata(photos_info, code_dictionary)

100%|██████████| 342/342 [00:00<00:00, 749.10it/s]

metadata not added for line  251 because error with:  'p'
metadata not added for line  252 because error with:  'p'
metadata not added for line  254 because error with:  'p'
metadata not added for line  255 because error with:  'p'
metadata not added for line  256 because error with:  'p'
metadata not added for line  257 because error with:  'p'
metadata not added for line  258 because error with:  'p'
metadata not added for line  259 because error with:  'p'
metadata not added for line  260 because error with:  'p'
metadata not added for line  261 because error with:  'p'
metadata not added for line  262 because error with:  'p'
metadata not added for line  290 because error with:  'p'
metadata not added for line  291 because error with:  'p'
metadata not added for line  292 because error with:  'p'
metadata not added for line  293 because error with:  'p'
metadata not added for line  294 because error with:  'p'
metadata not added for line  318 because error with:  'rc'
metadata not 




In [20]:
photos_info = add_sentences(photos_info)

100%|██████████| 342/342 [00:00<00:00, 106460.74it/s]


In [21]:
# End timer
t1 = time() - t0
print(t1)

10.609342336654663


In [22]:
## Save photos_info into a .csv file
photos_info.to_csv('photos_info.csv')

## 4. Reshape file

In [23]:
dict_authors = {'QD': "Devers,Quentin,0000-0001-8469-0165"}

In [24]:
photos_info = pd.read_csv('photos_info.csv', index_col=0)
photos_info['path_folder'] = photos_info['path_folder'].apply(lambda x: ast.literal_eval(x))
photos_info['metadata'] = photos_info['metadata'].apply(lambda x: ast.literal_eval(x))
photos_info

Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence
0,QD-2009-IMG2643.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
1,QD-2009-IMG2644.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
2,QD-2009-IMG2645.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
3,QD-2009-IMG2646.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
4,QD-2009-IMG2647.JPG,t045,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of a heritage site in Ladakh...
...,...,...,...,...,...
337,QD-2019-DSC123430.JPG,t047,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t047, Temple Ruin 3, Temple Ruin, 34.038895,...",This is a picture of a heritage site in Ladakh...
338,QD-2019-DSC123431.JPG,t045+t046,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of 2 heritage sites in Ladak...
339,QD-2019-DSC123432.JPG,t045+t046+t047+t048+t049,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of 5 heritage sites in Ladak...
340,QD-2019-DSC123433.JPG,t045+t046+t047+t048+t049,[base photos/photos terrains/Ladakh Upper/Nyar...,"[[t045, Temple Ruin 1, Temple Ruin, 34.039394,...",This is a picture of 5 heritage sites in Ladak...


In [25]:
photos_info['Status'] = 'pending'

In [26]:
photos_info['Title'] = photos_info['photo_name'].apply(lambda x: x.split('.')[0].split('-')[-1])

In [27]:
photos_info['Path'] = photos_info['path_folder'].apply(lambda x: x[0]) + '/' + photos_info['photo_name']

In [28]:
photos_info['Description'] = photos_info['sentence']

In [29]:
photos_info['Creator'] = photos_info['photo_name'].apply(lambda x: dict_authors[x[0:2]])

In [30]:
photos_info['Year'] = photos_info['photo_name'].apply(lambda x: x.split('.')[0].split('-')[1])

In [31]:
photos_info['Type'] = 'Image' #'http://purl.org/coar/resource_type/c_c513'

In [32]:
photos_info['License'] = 'CC-BY-NC-4.0'

In [33]:
def extract_keywords(row):

    try:
    
        list_keywords = [' '.join(list(reversed(row['Creator'].split(',')[0:2]))), # creator
        row.metadata[0][6], # Region
        row.metadata[0][5], # Location
        row.metadata[0][2]  # Type
                        ]

        code_type = list(list(zip(*row.metadata))[0])
        
        return list_keywords + code_type
    
    except Exception as e:
        print("Error for photo: {} / type: {} / metadata: {} // Error: {}".format(row.photo_name,
                                                                                  row.code_folder,
                                                                                  row.metadata,
                                                                                  e)
             )

        return ['']*4

In [34]:
photos_info[photos_info.apply(lambda row: extract_keywords(row),axis=1).isna()]

Error for photo: QD-2016-DSC78809.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2016-DSC78810.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101025.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101026.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101027.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101028.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101029.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101030.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101031.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101032.JPG / type: p252 / metadata: [] // Error: list index out o

Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence,Status,Title,Path,Description,Creator,Year,Type,License


In [35]:
photos_info['Keywords'] = photos_info.apply(lambda row: extract_keywords(row),axis=1)

Error for photo: QD-2016-DSC78809.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2016-DSC78810.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101025.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101026.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101027.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101028.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101029.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101030.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101031.JPG / type: p252 / metadata: [] // Error: list index out of range
Error for photo: QD-2017-DSC101032.JPG / type: p252 / metadata: [] // Error: list index out o

In [36]:
photos_info['Collections'] = photos_info['Keywords'].apply(lambda x: ','.join(x[3:]))
photos_info['Keywords'] = photos_info['Keywords'].apply(lambda x: ','.join(x))

In [37]:
photos_info = photos_info.filter(['Status', 'Title', 'Path', 'Description', 'Creator', 'Year', 'Keywords', 'Type', 'License', 'Collections'])
photos_info

Unnamed: 0,Status,Title,Path,Description,Creator,Year,Keywords,Type,License,Collections
0,pending,IMG2643,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
1,pending,IMG2644,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
2,pending,IMG2645,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
3,pending,IMG2646,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
4,pending,IMG2647,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2009,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045"
...,...,...,...,...,...,...,...,...,...,...
337,pending,DSC123430,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t047"
338,pending,DSC123431,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of 2 heritage sites in Ladak...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045,t046"
339,pending,DSC123432,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of 5 heritage sites in Ladak...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045,t046,t047,t048,t049"
340,pending,DSC123433,base photos/photos terrains/Ladakh Upper/Nyarm...,This is a picture of 5 heritage sites in Ladak...,"Devers,Quentin,0000-0001-8469-0165",2019,"Quentin Devers,Upper Ladakh,Nyarma,Temple Ruin...",Image,CC-BY-NC-4.0,"Temple Ruin,t045,t046,t047,t048,t049"


In [38]:
photos_info.to_csv('photos_info_nkl.csv', index=False)

In [39]:
! jupyter nbconvert --to script archive.ipynb

[NbConvertApp] Converting notebook archive.ipynb to script
[NbConvertApp] Writing 10116 bytes to archive.py


## Draft

In [3]:
wt = pd.read_csv('base photos/wetransfer_2022-11-04_1301/photos_info.csv')

In [4]:
wt_nkl = pd.read_csv('base photos/wetransfer_2022-11-04_1301/photos_info_nkl.csv')

In [5]:
wt[wt.metadata == '[]']

Unnamed: 0.1,Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence


In [6]:
wt[wt.sentence == '']

Unnamed: 0.1,Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence


In [7]:
wt_nkl.shape

(153402, 10)

In [8]:
wt.shape

(153402, 6)

In [9]:
wt_nkl[wt_nkl.Description == '']

Unnamed: 0,Status,Title,Path,Description,Creator,Year,Keywords,Type,License,Collections


In [10]:
wt_nkl

Unnamed: 0,Status,Title,Path,Description,Creator,Year,Keywords,Type,License,Collections
0,pending,CIM1517,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2007,"Quentin Devers,Nubra,Khardong,Petroglyphs,p582",Image,CC-BY-NC-4.0,"Petroglyphs,p582"
1,pending,CIM1518,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2007,"Quentin Devers,Nubra,Khardong,Petroglyphs,p582",Image,CC-BY-NC-4.0,"Petroglyphs,p582"
2,pending,CIM1519,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2007,"Quentin Devers,Nubra,Khardong,Petroglyphs,p582",Image,CC-BY-NC-4.0,"Petroglyphs,p582"
3,pending,CIM1520,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2007,"Quentin Devers,Nubra,Khardong,Petroglyphs,p582",Image,CC-BY-NC-4.0,"Petroglyphs,p582"
4,pending,CIM1521,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2007,"Quentin Devers,Nubra,Khardong,Petroglyphs,p582",Image,CC-BY-NC-4.0,"Petroglyphs,p582"
...,...,...,...,...,...,...,...,...,...,...
153397,pending,PH164355,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Upper Ladakh,Leh - Changspa,Bud...",Image,CC-BY-NC-4.0,"Buddhist Carving,bc085"
153398,pending,PH164419,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Upper Ladakh,Leh - Changspa,Bud...",Image,CC-BY-NC-4.0,"Buddhist Carving,bc085"
153399,pending,PH164421,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Upper Ladakh,Leh - Changspa,Bud...",Image,CC-BY-NC-4.0,"Buddhist Carving,bc085"
153400,pending,PH164422,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Upper Ladakh,Leh - Changspa,Bud...",Image,CC-BY-NC-4.0,"Buddhist Carving,bc085"


In [11]:
wt.photo_name

0          QD-2007-CIM1517.JPG
1          QD-2007-CIM1518.JPG
2          QD-2007-CIM1519.JPG
3          QD-2007-CIM1520.JPG
4          QD-2007-CIM1521.JPG
                  ...         
153397    QD-2022-PH164355.jpg
153398    QD-2022-PH164419.jpg
153399    QD-2022-PH164421.jpg
153400    QD-2022-PH164422.jpg
153401    QD-2022-PH164424.jpg
Name: photo_name, Length: 153402, dtype: object

In [13]:
wt.photo_name.apply(lambda x: x[0:2]).value_counts()

QD    153402
Name: photo_name, dtype: int64

In [14]:
wt

Unnamed: 0.1,Unnamed: 0,photo_name,code_folder,path_folder,metadata,sentence
0,0,QD-2007-CIM1517.JPG,p582,['/Users/quentin/Desktop/base photos/photos te...,"[['p582', 'Petroglyph Site 4', 'Petroglyphs', ...",This is a picture of a heritage site in Ladakh...
1,1,QD-2007-CIM1518.JPG,p582,['/Users/quentin/Desktop/base photos/photos te...,"[['p582', 'Petroglyph Site 4', 'Petroglyphs', ...",This is a picture of a heritage site in Ladakh...
2,2,QD-2007-CIM1519.JPG,p582,['/Users/quentin/Desktop/base photos/photos te...,"[['p582', 'Petroglyph Site 4', 'Petroglyphs', ...",This is a picture of a heritage site in Ladakh...
3,3,QD-2007-CIM1520.JPG,p582,['/Users/quentin/Desktop/base photos/photos te...,"[['p582', 'Petroglyph Site 4', 'Petroglyphs', ...",This is a picture of a heritage site in Ladakh...
4,4,QD-2007-CIM1521.JPG,p582,['/Users/quentin/Desktop/base photos/photos te...,"[['p582', 'Petroglyph Site 4', 'Petroglyphs', ...",This is a picture of a heritage site in Ladakh...
...,...,...,...,...,...,...
153397,153397,QD-2022-PH164355.jpg,bc085,['/Users/quentin/Desktop/base photos/photos te...,"[['bc085', 'Buddhist Carving 7', 'Buddhist Car...",This is a picture of a heritage site in Ladakh...
153398,153398,QD-2022-PH164419.jpg,bc085,['/Users/quentin/Desktop/base photos/photos te...,"[['bc085', 'Buddhist Carving 7', 'Buddhist Car...",This is a picture of a heritage site in Ladakh...
153399,153399,QD-2022-PH164421.jpg,bc085,['/Users/quentin/Desktop/base photos/photos te...,"[['bc085', 'Buddhist Carving 7', 'Buddhist Car...",This is a picture of a heritage site in Ladakh...
153400,153400,QD-2022-PH164422.jpg,bc085,['/Users/quentin/Desktop/base photos/photos te...,"[['bc085', 'Buddhist Carving 7', 'Buddhist Car...",This is a picture of a heritage site in Ladakh...


In [15]:
list(wt[wt.photo_name == 'Skyiu old chorten complex09.3.JPG'].path_folder)

[]

In [16]:
wt_nkl.Keywords.apply(lambda x: len(x.split(',')[4:])).value_counts()

1     148741
2       4081
3        319
4         99
5         78
6         40
12        17
60         8
7          6
9          5
8          4
13         2
61         1
35         1
Name: Keywords, dtype: int64

In [20]:
from itertools import chain
pd.DataFrame(set(list(chain(*list(wt_nkl.Keywords.apply(lambda x: x.split(',')[4:])),[])))).to_csv('list_code_sites.csv')

In [30]:
list(wt_nkl[wt_nkl.Keywords.apply(lambda x: x.split(',')[4] == 'Petroglyphs')].Keywords)

['Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quenti

In [36]:
wt_nkl[wt_nkl.Keywords.apply(lambda x: x.split(',')[4] == 'Petroglyphs')]

Unnamed: 0,Status,Title,Path,Description,Creator,Year,Keywords,Type,License,Collections
7779,pending,IMG18522,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2010,"Quentin Devers,Lower Ladakh, Purig,Dargo,Petro...",Image,CC-BY-NC-4.0,"Petroglyphs,p049"
7780,pending,IMG18523,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2010,"Quentin Devers,Lower Ladakh, Purig,Dargo,Petro...",Image,CC-BY-NC-4.0,"Petroglyphs,p049"
7781,pending,IMG18524,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2010,"Quentin Devers,Lower Ladakh, Purig,Dargo,Petro...",Image,CC-BY-NC-4.0,"Petroglyphs,p049"
7782,pending,IMG18525,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2010,"Quentin Devers,Lower Ladakh, Purig,Dargo,Petro...",Image,CC-BY-NC-4.0,"Petroglyphs,p049"
7783,pending,IMG18526,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2010,"Quentin Devers,Lower Ladakh, Purig,Dargo,Petro...",Image,CC-BY-NC-4.0,"Petroglyphs,p049"
...,...,...,...,...,...,...,...,...,...,...
119830,pending,DSC176464,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Lower Ladakh, Purig,Sanjak,Petr...",Image,CC-BY-NC-4.0,"Petroglyphs,p365"
119831,pending,DSC176465,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Lower Ladakh, Purig,Sanjak,Petr...",Image,CC-BY-NC-4.0,"Petroglyphs,p365"
119832,pending,DSC176489,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Lower Ladakh, Purig,Sanjak,Petr...",Image,CC-BY-NC-4.0,"Petroglyphs,p365"
119833,pending,DSC176490,/Users/quentin/Desktop/base photos/photos terr...,This is a picture of a heritage site in Ladakh...,"Devers,Quentin,0000-0001-8469-0165",2022,"Quentin Devers,Lower Ladakh, Purig,Sanjak,Petr...",Image,CC-BY-NC-4.0,"Petroglyphs,p365"


In [41]:
wt.loc[7779].metadata

"[['p049', 'Petroglyph Site 1 ‘Daras’', 'Petroglyphs', '34.494021', '76.64025', 'Dargo', 'Lower Ladakh, Purig']]"

In [45]:
test_1 = pd.DataFrame(set(list(chain(*list(wt_nkl.Keywords.apply(lambda x: x.split(',')[4:])),[]))), columns=['Region'])

In [49]:
test_1[test_1.Region.apply(lambda x: len(x)>5)]

Unnamed: 0,Region
294,Petroglyphs
1558,Petroglyphs and Pictographs


In [51]:
set(list(wt_nkl[wt_nkl.Keywords.apply(lambda x: x.split(',')[4] in ['Petroglyphs', "Petroglyphs and Pictographs"])].Keywords))

{'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p048',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p049',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p050',
 'Quentin Devers,Lower Ladakh, Purig,Dargo,Petroglyphs,p362',
 'Quentin Devers,Lower Ladakh, Purig,Dartsiks to Batalik,Petroglyphs,p053',
 'Quentin Devers,Lower Ladakh, Purig,Garkon,Petroglyphs,p548',
 'Quentin Devers,Lower Ladakh, Purig,Posa,Petroglyphs,p599',
 'Quentin Devers,Lower Ladakh, Purig,Sanjak,Petroglyphs,p257',
 'Quentin Devers,Lower Ladakh, Purig,Sanjak,Petroglyphs,p365',
 'Quentin Devers,Lower Ladakh, Purig,Sanjak,Petroglyphs,p366',
 'Quentin Devers,Lower Ladakh, Purig,Sanjak,Petroglyphs,p367',
 'Quentin Devers,Lower Ladakh, Purig,Santakchan to Posa,Petroglyphs and Pictographs,p600',
 'Quentin Devers,Lower Ladakh, Purig,Santakchan to Posa,Petroglyphs,p491',
 'Quentin Devers,Lower Ladakh, Purig,Santakchan to Posa,Petroglyphs,p557',
 'Quentin Devers,Lower Ladakh, Purig,Santakchan to Posa,Pet