### Comments:
- add Year to the keywords (this will change the indexing used to build the collections)
- handle missing keywords if not uploaded

# Archive: Link metadata to photos

In [None]:
## install required packages

# create requirements.txt with pinned versions (the first echo overwrites the
# file, the following ones append to it)
! echo "pandas==1.1.4" > requirements.txt
! echo "numpy==1.19.4" >> requirements.txt
! echo "tqdm==4.54.0" >> requirements.txt
# NOTE(review): xlrd is left out of the pinned requirements although the
# notebook reads 'authors.xls' with pd.read_excel -- presumably an Excel engine
# is already installed in the environment; confirm, or re-enable the line below.
#! echo "xlrd==1.2.0" >> requirements.txt

# install requirements
! pip install -r requirements.txt

In [None]:
## import required packages

import os, ast
import pandas as pd
import numpy as np
from time import time
from tqdm import tqdm
from datetime import datetime

tqdm.pandas()

In [None]:
# Start timer: store the current time (seconds since the epoch) in t0.
# NOTE(review): t0 is not read anywhere in the visible part of the notebook.
t0 = time()

## 0. Variables to be modified

In [None]:
# path to photos root: every file below this folder is listed recursively
path_photos = 'base photos/photos terrains'

# path to cartes root: every file directly in this folder is read as an
# Excel workbook describing sites
path_cartes = 'base photos/cartes'

# authors path: Excel file with the columns 'Abreviation', 'Last Name',
# 'First Name' and 'ORCID'
path_authors = 'authors.xls'

## 1. Creation of a dictionary storing each folder code with the matching metadata

In [None]:
def read_all_sheets(path):
    """
    creates a dataframe with all sheets concatenated from an excel file

    Parameters
    ----------
    path : path of the Excel workbook to read

    Returns
    -------
    pandas.DataFrame holding the rows of every sheet, stacked in workbook
    order. The row index of each sheet is kept as parsed (it is not reset).

    Raises
    ------
    Any exception raised by pandas when the file cannot be opened or parsed,
    and ValueError if the workbook has no sheet to concatenate.
    """
    # The context manager closes the workbook handle even when a sheet fails
    # to parse.
    with pd.ExcelFile(path) as xl:
        sheets = [xl.parse(i) for i in range(len(xl.sheet_names))]

    # One concat instead of repeated DataFrame.append calls: append copied the
    # accumulated frame for every sheet and no longer exists in pandas >= 2.0.
    return pd.concat(sheets)

In [None]:
def gather_cartes_info(path):
    """
    builds a list of dataframes, one per file of `path` that could be read

    Every entry returned by os.listdir(path) is passed to read_all_sheets.
    An entry that raises is reported on stdout (file name and error) and
    left out of the result; it never stops the loop.

    Returns the list of dataframes, in os.listdir order.
    """

    loaded = []

    for entry in os.listdir(path):
        try:
            loaded.append(read_all_sheets(path + '/' + entry))
        except Exception as error:
            # deliberate best effort: anything unreadable is skipped
            print('\nFile discarded as a carte :', entry)
            print('Error :', error)

    return loaded

In [None]:
def remove_digits(string):
    """
    returns `string` without the characters for which str.isdigit() is true
    """
    kept_chars = filter(lambda char: not char.isdigit(), string)
    return ''.join(kept_chars)

In [None]:
def get_code_dictionary(list_df_files):
    """
    maps each code type to the dataframe describing the sites of that type

    The code type of a dataframe is the first value of its 'Code Folder'
    column with the digits removed. When two dataframes share a code type,
    the later one in `list_df_files` replaces the earlier one.
    """
    return {
        remove_digits(frame['Code Folder'].iloc[0]): frame
        for frame in list_df_files
    }

In [None]:
# Read every workbook found in the cartes folder (unreadable files are skipped)
cartes_info = gather_cartes_info(path_cartes)

In [None]:
# Index the site dataframes by code type (folder code without its digits)
code_dictionary = get_code_dictionary(cartes_info)

In [None]:
# Show the code types that were found, as a quick sanity check
print(code_dictionary.keys())

## 2. Creation of a dataframe containing all photo names and their code folders

In [None]:
def get_list_of_files(dir_name):
    """
    lists the path of every file found in `dir_name` and its sub directories

    Entries are visited in os.listdir order; a sub directory is expanded in
    place, at the position where it appears in its parent listing.
    """
    collected = []
    for name in os.listdir(dir_name):
        candidate = os.path.join(dir_name, name)
        if not os.path.isdir(candidate):
            # plain file: keep its full path
            collected.append(candidate)
            continue
        # directory: recurse and splice its files in
        collected.extend(get_list_of_files(candidate))

    return collected

In [None]:
def match_code_photos(photos):
    """
    builds a dataframe with one row per photo name and its folder code(s)

    Parameters
    ----------
    photos : list of file paths, using '/' as separator

    Returns
    -------
    pandas.DataFrame with the columns:
    - photo_name  : file name (last path component)
    - code_folder : last space-separated word of the parent folder name;
                    when the same photo name is found in several folders
                    the codes are joined with '+'
    - path_folder : list of the folders in which the photo name was found

    Paths containing '#' and files whose extension is not exactly 'jpg' or
    'JPG' are left out. A path without a parent folder raises IndexError.
    """
    columns = ['photo_name', 'code_folder', 'path_folder']
    rows = []
    n_hash = 0
    n_not_jpg = 0

    for photo_path in tqdm(photos):

        # Exclude files with '#' in the path
        if '#' in photo_path:
            n_hash += 1
            continue

        split_path = photo_path.split('/')
        photo = split_path[-1]

        # Exclude non '.jpg' or '.JPG' files
        if photo.split('.')[-1] not in ('JPG', 'jpg'):
            n_not_jpg += 1
            continue

        folder_name = split_path[-2]
        code_folder = folder_name.split(' ')[-1]
        path_folder = '/'.join(split_path[:-1])
        rows.append([photo, code_folder, path_folder])

    # Build the dataframe once: appending row by row copied the whole frame
    # for every photo, and DataFrame.append no longer exists in pandas >= 2.0.
    photos_info = pd.DataFrame(rows, columns=columns)

    # The counts are tracked explicitly; the previous message used the last
    # loop index as the total and reported every discarded file as a '#' one.
    print("{} files were found, {} were discarded because a '#' was found in the path, "
          "{} because they are not '.jpg' or '.JPG' files".format(len(photos), n_hash, n_not_jpg))
    print("dataframe contains {} photos before duplicate management".format(photos_info.shape[0]))

    # Aggregate photos in multiple folder (add '+' in code_folder)
    if not photos_info.empty:
        photos_info = photos_info.groupby('photo_name').agg(
                        {'code_folder': '+'.join,
                         'path_folder': list}).reset_index()

    print('dataframe contains {} photos after duplicate management'.format(photos_info.shape[0]))

    return photos_info

In [None]:
# Recursively list every file stored under the photos root
photos = get_list_of_files(path_photos)

In [None]:
# One row per photo name, with its folder code(s) and folder path(s)
photos_info = match_code_photos(photos)

In [None]:
# Display the dataframe (notebook cell output)
photos_info

## 3. Match photos & site information

In [None]:
def add_metadata(photos_info, code_dictionary):
    """
    attaches to each photo the site metadata of every folder code it has

    Parameters
    ----------
    photos_info : dataframe with the columns 'photo_name' and 'code_folder'
        (codes of one photo are separated by '+'). A 'metadata' column is
        added to it in place.
    code_dictionary : dictionary mapping a code type (folder code without its
        digits) to the dataframe describing the sites of that type.

    Returns
    -------
    (photos_info, n_issues) :
    - photos_info : the rows for which every code was matched; 'metadata'
      holds, per photo, one list per code with the values of 'Code Display',
      'Name', 'Type', 'Latitude', 'Longitude', 'Location' and 'Region'
    - n_issues : number of photos that were dropped because at least one of
      their codes could not be matched
    """
    metadata_columns = ['Code Display',
                        'Name',
                        'Type',
                        'Latitude',
                        'Longitude',
                        'Location',
                        'Region']

    all_metadata = []
    photos_with_issues = set()

    photo_rows = zip(photos_info['photo_name'], photos_info['code_folder'])

    for i, (photo_name, code_folder) in enumerate(tqdm(photo_rows, total=len(photos_info))):
        photo_metadata = []

        for code in code_folder.split('+'):
            code_type = remove_digits(code)
            try:
                df_code = code_dictionary[code_type]
                row_code = df_code[df_code['Code Folder'] == code]
                photo_metadata.append(row_code[metadata_columns].values.tolist()[0])

            # KeyError: unknown code type or missing column;
            # IndexError: no site row with this folder code
            except (KeyError, IndexError) as z:
                print('metadata not added for line {} becasue error with: {}  // ({})'
                      .format(i, z, photo_name))
                photos_with_issues.add(photo_name)

        all_metadata.append(photo_metadata)

    # The column is assigned by name in one go, instead of growing lists
    # through a hard-coded column position (iloc[i, 3]).
    photos_info['metadata'] = all_metadata

    # A photo is counted once, however many of its codes failed
    n_issues = len(photos_with_issues)

    if n_issues > 0:
        print('❗️❗️⚠️ A total of {} photos were skipped because of a metadata matching issue!'.format(n_issues))
    else:
        print('All pictures were matched with metadata successfully! 🎉')
    photos_info = photos_info[~photos_info.photo_name.isin(photos_with_issues)]

    return photos_info, n_issues

In [None]:
def sentence(metadata):
    """
    writes the description sentence of a photo from its sites metadata

    Parameters
    ----------
    metadata : list with one entry per site, each entry being
        [code, name, type, latitude, longitude, location, region]

    Returns
    -------
    An empty string when `metadata` is empty, otherwise a text made of an
    introduction, the location and region of the last site of the list,
    the description of every site and a link to the website.
    """
    n_codes = len(metadata)

    if n_codes == 0:
        return ""

    sites = []
    for code, name, type_code, latitude, longitude, _location, _region in metadata:
        # coordinates are cut (not rounded) to 8 characters
        latitude = str(latitude)[0:8]
        longitude = str(longitude)[0:8]

        # Latitude is the north/south coordinate, longitude the east/west one.
        # NOTE(review): the previous text printed longitude as °N and latitude
        # as °E; this assumes the spreadsheet columns hold what their names
        # say -- confirm against the data.
        sites.append(f"{name} (code: {code}, type: {type_code}, coordinates: {latitude}°N {longitude}°E)")

    if n_codes == 1:
        intro = "This is a picture of a heritage site in Ladakh. "
        intro_p2 = "The site is: "
    else:
        intro = "This is a picture of " + str(n_codes) + " heritage sites in Ladakh. "
        intro_p2 = "The sites are: "

    # The location shown is the one of the last site, as before, but it is
    # now read explicitly instead of leaking out of the loop.
    location, region = metadata[-1][5], metadata[-1][6]
    location_sentence = f"Location: {location} ({region}). "

    sites_text = ', '.join(sites) + '. '

    return intro + location_sentence + intro_p2 + sites_text + "More information: ladakharchaeology.com"

In [None]:
def add_sentences(photos_info):
    """
    adds a 'sentence' column holding the description built from 'metadata'

    The dataframe is modified in place and also returned. progress_apply
    shows a tqdm progress bar while the rows are processed.
    """
    descriptions = photos_info['metadata'].progress_apply(sentence)
    photos_info['sentence'] = descriptions
    return photos_info

In [None]:
# Match every photo with its site metadata; photos that could not be matched
# are dropped and counted in n_skipped
photos_info, n_skipped = add_metadata(photos_info, code_dictionary)

In [None]:
# Build the description sentence of every photo from its metadata
photos_info = add_sentences(photos_info)

## 4. Reshape file

In [None]:
# Load the authors table; missing cells become empty strings
authors = pd.read_excel(path_authors)
authors = authors[['Abreviation', 'Last Name', 'First Name', 'ORCID']].fillna('')

# Map each abbreviation to 'Last Name,First Name', followed by ',ORCID' when
# an ORCID is given. The first row carrying an abbreviation is the one used.
dict_authors = {}
for abbreviation in authors.Abreviation:
    first_match = authors[authors.Abreviation == abbreviation].iloc[0]
    fields = ['Last Name', 'First Name']
    if first_match.ORCID != '':
        fields.append('ORCID')
    dict_authors[abbreviation] = ','.join(list(first_match[fields]))

# Display the dictionary (notebook cell output)
dict_authors

In [None]:
# Optional restart point, disabled: reload photos_info from 'photos_info.csv'
# and turn the 'path_folder' and 'metadata' columns back from text into lists
#photos_info = pd.read_csv('photos_info.csv', index_col=0)
#photos_info['path_folder'] = photos_info['path_folder'].apply(lambda x: ast.literal_eval(x))
#photos_info['metadata'] = photos_info['metadata'].apply(lambda x: ast.literal_eval(x))
# Display the dataframe (notebook cell output)
photos_info

In [None]:
# Every photo starts with status 0, i.e. 'pending'
photos_info['Status'] = 0 #'pending'

In [None]:
# Title: the file name up to its first '.' (a name holding several dots is
# cut at the first one)
photos_info['Title'] = photos_info['photo_name'].apply(lambda x: x.split('.')[0]) #.split('-')[-1])

In [None]:
# Path: the first folder in which the photo was found, '/' and the file name
photos_info['Path'] = photos_info['path_folder'].apply(lambda x: x[0]) + '/' + photos_info['photo_name']

In [None]:
# Description: the sentence generated from the site metadata
photos_info['Description'] = photos_info['sentence']

In [None]:
def replace_authors(photo_name, authors=None):
    """
    returns the author entry matching the first two characters of a photo name

    Parameters
    ----------
    photo_name : file name of the photo; its first two characters are used
        as the author abbreviation
    authors : optional dictionary mapping an abbreviation to the author
        entry; when omitted, the notebook-level `dict_authors` is used

    Returns
    -------
    The author entry, or "Devers,Quentin,0000-0001-8469-0165" (after printing
    a warning) when the abbreviation is unknown or `photo_name` cannot be
    sliced.
    """
    if authors is None:
        authors = dict_authors

    try:
        return authors[photo_name[0:2]]
    # Only the lookup failures are handled, so that unrelated errors are no
    # longer hidden by a bare except.
    except (KeyError, TypeError):
        # "\\" prints the same single backslash as before without relying on
        # the invalid escape sequence "\ ".
        print("/!\\ {} does not have a known author! QD was applied by default".format(photo_name))
        return "Devers,Quentin,0000-0001-8469-0165"

In [None]:
# Creator: the author entry matching the first two characters of the file name
photos_info['Creator'] = photos_info['photo_name'].apply(lambda x: replace_authors(x))

In [None]:
# Year: the second '-'-separated field of the file name (text before the
# first '.'). Raises IndexError for a name without '-'.
# NOTE(review): presumably names look like '<author>-<year>-...' -- confirm.
photos_info['Year'] = photos_info['photo_name'].apply(lambda x: x.split('.')[0].split('-')[1])

In [None]:
# Type: same constant for every photo; the trailing comment keeps the COAR
# resource type URI as an alternative value
photos_info['Type'] = 'Image' #'http://purl.org/coar/resource_type/c_c513'

In [None]:
# License: same constant for every photo
photos_info['License'] = 'CC-BY-NC-4.0'

In [None]:
def extract_keywords(row):
    """
    builds the list of keywords of one photo

    Parameters
    ----------
    row : one row of photos_info, giving access to 'Creator', 'Year',
        'metadata', 'photo_name' and 'code_folder'

    Returns
    -------
    [creator as 'First Last', "Picture", year, region, location, type]
    followed by the first metadata value (code) of every site of the photo.
    Region, location and type are taken from the first site.
    On any error the problem is printed and ['', '', '', ''] is returned.
    NOTE(review): the fallback has 4 entries while the regular list starts
    with 6 fixed ones -- confirm this is intended.
    """
    try:
        last_and_first_name = row['Creator'].split(',')[0:2]
        creator = ' '.join(last_and_first_name[::-1])
        year = row.Year

        first_site = row.metadata[0]
        region, location, site_type = first_site[6], first_site[5], first_site[2]

        site_codes = [site[0] for site in row.metadata]

        return [creator, "Picture", year, region, location, site_type] + site_codes

    except Exception as error:
        print("Error for photo: {} / type: {} / metadata: {} // Error: {}".format(row.photo_name,
                                                                                  row.code_folder,
                                                                                  row.metadata,
                                                                                  error)
             )

        return [''] * 4

In [None]:
# Diagnostic: display the rows for which extract_keywords gives a missing
# value (nothing is stored)
photos_info[photos_info.apply(lambda row: extract_keywords(row),axis=1).isna()]

In [None]:
# Keywords: the list of keywords of every photo (joined into text further down)
photos_info['Keywords'] = photos_info.apply(lambda row: extract_keywords(row),axis=1)

In [None]:
# Collections: the keywords from position 5 onwards, i.e. the site type
# followed by the site codes, joined with ','.
# NOTE(review): extract_keywords puts 6 fixed entries (creator, "Picture",
# year, region, location, type) before the codes -- confirm that the type is
# meant to be part of the collections, otherwise the slice should start at 6.
photos_info['Collections'] = photos_info['Keywords'].apply(lambda x: ','.join(x[5:]))
# Keywords: the whole list joined with ','
photos_info['Keywords'] = photos_info['Keywords'].apply(lambda x: ','.join(x))

In [None]:
# Date scanned: current local date and time cut to the minute
# ('YYYY-MM-DD HH:MM'), identical for every row
photos_info['Date scanned'] = str(datetime.now())[0:16]

In [None]:
# Keep only the columns of the output file, in this order; the working
# columns (photo_name, code_folder, path_folder, metadata, sentence) are dropped
photos_info = photos_info.filter(['Status', 'Title', 'Path',
                                  'Description', 'Creator', 'Year',
                                  'Keywords', 'Type', 'License',
                                  'Collections', 'Date scanned'])
# Display the dataframe (notebook cell output)
photos_info

In [None]:
#photos_info.to_csv('photos_info_nkl.csv', index=False)

## 5. Add columns to follow the upload process

In [None]:
# Columns used to follow the upload process, initialised for every photo:
# 'Uploaded' is compared to 1 further down to count the uploaded photos
photos_info['Uploaded'] = 0
photos_info['Upload date'] = 0
photos_info['Link'] = ''
photos_info['Error'] = ''

## 6. Check if existing file

In [None]:
# find an existing version of photos.csv

name_old_file = 'photos.csv'

try:
    photos_info_old = pd.read_csv(name_old_file)
    print("an existing file has been found, containing {} photos".format(len(photos_info_old)))
except:
    photos_info_old = pd.DataFrame()
    print("/!\ no existing file named '{}' has been found!".format(name_old_file))
    print("If you already uploaded photos, make sure the file hasn't been moved or renamed")

In [None]:
# isolate new photos

if len(photos_info_old) > 0:
    merged = pd.merge(photos_info_old["Title"], photos_info, on='Title', how='outer', indicator=True)
    photos_info_new = merged[merged._merge == 'right_only']
    photos_info_new = photos_info_new.drop("_merge", axis=1)
    
else:
    photos_info_new = photos_info

print("{} new photos have been found".format(len(photos_info_new)))

In [None]:
# append new photos 
photos_info_updated = photos_info_old.append(photos_info_new)

print("The new file contains {} new photos:\n- {} existing photos \n- {} new photos"\
      .format(photos_info_updated.shape[0],
              photos_info_old.shape[0],
              photos_info_new.shape[0]))

if n_skipped >0:
    print('⚠️ {} were skipped because of matching metadata issue'.format(n_skipped))

print("{} photos have already been uploaded".format(sum(photos_info_updated['Uploaded'] == 1)))

In [None]:
# Write the updated list (existing + new photos) without the index column.
# NOTE(review): the file name is repeated here as a literal; it has to stay
# equal to name_old_file for the next run to find this file.
photos_info_updated.to_csv('photos.csv', index=False)