# A Correspondence Contextualised
This is a prototype script for enriching existing keyword and person data with information from [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page). More information can be found in the [README](../README.md#enriching-the-network-with-semantic-information) and the [FILE OVERVIEW](README.md#ontologies-rdfification-semantic-modelling).

In [2]:
import pandas as pd
from json import JSONDecodeError
from qwikidata.sparql  import return_sparql_query_results
from SPARQLWrapper import SPARQLWrapper, JSON
import geocoder
import re

In [60]:
# Load the keyword index spreadsheet.
inpt = pd.read_excel('../data/cds_sheets/Kopie_von_20220816_sachindex_cds.xlsx')

# Cells of the 'Deutsch' column that hold more than one value, separated by a
# forward slash ('/'), are split so that every value ends up in its own row.
# All other columns are moved into the index first, so their values are
# repeated for each of the resulting rows.
# Index.drop only accepts (labels, errors); no axis argument is passed.
other_columns = inpt.columns.drop('Deutsch').tolist()
df_merged = (
    inpt.set_index(other_columns)
    .Deutsch.str.split('/', expand=True)
    .stack()  # one row per split value
    .reset_index()
    .rename(columns={0: 'Deutsch'})
    .loc[:, inpt.columns]  # original column order; drops the helper level column
)
df_merged.to_csv('../data/retrieved/sachindex_singular_values.csv')



In [75]:
def enrich_data(df: pd.DataFrame,
                output_path: str = '../data/retrieved/sachindex_additional_data.csv') -> None:
    """Look up every German keyword via SPARQL and store the matches in *df*.

    For each value of the 'Deutsch' column a query is sent that selects all
    items whose German ``rdfs:label`` equals that value, together with their
    French labels. The item URIs are stored as a list in the 'Wikidata'
    column and the French labels as a list in the 'Französisch' column of the
    same row. Both columns are created if they do not exist yet. The frame is
    modified in place and finally written to *output_path* as CSV.

    Rows with a missing 'Deutsch' value are skipped. Rows whose query response
    cannot be decoded as JSON keep their previous values.

    Args:
        df: Frame with a 'Deutsch' column holding one keyword per row.
        output_path: Destination of the CSV export.
    """
    row_count = len(df)
    # Results are collected positionally in plain lists and written back as
    # whole columns. This avoids chained assignment (df[col][i] = ...), which
    # is unreliable in pandas, and does not depend on the frame's index labels.
    wikidata = df['Wikidata'].tolist() if 'Wikidata' in df.columns else [None] * row_count
    french = df['Französisch'].tolist() if 'Französisch' in df.columns else [None] * row_count

    keywords = df['Deutsch']
    try:
        for position, (value, is_missing) in enumerate(zip(keywords, keywords.isna())):
            if is_missing:
                # An empty cell would otherwise be queried as the label "nan".
                continue

            # Escape characters that would terminate or corrupt the SPARQL
            # string literal.
            literal = (
                str(value)
                .replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n')
                .replace('\r', '\\r')
            )
            query = (
                'SELECT ?item ?label_fr '
                '  WHERE {'
                f'      ?item rdfs:label "{literal}"@de. '
                '      ?item rdfs:label ?label_fr filter (lang(?label_fr) = "fr").'
                '  }'
            )

            try:
                res = return_sparql_query_results(query)
            except JSONDecodeError:
                # The response was not valid JSON; leave this row untouched.
                continue

            bindings = res['results']['bindings']
            wikidata[position] = [binding['item']['value'] for binding in bindings]
            french[position] = [binding['label_fr']['value'] for binding in bindings]
    finally:
        # Write back whatever has been collected so far, so results already
        # retrieved stay available in df even if a query fails unexpectedly.
        # dtype=object keeps each list as a single cell value.
        df['Wikidata'] = pd.Series(wikidata, index=df.index, dtype=object)
        df['Französisch'] = pd.Series(french, index=df.index, dtype=object)

    df.to_csv(output_path)

In [None]:
# Enrich the split keyword table; this sends one SPARQL query per row and
# writes the result to a CSV file.
enrich_data(df=df_merged)

In [8]:
def query_person_wdt(df: pd.DataFrame, label: str = 'Verfasser') -> None:
    """Print the names in column *label*, reordered from "Last, First" to "First Last".

    Every string value of the column is split at commas. If it has at least
    two parts, the second part followed by the first part is printed with
    surrounding whitespace removed; any further parts are ignored. Values
    without a comma and non-string values (e.g. NaN for empty cells) are
    skipped.

    Args:
        df: Frame containing the column *label*.
        label: Name of the column that holds the person names.
    """
    for person in df[label]:
        # Only strings can be split. Checking for str (rather than "not a
        # float") also skips None and numeric cells instead of failing on them.
        if not isinstance(person, str):
            continue

        parts = person.split(',')
        if len(parts) > 1:
            # NOTE: an earlier, disabled variant additionally removed
            # parenthesised annotations containing CdS, Bruder, SRD, Neffe or
            # Siehe and collapsed repeated whitespace before printing.
            print((parts[1] + ' ' + parts[0]).strip())



In [None]:
# Load the filtered CdS data and print the reordered names found in its
# 'Verfasser' column.
cds_df = pd.read_csv('../data/retrieved/filtered_cds_data.csv')
query_person_wdt(df=cds_df, label='Verfasser')