# Эвотор
## Настройки

In [22]:
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, DateTime, Float, select, Date, JSON
import logging
import os
import sys
from math import ceil
from tqdm import tqdm
from pandas.testing import assert_frame_equal

from wikibaseintegrator import wbi_core, wbi_login, wbi_login
from wikibaseintegrator.wbi_config import config as wbi_config

In [2]:
# Let INFO-level messages of the root logger through (the functions below report via logging.info).
logging.root.setLevel(logging.INFO)

In [3]:
# Connection settings of the Wikibase instance. Each value can be overridden by
# an environment variable of the same name; the literals are only fallbacks.
# NOTE(review): the fallback credentials are stored in plain text - confirm they
# are throw-away and keep real secrets out of the notebook.
WIKIBASE_HOST = os.environ.get('WIKIBASE_HOST', '84.201.142.182')

WIKIBASE_LOGIN = os.environ.get('WIKIBASE_LOGIN', 'WikibaseAdmin')
WIKIBASE_PASSWORD = os.environ.get('WIKIBASE_PASSWORD', 'WikibaseDockerAdminPass')

wbi_config['MEDIAWIKI_API_URL'] = f'http://{WIKIBASE_HOST}:8181/api.php'
wbi_config['SPARQL_ENDPOINT_URL'] = f'http://{WIKIBASE_HOST}:8989/bigdata/sparql'
wbi_config['WIKIBASE_URL'] = 'http://wikibase.svc'

## Библиотека

In [280]:
def get_items_by_label(label_list:list, item_type:str = None, is_unique:bool = True, is_notnull:bool = True):
    '''
    Look up Wikibase entity ids by their English labels with one SPARQL query.

    label_list: labels to search for (exact match, language tag ``@en``)
    item_type: restricts the kind of entity returned:
        "P" - Property
        "Q" - Item
        any other value (including the default None) - no restriction
    is_unique: if True, give up when a label matches more than one entity
    is_notnull: if True, give up when a requested label matches nothing

    Returns a DataFrame with columns ``label`` and ``item`` (entity id with the
    ``http://wikibase.svc/entity/`` prefix removed), indexed 0..n-1.
    When an enabled check fails the problem is logged and None is returned;
    no exception is raised.
    '''

    def _quote(label):
        # Escape backslashes and double quotes so that a label cannot break out
        # of the SPARQL string literal.
        escaped = str(label).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"@en'

    query = """
        SELECT DISTINCT ?item ?itemLabel
        WHERE {{
          ?item rdfs:label ?itemLabel.

          VALUES ?itemLabel {{ {label_filter} }}
        }}""".format(label_filter = ' '.join(_quote(label) for label in label_list))

    result = wbi_core.ItemEngine.execute_sparql_query(query)
    rows = [
        [bind['itemLabel']['value'], bind['item']['value'].replace('http://wikibase.svc/entity/', '')]
        for bind in result['results']['bindings']
    ]
    df = pd.DataFrame(rows, columns = ['label', 'item'])

    if item_type in ('P', 'Q'):
        # The type letter is the first character of the id, so test the prefix only.
        df = df.loc[df['item'].str.startswith(item_type)]
    # Callers read the first hit with ``.at[0, 'item']``; without renumbering,
    # a filtered-out first row would turn that into a KeyError.
    df = df.reset_index(drop = True)

    label_counts = df['label'].value_counts()
    ambiguous = label_counts[label_counts > 1].index
    if is_unique and len(ambiguous) > 0:
        r = df[df['label'].isin(ambiguous)].sort_values(by = 'label')
        logging.info(f"entity_id определён неоднозначно: \n{r}")
        return None

    missing = set(label_list) - set(df['label'])
    if is_notnull and missing:
        logging.info(f"entity_id не найден: \n{missing}!")
        return None

    return df
    
    
def get_wb_parent(Q:str, P:str, login_instance:wbi_login.Login) -> tuple:
    '''
    Find the single parent of entity Q: the entity that has a statement with
    property P whose value is Q.

    Q: entity id of the target object
    P: id of the property through which the parent points at Q
    login_instance: not used by this function; kept so existing calls keep working

    Returns a tuple (parent label, parent entity id).
    Raises Exception when Q has no such parent or more than one.
    '''

    query = f'''
        SELECT ?entity_id ?entity_name WHERE {{
            ?entity_id wdt:{P} wd:{Q} .
            ?entity_id rdfs:label ?entity_name .
        }}'''
    logging.debug(query)
    result = wbi_core.ItemEngine.execute_sparql_query(query)

    result_list = [
        [bind['entity_name']['value'], bind['entity_id']['value'].replace('http://wikibase.svc/entity/', '')]
        for bind in result['results']['bindings']
    ]
    result_df = pd.DataFrame(result_list, columns = ['entity_name', 'entity_id'])

    # The query has no language filter, so one parent yields a row per label;
    # count distinct entities, not rows.
    result_df = result_df.drop_duplicates(subset = 'entity_id').reset_index(drop = True)

    if result_df.shape[0] > 1:
        raise Exception(f'Object with entity_id {Q} has several parents: \n{result_df.entity_id.to_list()}')
    if result_df.shape[0] == 0:
        raise Exception(f'Object with entity_id {Q} has no parent via property {P}!')

    return (result_df.at[0, 'entity_name'], result_df.at[0, 'entity_id'])
    
    
def get_wb_statements(login_instance:wbi_login.Login, Q:str, P:str, Pq:str = 'P0') -> pd.DataFrame:
    '''
    List the values of statement P on object Q together with qualifier Pq.

    login_instance: not used by the query; kept for interface compatibility
    Q: entity id of the object
    P: property id of the statement
    Pq: property id of the qualifier to report
        (NOTE(review): the default 'P0' presumably matches no qualifier - confirm)

    Returns a DataFrame that always has the columns STATEMENT_VALUE (entity URI
    prefix removed), ITEM_LABEL and QUALIFIER; values the query left unbound
    are NaN. The frame is empty when Q has no such statement.
    '''

    query = f'''
        SELECT ?STATEMENT_VALUE ?ITEM_LABEL ?QUALIFIER
        WHERE
        {{
             wd:{Q} p:{P} ?statement.
             ?statement ps:{P} ?STATEMENT_VALUE.

             OPTIONAL {{ ?statement pq:{Pq} ?QUALIFIER. }}

             OPTIONAL {{ ?STATEMENT_VALUE rdfs:label ?ITEM_LABEL }}
        }}
    '''

    result = wbi_core.ItemEngine.execute_sparql_query(query)
    rows = [
        {name: cell['value'] for name, cell in bind.items()}
        for bind in result['results']['bindings']
    ]

    # Fix the column set up front: OPTIONAL variables are absent from a binding
    # when unbound, and an empty result would otherwise have no columns at all.
    wb_fields_df = pd.DataFrame(rows, columns = ['STATEMENT_VALUE', 'ITEM_LABEL', 'QUALIFIER'])

    # Plain-text replacement: the prefix contains dots that must not act as regex wildcards.
    wb_fields_df['STATEMENT_VALUE'] = wb_fields_df['STATEMENT_VALUE'].str.replace(
        'http://wikibase.svc/entity/', '', regex = False
    )

    return wb_fields_df
    
    
def gen_prop_dict(properties_list):
    '''
    Map property labels to their Wikibase property ids.

    properties_list: labels of the properties to resolve

    Returns a dict {label: property id}.
    Raises ValueError when get_items_by_label gives up (it returns None when a
    label is missing or ambiguous and logs the details).
    '''
    found = get_items_by_label(properties_list, item_type = 'P')
    if found is None:
        raise ValueError(f'Cannot resolve property labels {properties_list}, see the log for details')

    return dict(zip(found['label'], found['item']))

In [365]:
class WikiObject():
    '''
    Base class for an object mirrored into Wikibase.

    A subclass is expected to set ``label``, ``parent_label``, ``properties_dict``,
    ``login_instance`` and ``df_input`` (optionally ``repeated_statement_label``)
    and then call ``_set_vars()``.
    '''

    # Columns of the frame holding the statements currently stored in Wikibase.
    _WB_COLUMNS = ['STATEMENT_VALUE', 'ITEM_LABEL', 'QUALIFIER', 'STATEMENT_LABEL', 'STATEMENT_TYPE']

    def _fetch_statements(self):
        '''
        Decide which statements the object should end up with.

        The result is stored in ``self.resolved_fields`` and returned:
          * new object - a copy of the input frame;
          * empty input - an empty frame (the situation is logged);
          * otherwise - statements stored in Wikibase without a QUALIFIER are
            treated as edited by hand and kept; the input supplies the rows
            labelled ``repeated_statement_label`` and every row whose label
            was not edited by hand.

        NOTE(review): the original filter read ``.QUALIFIER`` from a Series and
        could not run; the rule above is a reconstruction - confirm it.
        '''
        if self.new_item:
            self.resolved_fields = self.df_input.copy()
            return self.resolved_fields

        if self.df_input.shape[0] == 0:
            logging.info('No input dataframe, can`t fetch!')
            self.resolved_fields = self.df_input.copy()
            return self.resolved_fields

        wb_statements = self.wb_statements
        if 'QUALIFIER' in wb_statements.columns:
            manual_updated = wb_statements[wb_statements['QUALIFIER'].isnull()].copy()
        else:
            # No qualifier column at all means no statement carries a qualifier.
            manual_updated = wb_statements.copy()

        if 'STATEMENT_LABEL' in manual_updated.columns:
            manual_labels = list(set(manual_updated['STATEMENT_LABEL']))
        else:
            manual_labels = []

        input_labels = self.df_input['STATEMENT_LABEL']
        repeated_label = getattr(self, 'repeated_statement_label', None)
        from_input = self.df_input[
            (input_labels == repeated_label) | ~input_labels.isin(manual_labels)
        ]

        frames = [from_input]
        if not manual_updated.empty:
            frames.append(manual_updated)
        self.resolved_fields = pd.concat(frames, ignore_index = True, sort = False)

        # TODO: report how many manual statements were kept and how many
        # statements were changed, added or deleted.

        return self.resolved_fields


    def _lookup_parent(self):
        '''Return the entity id of the parent object, or None when there is no parent label.'''
        if not self.parent_label:
            return None

        parent_df = get_items_by_label([self.parent_label], item_type = 'Q')
        if parent_df is None:
            raise ValueError(f'Parent object {self.parent_label} of {self.label} is missing or ambiguous!')
        return parent_df['item'].iloc[0]


    def _load_wb_statements(self, properties, statement_type):
        '''Fetch the current statements of self.Q for every property in `properties` ({label: property id}).'''
        frames = []
        for label, P in properties.items():
            state_i = get_wb_statements(
                login_instance = self.login_instance,
                Q = self.Q,
                P = P,
                Pq = self.properties_dict['global_references']['Source']
            )
            state_i['STATEMENT_LABEL'] = label
            state_i['STATEMENT_TYPE'] = statement_type
            frames.append(state_i)
        return frames


    def _set_vars(self):
        '''
        Resolve the object in Wikibase and prepare its state.

        Sets ``Q``, ``Q_parent``, ``new_item`` and ``wb_statements``, then calls
        ``_fetch_statements()``.

        Raises ValueError when the label matches several items, when a new
        object has an empty input frame, or when its parent cannot be resolved.
        '''
        # Uniqueness is checked here: get_items_by_label returns None both for
        # "not found" and "ambiguous", and an ambiguous label must not be
        # mistaken for a new object.
        Q_df = get_items_by_label([self.label], item_type = 'Q', is_unique = False)
        if Q_df is not None and Q_df.shape[0] > 1:
            raise ValueError(f'Label {self.label} matches several items: {Q_df["item"].to_list()}')

        if Q_df is None:
            # New object
            logging.info(f'No such object: {self.label}! New one will be created.')
            if self.df_input.shape[0] == 0:
                raise ValueError('Cannot create new item from empty input DataFrame!')

            self.Q_parent = self._lookup_parent()
            self.Q = None
            self.wb_statements = pd.DataFrame(columns = self._WB_COLUMNS)
            self.new_item = True
        else:
            # Existing object
            self.Q = Q_df['item'].iloc[0]
            # TODO: the parent lookup through get_wb_parent is disabled for now.
            self.Q_parent = None

            # Read the current state of the statements from the object.
            # TODO: statements whose property is not listed in properties_dict
            # are not fetched; the per-property queries could probably be
            # merged into a single one.
            frames = self._load_wb_statements(self.properties_dict['statements'], 'string')
            frames += self._load_wb_statements(self.properties_dict['global_statements_items'], 'item')
            frames = [frame for frame in frames if not frame.empty]
            if frames:
                self.wb_statements = pd.concat(frames, ignore_index = True, sort = False)
            else:
                self.wb_statements = pd.DataFrame(columns = self._WB_COLUMNS)

            self.new_item = False

        self._fetch_statements()
        logging.info("""
            Object {label} (entity_id: {Q}), parent {parent_label} (entity_id: {Qp})
        """.format(
            label = self.label,
            Q = self.Q,
            parent_label = self.parent_label,
            Qp = self.Q_parent
        ))


    def delete_item(self):
        '''Placeholder: deletion is not implemented.'''
        pass

In [None]:
class WikiCompany(WikiObject):
    '''
    A company: the top level of the property dictionary.

    name: label of the company in Wikibase
    properties_dict: the whole property dictionary (root level plus 'GLOBAL')
    login_instance: Wikibase login used for writing
    df_input: statements of the object; an empty frame when omitted
    '''
    def __init__(
        self,
        name:str,
        properties_dict:dict,
        login_instance:wbi_login.Login,
        df_input = None
    ):
        self.name = name
        # _set_vars() finds the object by self.label and its parent by self.parent_label.
        self.label = name
        self.parent_label = '.'.join(name.split('.')[:-1])

        self.login_instance = login_instance
        # The company settings are the root of the dictionary ('P' there is the
        # COMPANY property); GLOBAL adds the keys _set_vars() reads.
        # NOTE(review): the original took properties_dict['DATABASE'] here,
        # which looks like a copy-paste from WikiDatabase - confirm.
        self.properties_dict = dict(properties_dict, **properties_dict['GLOBAL'])
        # By analogy with WikiTable: the repeated statement is the child level.
        self.repeated_statements = properties_dict['DATABASE']
        self.repeated_statement_label = 'DATABASE'

        # A fresh frame per instance instead of one default shared by all calls.
        self.df_input = pd.DataFrame() if df_input is None else df_input
        self.in_property_label = 'COMPANY'
        self._set_vars()
        
        
class WikiDatabase(WikiObject):
    '''
    A database: the 'DATABASE' level of the property dictionary.

    name: label of the database in Wikibase
    properties_dict: the whole property dictionary (with 'DATABASE' and 'GLOBAL')
    login_instance: Wikibase login used for writing
    df_input: statements of the object; an empty frame when omitted
    '''
    def __init__(
        self,
        name:str,
        properties_dict:dict,
        login_instance:wbi_login.Login,
        df_input = None
    ):
        self.name = name
        # _set_vars() finds the object by self.label and its parent by self.parent_label.
        self.label = name
        self.parent_label = '.'.join(name.split('.')[:-1])

        self.login_instance = login_instance
        # GLOBAL adds the 'global_statements_items' / 'global_references' keys _set_vars() reads.
        self.properties_dict = dict(properties_dict['DATABASE'], **properties_dict['GLOBAL'])
        # By analogy with WikiTable: the repeated statement is the child level.
        self.repeated_statements = properties_dict['DATABASE']['SCHEMA']
        self.repeated_statement_label = 'SCHEMA'

        # A fresh frame per instance instead of one default shared by all calls.
        self.df_input = pd.DataFrame() if df_input is None else df_input
        self.in_property_label = 'COMPANY'
        self._set_vars()
        
        
class WikiSchema(WikiObject):
    '''
    A database schema: the 'DATABASE' -> 'SCHEMA' level of the property dictionary.

    name: label of the schema in Wikibase
    properties_dict: the whole property dictionary (with 'DATABASE' and 'GLOBAL')
    login_instance: Wikibase login used for writing
    df_input: statements of the object; an empty frame when omitted
    '''
    def __init__(
        self,
        name:str,
        properties_dict:dict,
        login_instance:wbi_login.Login,
        df_input = None
    ):
        self.name = name
        # _set_vars() finds the object by self.label and its parent by self.parent_label.
        self.label = name
        self.parent_label = '.'.join(name.split('.')[:-1])

        self.login_instance = login_instance
        # The schema settings are nested under 'DATABASE' (there is no top-level
        # 'SCHEMA' key); GLOBAL adds the keys _set_vars() reads.
        self.properties_dict = dict(properties_dict['DATABASE']['SCHEMA'], **properties_dict['GLOBAL'])
        # By analogy with WikiTable: the repeated statement is the child level.
        self.repeated_statements = properties_dict['DATABASE']['SCHEMA']['TABLE']
        self.repeated_statement_label = 'TABLE'

        # A fresh frame per instance instead of one default shared by all calls.
        self.df_input = pd.DataFrame() if df_input is None else df_input
        self.in_property_label = 'DATABASE'
        self._set_vars()

In [366]:
class WikiTable(WikiObject):
    '''
    A database table mirrored into Wikibase.
    '''
    def __init__(
        self,
        label:str,
        properties_dict:dict,
        login_instance:wbi_login.Login,
        df_input = None
    ):
        '''
        label: dot-separated label of the table; everything before the last dot
            is used as the parent label
        properties_dict: the whole property dictionary (with 'DATABASE' and 'GLOBAL')
        login_instance: Wikibase login used for writing
        df_input: only the properties of the object, e.g. DATA_TYPE, DESCRIPTION,
            DATA_LENGTH; an empty frame when omitted
        '''

        self.login_instance = login_instance
        # A fresh frame per instance instead of one default shared by all calls.
        self.df_input = pd.DataFrame() if df_input is None else df_input

        self.label = label
        self.parent_label = '.'.join(label.split('.')[:-1])

        self.properties_dict = dict(
            properties_dict['DATABASE']['SCHEMA']['TABLE'], **properties_dict['GLOBAL']
        )
        self.repeated_statements = properties_dict['DATABASE']['SCHEMA']['TABLE']['COLUMN']
        self.repeated_statement_label = 'COLUMN'

        self._set_vars()


    def push_to_wiki(self):
        '''
        Write the resolved statements of the object to Wikibase.

        Every pushed statement gets a Source qualifier 'API update <timestamp>'.
        The response of the write call is stored in ``self.write_responce``.

        NOTE(review): rows kept from Wikibase as hand-edited are part of
        resolved_fields and are therefore re-pushed with the API qualifier -
        confirm this is intended.
        '''
        # Local import: the notebook does not import datetime at the top.
        from datetime import datetime

        its = datetime.now()
        update_qualifier = wbi_core.String(
            f'API update {its}',
            prop_nr = self.properties_dict['global_references']['Source'],
            is_qualifier = True
        )

        statements = []
        # The parent is not resolved for existing objects (Q_parent is None);
        # leave their located_in statement alone instead of writing an empty value.
        if self.Q_parent is not None:
            statements.append(
                wbi_core.ItemID(self.Q_parent,
                                prop_nr = self.properties_dict['global_statements_items']['located_in'],
                                qualifiers = [update_qualifier])
            )

        statement_props = self.properties_dict['statements']
        for _, state_i in self.resolved_fields.iterrows():
            statement_label = state_i['STATEMENT_LABEL']
            statement_value = state_i['STATEMENT_VALUE']

            if statement_label not in statement_props:
                logging.warning(f'Statement {statement_label} of {self.label} has no property configured, skipped')
                continue
            if pd.isnull(statement_value):
                # A missing value would otherwise be written as the text "nan".
                continue

            statements.append(
                wbi_core.String(str(statement_value), prop_nr = statement_props[statement_label],
                                qualifiers = [update_qualifier])
            )

        engine_kwargs = dict(new_item = self.new_item, data = statements, core_props = set())
        if not self.new_item:
            # An existing item has to be addressed by its id.
            engine_kwargs['item_id'] = self.Q
        item = wbi_core.ItemEngine(**engine_kwargs)

        if self.new_item:
            item.set_label(self.label, if_exists = 'REPLACE')

        self.write_responce = item.write(self.login_instance)

In [367]:
class WikiColumn(WikiObject):
    '''
    A table column mirrored into Wikibase.
    '''
    def __init__(
        self,
        label:str,
        properties_dict:dict,
        login_instance:wbi_login.Login,
        df_input = None
    ):
        '''
        label: dot-separated label of the column; everything before the last dot
            is used as the parent label
        properties_dict: the whole property dictionary (with 'DATABASE' and 'GLOBAL')
        login_instance: Wikibase login used for writing
        df_input: statements of the column; an empty frame when omitted
        '''
        self.login_instance = login_instance
        # A fresh frame per instance instead of one default shared by all calls.
        self.df_input = pd.DataFrame() if df_input is None else df_input

        self.label = label
        self.parent_label = '.'.join(label.split('.')[:-1])

        self.properties_dict = dict(
            properties_dict['DATABASE']['SCHEMA']['TABLE']['COLUMN'], **properties_dict['GLOBAL']
        )
        # A column has no repeated child statements; the label is still set
        # because WikiObject._fetch_statements reads it.
        self.repeated_statements = None
        self.repeated_statement_label = None

        self._set_vars()


    def push_to_wiki(self):
        '''
        Write the resolved statements of the object to Wikibase.

        Every pushed statement gets a Source qualifier 'API update <timestamp>'.
        The response of the write call is stored in ``self.write_responce``.

        NOTE(review): rows kept from Wikibase as hand-edited are part of
        resolved_fields and are therefore re-pushed with the API qualifier -
        confirm this is intended.
        '''
        # Local import: the notebook does not import datetime at the top.
        from datetime import datetime

        its = datetime.now()
        update_qualifier = wbi_core.String(
            f'API update {its}',
            prop_nr = self.properties_dict['global_references']['Source'],
            is_qualifier = True
        )

        statements = []
        # The parent is not resolved for existing objects (Q_parent is None);
        # leave their located_in statement alone instead of writing an empty value.
        if self.Q_parent is not None:
            statements.append(
                wbi_core.ItemID(self.Q_parent,
                                prop_nr = self.properties_dict['global_statements_items']['located_in'],
                                qualifiers = [update_qualifier])
            )

        statement_props = self.properties_dict['statements']
        for _, state_i in self.resolved_fields.iterrows():
            statement_label = state_i['STATEMENT_LABEL']
            statement_value = state_i['STATEMENT_VALUE']

            if statement_label not in statement_props:
                logging.warning(f'Statement {statement_label} of {self.label} has no property configured, skipped')
                continue
            if pd.isnull(statement_value):
                # A missing value would otherwise be written as the text "nan".
                continue

            statements.append(
                wbi_core.String(str(statement_value), prop_nr = statement_props[statement_label],
                                qualifiers = [update_qualifier])
            )

        engine_kwargs = dict(new_item = self.new_item, data = statements, core_props = set())
        if not self.new_item:
            # An existing item has to be addressed by its id.
            engine_kwargs['item_id'] = self.Q
        item = wbi_core.ItemEngine(**engine_kwargs)

        if self.new_item:
            item.set_label(self.label, if_exists = 'REPLACE')

        self.write_responce = item.write(self.login_instance)

## Обновление
### Исходник из базы

In [232]:
# Export of the database schema from Evotor
'''
select col.column_id, 
       col.owner as schema_name,
       col.table_name, 
       col.column_name, 
       col.data_type, 
       col.data_length, 
       col.data_precision, 
       col.data_scale, 
       col.nullable
from sys.all_tab_columns col
inner join sys.all_tables t on col.owner = t.owner 
                              and col.table_name = t.table_name
-- excluding some Oracle maintained schemas
where col.owner not in ('ANONYMOUS','CTXSYS','DBSNMP','EXFSYS', 'LBACSYS', 
   'MDSYS', 'MGMT_VIEW','OLAPSYS','OWBSYS','ORDPLUGINS', 'ORDSYS','OUTLN', 
   'SI_INFORMTN_SCHEMA','SYS','SYSMAN','SYSTEM','TSMSYS','WK_TEST','WKSYS', 
   'WKPROXY','WMSYS','XDB','APEX_040000', 'APEX_PUBLIC_USER','DIP', 
   'FLOWS_30000','FLOWS_FILES','MDDATA', 'ORACLE_OCM', 'XS$NULL',
   'SPATIAL_CSW_ADMIN_USR', 'SPATIAL_WFS_ADMIN_USR', 'PUBLIC')  
order by col.owner, col.table_name, col.column_id;
'''

# Load the exported schema description from the CSV file next to the notebook.
# NOTE(review): df_e is not referenced again in the visible cells; the same
# file is read into df_input in the next cell - confirm df_e is still needed.
df_e = pd.read_csv('./evotor_schemas.csv')

In [233]:
# A handful of the most frequently used schemas is enough for testing
schema_list = [
    'AIRFLOW',
    'BIGDATA_LOADER',
    'EVOTOR_ANALYTICS',
    'EVOTOR_BIGDATA',
    'EVOTOR_MARKET_REPL',
    'EVOTOR_REPORTS',
    'EVOTOR_CRM',
]

# Read the schema export and keep only the rows of the selected schemas
df_input = pd.read_csv('./evotor_schemas.csv').loc[lambda frame: frame.SCHEMA.isin(schema_list)]

# Session used for every write to Wikibase
login_instance = wbi_login.Login(user=WIKIBASE_LOGIN, pwd=WIKIBASE_PASSWORD)

### Справочник параметров

In [265]:
def _property_id(label):
    '''Return the entity id of the first property found for `label`.'''
    return get_items_by_label([label], item_type = 'P').at[0, 'item']


# Property ids per object level, nested the same way as the objects themselves:
# company -> DATABASE -> SCHEMA -> TABLE -> COLUMN. 'GLOBAL' holds the
# properties shared by every level.
PROPERTY_DICT = {
    'GLOBAL': {
        'global_statements_items': gen_prop_dict(['located_in']),
        'global_references': gen_prop_dict(['Source']),
    },

    # Which property P of the parent object does this object appear as?
    'P': _property_id('COMPANY'),
    # Qualifiers of property P on the parent object
    'core_qualifiers': {},
    'custom_qualifiers': {},

    'statements': {},
    'DATABASE': {
        'P': _property_id('DATABASE'),
        'core_qualifiers': {},
        'custom_qualifiers': {},
        'statements': {},
        'SCHEMA': {
            'P': _property_id('SCHEMA'),
            'core_qualifiers': {},
            'custom_qualifiers': {},
            'statements': {},
            'TABLE': {
                'P': _property_id('TABLE'),
                'core_qualifiers': {},
                'custom_qualifiers': {},
                'statements': gen_prop_dict(['DESCRIPTION']),
                'COLUMN': {
                    'P': _property_id('COLUMN'),
                    'core_qualifiers': {},
                    'custom_qualifiers': {},
                    'statements': gen_prop_dict(['DATA_TYPE', 'DATA_LENGTH', 'DESCRIPTION']),
                },
            },
        },
    },
}

### Взаимодействие с Wikibase

In [358]:
# Input frame: the source rows of one column, reshaped to one row per statement
df_input_t = df_input[(df_input.TABLE == 'TMP_MP_REVENUE') & (df_input.COLUMN == 'NAME')]

statement_props = PROPERTY_DICT['DATABASE']['SCHEMA']['TABLE']['COLUMN']['statements']
result_columns = ['STATEMENT_LABEL', 'STATEMENT_VALUE', 'STATEMENT_TYPE']

statement_frames = []
for column in df_input_t.columns:
    # Identifying columns are never statements; the rest must have a property configured.
    if column in ['COLUMN_ID', 'SCHEMA', 'TABLE', 'COLUMN'] or column not in statement_props:
        continue

    # Work on a renamed copy so the column of df_input_t itself is left untouched.
    df_i = (
        df_input_t[[column]]
        .rename(columns = {column: 'STATEMENT_VALUE'})
        .assign(STATEMENT_LABEL = column, STATEMENT_TYPE = 'String')
    )
    statement_frames.append(df_i)

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
if statement_frames:
    df_input_c = pd.concat(statement_frames, sort = False)[result_columns]
else:
    df_input_c = pd.DataFrame(columns = result_columns)

In [368]:
# Resolve one column in Wikibase, passing the statement-shaped frame df_input_c as its input
wc = WikiColumn('AIRFLOW.TMP_MP_REVENUE.NAME', PROPERTY_DICT, login_instance, df_input_c)

INFO:root:
            Object AIRFLOW.TMP_MP_REVENUE.NAME (entity_id: Q995), parent AIRFLOW.TMP_MP_REVENUE (entity_id: None)
        


TO FETCH


In [369]:
# Resolve the table in Wikibase.
# NOTE(review): df_input_t holds the raw source rows, while
# WikiObject._fetch_statements reads a STATEMENT_LABEL column from its input -
# confirm which frame should be passed here.
wt = WikiTable('AIRFLOW.TMP_MP_REVENUE', PROPERTY_DICT, login_instance, df_input_t)

INFO:root:
            Object AIRFLOW.TMP_MP_REVENUE (entity_id: Q270), parent AIRFLOW (entity_id: None)
        


TO FETCH


# Прочие знания

In [None]:
# Search API
wbi_core.ItemEngine.get_search_results('part_num_10793')

In [None]:
# Description (method signature kept as a reminder; not executable as written):
# set_description(self, description, lang=None, if_exists='REPLACE')

In [None]:
# Updating existing items
# NOTE(review): img_Q and ITEMS_DICT are not defined in the visible cells - this is a reference snippet.
data = [
    wbi_core.ItemID(img_Q, prop_nr = ITEMS_DICT['P']['Part Image'])
]
# An existing item is addressed by its id and new_item=False
item = wbi_core.ItemEngine(new_item=False, item_id = 'Q1234', data=data,core_props=set())
r = item.write(login_instance)

In [None]:
'''
#All properties with descriptions and aliases and types
SELECT ?property ?propertyType ?propertyLabel ?propertyDescription ?propertyAltLabel WHERE {
  ?property wikibase:propertyType ?propertyType .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), 'P')))
'''