In [3]:
from pprint import pprint

import pickle
from pymongo import MongoClient

import time
from datetime import date, timedelta
import os
import datetime
from dateutil import tz
import pendulum

import config
import function as func

from schema.fact_document import FactDocumentModel
from schema.fact_performance import FactPerformanceModel
from schema.fact_data_extraction import FactDataExtractionModel
from db_connect import EngineConnect as DatabaseConnect

In [4]:
class EclaimsExecutor:
    def __init__(
        self,
        *kwargs,
        environment: str,
        uri: str,
        database_name: str,
        docs_collection_name: str, 
        trans_collection_name: str,
        performance_collection_name: str,
        db: DatabaseConnect
    ):
        self.environment = environment
        self.uri = uri
        self.database_name = database_name
        self.docs_collection_name = docs_collection_name
        self.trans_collection_name = trans_collection_name
        self.performance_collection_name = performance_collection_name
        self.db = db
        self.start_run = time.time()
        self.maxSevSelDelay = 20000
        self.start = config.start
        self.query = config.ECLAIMS_QUERY
        self.performance_query = config.ECLAIMS_PERFORMANCE_QUERY
        self.project_id = config.ECLAIMS_PROJECT_ID
        self.project_name = config.ECLAIMS_PROJECT_NAME
        self.backup_dir = config.BACKUP_DIR
        self.project_backup_dir = config.ECLAIMS_BACKUP_DIR
        self.project_docs_dir = config.ECLAIMS_DOCS_DIR
        self.project_trans_dir = config.ECLAIMS_TRANS_DIR
        self.project_performance_dir = config.ECLAIMS_PERFORMANCE_DIR
        self.backup_file_type = config.BACKUP_FILE_TYPE
        self.schema = config.DWH_ANALYTIC_SCHEMA
        self.fact_document_table = config.DWH_FACT_DOCUMENT_TABLE
        self.fact_performancec_table = config.DWH_FACT_PERFORMANCE_TABLE
        self.fact_data_extraction = config.DWH_FACT_DATA_EXTRACTION_TABLE
        
    def get_docs_and_trans(self):
        if self.environment == 'development':
            obj_docs = pickle.load(open('./backup/docs/' + self.project_id + '.pickle', 'rb'))
            obj_trans = pickle.load(open('./backup/trans/' + self.project_id + '.pickle', 'rb'))
        else: 
            obj_docs = pickle.load(open(self.backup_dir + self.project_backup_dir + self.project_docs_dir + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'rb'))
            obj_trans = pickle.load(open(self.backup_dir + self.project_backup_dir + self.project_trans_dir + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'rb'))
        data_docs = [item for item in obj_docs]
        data_trans = [item for item in obj_trans]
        return data_docs, data_trans

    def get_performance(self):
        if self.environment == 'development':
            obj_performance = pickle.load(open('./backup/performance/' + self.project_id + '.pickle', 'rb'))
        else:
            obj_performance = pickle.load(open(self.backup_dir + self.project_backup_dir + self.project_performance_dir + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'rb'))
        data_performance = [item for item in obj_performance]
        return data_performance
        
    def fact_document(self):
        datas = []
        data_docs, data_trans = self.get_docs_and_trans()
        list_created = [data['created_date'] for data in data_docs]
        meta_datas = [data['project_meta_data'] for data in data_docs]
        for data in data_trans:
            if len(data['records']) == 0:
                continue
            records = data['records'][0]
            created_date_utc_7 = func.check_index_data_docs(meta_datas, list_created, records['requestId'], records['caseId'], records['caseNumber']) \
                + datetime.timedelta(hours = 7)
            last_modified_utc_7 = data['last_modified'] + datetime.timedelta(hours = 7)
            import_date_key_utc_7, import_time_key_utc_7 = func.handle_date_to_date_and_time_id(created_date_utc_7)
            export_date_key_utc_7, export_time_key_utc_7 = func.handle_date_to_date_and_time_id(last_modified_utc_7)
            remark_code = None
            if records['remarkCode'] != None and records['remarkCode'] != '':
                remark_code = records['remarkCode']
            _obj = FactDocumentModel(
                project_id = self.project_id,
                document_id = func.bson_object_to_string(data['doc_id']),
                doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
                remark_code = remark_code,
                remark_description = None,
                import_date_key = import_date_key_utc_7,
                import_time_key = import_time_key_utc_7,
                export_date_key = export_date_key_utc_7,
                export_time_key = export_time_key_utc_7,
                import_timestamp = created_date_utc_7,
                export_timestamp = last_modified_utc_7,
            )
            datas.append(_obj)
        if datas != []:
            print(datas[0].__dict__)
        self.db.create([item.__dict__ for item in datas], self.schema, self.fact_document_table)
                            
    def fact_performance(self):
        datas = []
        data_performance = self.get_performance()
        for performance in data_performance:
            captured_date_timestamp = datetime.datetime.strptime(performance['captured_date'], '%d/%m/%Y')
            obj_ = FactPerformanceModel(
                    ori_id = func.bson_object_to_string(performance['_id']),  
                    project_id = self.project_id,  
                    group_id = performance['group_id'],  
                    document_id = performance['documentId'],  
                    reworked = performance['has_rework'],  
                    work_type_id = func.get_working_type_id_by_name(performance['work_type']),  
                    process_key = func.get_process_id_performance(performance['type']),  
                    number_of_record = performance['records'],  
                    user_name = performance['username'], 
                    ip = None, 
                    captured_date_timestamp = captured_date_timestamp,  
                    captured_date_key = func.time_to_date_key(captured_date_timestamp),  
                    captured_time_key = 0,  
                    total_time_second = performance['total_time']/100     
            )
            datas.append(obj_)
        if datas != []:
            print(datas[0].__dict__)
        self.db.create([item.__dict__ for item in datas], self.schema, self.fact_performancec_table)
    
    def check_connect(self):
        if self.environment == 'development':
            (status, content, time_run) = (True, "good!",  time.time()- self.start_run)
        else:
            client = MongoClient(self.uri, serverSelectionTimeoutMS= self.maxSevSelDelay)
            client.server_info()
            client.close()
            (status, content, time_run) = (True, "good!",  time.time()-self.start_run)
        print('check_connect done!')
        return {"status": status, "content": content, "time": time_run}
    
    def backup_performance(self):
        if self.environment == 'development':
            objects = pickle.load(open('./backup/performance/' + self.project_id + '.pickle', 'rb'))
            data_objects = [item for item in objects]
            handle = open('./backup_test/performance_' + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb')
            pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        else:
            client = MongoClient(self.uri)
            data_query = client[self.database_name][self.performance_collection_name].find(self.query)
            data_objects = [item for item in data_query]
            client.close()
            handle = open(self.backup_dir + self.project_backup_dir + self.project_performance_dir + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb')
            pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        print('backup_performance done!')
        
    def backup_docs(self):
        if self.environment == 'development':
            objects = pickle.load(open('./backup/docs/' + self.project_id + '.pickle', 'rb'))
            data_objects = [item for item in objects]
            handle = open('./backup_test/docs_' + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb')
            pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        else:
            client = MongoClient(self.uri)
            data_query = client[self.database_name][self.docs_collection_name].find(self.query)
            data_objects = [item for item in data_query]
            client.close()
            handle = open(self.backup_dir + self.project_backup_dir + self.project_docs_dir + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb')
            pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        print('backup_docs done!')
    
    def backup_trans(self):
        if self.environment == 'development':
            objects = pickle.load(open('./backup/trans/' + self.project_id + '.pickle', 'rb'))
            data_objects = [item for item in objects]
            handle = open('./backup_test/tran_' + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb')
            pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        else:
            client = MongoClient(self.uri)
            data_query = client[self.database_name][self.trans_collection_name].find(self.query)
            data_objects = [item for item in data_query]
            client.close()
            handle = open(self.backup_dir + self.project_backup_dir + self.project_trans_dir + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type , 'wb')
            pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        print('backup_trans done!')
    
    def report(self):
        print('report done!')
    
    def clean(self): 
        if self.environment == 'development' or self.environment == 'production':
            now = self.start - timedelta(days=1)
            file_name = str(now.strftime("%Y-%m-%d"))
            docs_file_path = self.backup_dir + self.project_backup_dir + self.project_docs_dir + file_name + self.backup_file_type
            trans_file_path = self.backup_dir + self.project_backup_dir + self.project_trans_dir + file_name + self.backup_file_type
            performance_file_path = self.backup_dir + self.project_backup_dir + self.project_performance_dir + file_name + self.backup_file_type
            if os.path.exists(performance_file_path):
                os.remove(performance_file_path)
            else:
                print("The performance_file_path does not exist")
            if os.path.exists(docs_file_path):
                os.remove(docs_file_path)
            else:
                print("The docs_file_path does not exist")
            if os.path.exists(trans_file_path):
                os.remove(trans_file_path)
            else:
                print("The trans_file_path does not exist")
        print('clean done!')
        
    def fact_data_extract(self):
        datas = []
        data_docs, data_trans = self.get_docs_and_trans()
        key_ignore = ("requestId", "caseId", "caseNumber", "created_date", "last_modified", "documentId", 'attachmentId', 
            'system_processing', 'system_processing123', 'hos_image_type', 'remarkCode', 'remarkDescription')
        key_ignore_trans = ("requestId", "caseId", "caseNumber", "created_date", "last_modified", "documentId", 'attachmentId', 
            'system_processing', 'system_processing123', 'hos_image_type', 'remarkCode', 'remarkDescription', 'Images', 'fileName')
#         for data in data_trans:
#             records = data['records'][0]
#             last_modified_utc_7 = data['last_modified'] + datetime.timedelta(hours = 7)
#             list_keys = list(records.keys())
#             list_values = list(records.values())
#             user_name = None
#             step_type = None
#             process_type = 'transform'
#             module_type = 'transform_data'
#             process_key = func.get_process_key(module_type, process_type, step_type)
#             for i in range(len(list_keys)):
#                 last_modified_date_key_utc_7, last_modified_time_key_utc_7 = func.handle_date_to_date_and_time_id(last_modified_utc_7)
#                 field_name_temp = list_keys[i]
#                 if field_name_temp in key_ignore_trans or field_name_temp.startswith('classify'):
#                     continue
#                 elif (field_name_temp.startswith('cl') or field_name_temp.startswith('ocr_')) and field_name_temp != 'claimNature':
#                     continue
#                 field_name = func.lower_first_string(field_name_temp)
#                 field_value = list_values[i]
#                 _obj = FactDataExtractionModel(
#                     project_id = self.project_id,
#                     document_id = func.bson_object_to_string(data['doc_id']),
#                     doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
#                     last_modified_date_key = last_modified_date_key_utc_7,
#                     last_modified_time_key = last_modified_time_key_utc_7,
#                     last_modified_timestamp = last_modified_utc_7,
#                     user_name = user_name,
#                     process_key = process_key,
#                     field_name = field_name,
#                     field_value = field_value
#                 )
#                 datas.append(_obj)
        for data in data_docs:
            if len(data['records']) == 0:
                continue
            meta_data = data['project_meta_data']
            last_modified_utc_7 = data['last_modified'] + datetime.timedelta(hours = 7)
            last_modified_date_key_utc_7, last_modified_time_key_utc_7 = func.handle_date_to_date_and_time_id(last_modified_utc_7)
            records = data['records'][0]
            for key, value in records.items():
                if key == 'keyed_data':
                    for keyed_data in value:
                        if (keyed_data['section'] == 'Auto_Extract' and keyed_data['source'] == 'queue_transform') or \
                            (keyed_data['section'] == 'Verify_Data' and keyed_data['source'] == 'queue_transform'):
                            process_type = func.fix_process_type_keyed_data(keyed_data['section'])
                            user_name = None
                            data_obj = keyed_data['data'][0]
                            list_keys = list(data_obj.keys())
                            list_values = list(data_obj.values())
                            for i in range(len(list_values)):
                                field_name_temp = list_keys[i]
                                step_type = func.fix_step_type_keyed_data(field_name_temp)
                                field_name = func.fix_field_name_keyed_data(field_name_temp)
                                if field_name in key_ignore:
                                    continue
                                field_value = list_values[i]['text']
                                process_key = func.get_process_key(module_type, process_type, step_type)                    
                                _obj = FactDataExtractionModel(
                                    project_id = self.project_id,
                                    document_id = func.bson_object_to_string(meta_data['documents'][0]['documentId']),
                                    doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
                                    last_modified_date_key = last_modified_date_key_utc_7,
                                    last_modified_time_key = last_modified_time_key_utc_7,
                                    last_modified_timestamp = last_modified_utc_7,
                                    user_name = user_name,
                                    process_key = process_key,
                                    field_name = field_name,
                                    field_value = field_value
                                )
                                datas.append(_obj)
#                 elif key == 'system_data':
#                     module_type = 'system_data'
#                     system_data = value[0]
#                     data_obj = system_data['data'][0]
#                     auto_qc_output_data = data_obj['auto_qc_output_data']
#                     user_name = None
#                     process_type = 'automaticQualityControl'
#                     step_type = None  
#                     process_key = func.get_process_key(module_type, process_type, step_type)
#                     if auto_qc_output_data != []:
#                         for item in auto_qc_output_data:
#                             field_name_temp = item['field_name']
#                             field_name = func.lower_first_string(field_name_temp)
#                             if field_name in key_ignore:
#                                 continue
#                             _obj = FactDataExtractionModel(
#                                 project_id = self.project_id,
#                                 document_id = func.bson_object_to_string(meta_data['documents'][0]['documentId']),
#                                 doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
#                                 last_modified_date_key = last_modified_date_key_utc_7,
#                                 last_modified_time_key = last_modified_time_key_utc_7,
#                                 last_modified_timestamp = last_modified_utc_7,
#                                 user_name = user_name,
#                                 process_key = process_key,
#                                 field_name = field_name,
#                                 field_value = 1
#                             )
#                             datas.append(_obj)

#                 elif key == 'qc_ed_data':
#                     module_type = 'qc_ed_data'
#                     qc_ed_data = value[0][0]
#                     if 'qc_fields_err' not in qc_ed_data.keys():
#                         pass
#                     else:
#                         data_obj = qc_ed_data['qc_fields_err']
#                         user_name = qc_ed_data['qcer']
#                         step_type = None
#                         process_type = func.fix_process_type_keyed_data(qc_ed_data['section'])
#                         process_key = func.get_process_key(module_type, process_type, step_type)
#                         for item in data_obj:
#                             field_name_temp = item['field']
#                             if field_name_temp in key_ignore:
#                                 continue
#                             field_name = func.lower_first_string(field_name_temp)
#                             field_value = item['value']['text']
#                             _obj = FactDataExtractionModel(
#                                 project_id = self.project_id,
#                                 document_id = func.bson_object_to_string(meta_data['documents'][0]['documentId']),
#                                 doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
#                                 last_modified_date_key = last_modified_date_key_utc_7,
#                                 last_modified_time_key = last_modified_time_key_utc_7,
#                                 last_modified_timestamp = last_modified_utc_7,
#                                 user_name = user_name,
#                                 process_key = process_key,
#                                 field_name = field_name,
#                                 field_value = field_value
#                             )
#                             datas.append(_obj)
#                 elif key == 'apr_ed_data':
#                     module_type = 'apr_ed_data'
#                     apr_ed_data = value[0][0]
#                     process_type = func.fix_process_type_keyed_data(apr_ed_data['section'])
#                     step_type = None
#                     process_key = func.get_process_key(module_type, process_type, step_type)
#                     data_obj = apr_ed_data['data']
#                     for item in data_obj:
#                         field_name_temp = item['field']
#                         if field_name_temp in key_ignore:
#                             continue
#                         field_name = func.lower_first_string(field_name_temp)
#                         field_value = item['value']['text']
#                         user_name = item['aper']
#                         _obj = FactDataExtractionModel(
#                             project_id = self.project_id,
#                             document_id = func.bson_object_to_string(meta_data['documents'][0]['documentId']),
#                             doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
#                             last_modified_date_key = last_modified_date_key_utc_7,
#                             last_modified_time_key = last_modified_time_key_utc_7,
#                             last_modified_timestamp = last_modified_utc_7,
#                             user_name = user_name,
#                             process_key = process_key,
#                             field_name = field_name,
#                             field_value = field_value
#                         )
#                         datas.append(_obj)
#                 elif key == 'final_data':
#                     module_type = 'final_data'
#                     final_data = value[0]
#                     user_name = None
#                     data_obj = final_data['data']
#                     process_type = 'finalize'
#                     for item in data_obj:
#                         field_name_temp = list(item.keys())[0]
#                         if field_name_temp in key_ignore or field_name_temp.startswith('cl') or field_name_temp.startswith('ocr_'):
#                             continue
#                         step_type = None
#                         field_name = func.fix_field_name_keyed_data(field_name_temp)
#                         field_value = list(item.values())[0]['text']
#                         process_key = func.get_process_key(module_type, process_type, step_type)
#                         _obj = FactDataExtractionModel(
#                             project_id = self.project_id,
#                             document_id = func.bson_object_to_string(meta_data['documents'][0]['documentId']),
#                             doc_set_id =  func.bson_object_to_string(data['doc_set_id']),
#                             last_modified_date_key = last_modified_date_key_utc_7,
#                             last_modified_time_key = last_modified_time_key_utc_7,
#                             last_modified_timestamp = last_modified_utc_7,
#                             user_name = user_name,
#                             process_key = process_key,
#                             field_name = field_name,
#                             field_value = field_value
#                         )
#                         datas.append(_obj)
#         if datas != []:
#             print(datas[0].__dict__)
        for data in datas:
            print(data.field_name, data.field_value)
        self.db.create([item.__dict__ for item in datas], self.schema, self.fact_data_extraction)    

In [5]:
db_connect = DatabaseConnect(uri = config.DWH_SQLALCHEMY_URI)
executor = EclaimsExecutor(
    environment=config.ENVIRONMENT,
    uri=config.ELROND_URI,
    database_name=config.ELROND_DATABASE,
    docs_collection_name= config.ECLAIMS_DOCS_COLLECTION, 
    trans_collection_name= config.ECLAIMS_TRANS_COLLECTION,
    performance_collection_name = config.ECLAIMS_PERFORMANCE_COLLECTION,
    db = db_connect
)
# executor.clean()
# executor.backup_docs()
# executor.backup_trans()
# executor.backup_performance()
# executor.fact_document()
executor.fact_data_extract()
# executor.fact_performance()
# executor.report()

NameError: name 'module_type' is not defined

In [None]:
dag_params = {
    'dag_id': "dwh_eclaims_project_daily_tmp",
    'start_date': datetime.datetime(2021, 1, 6, tzinfo=config.LOCAL_TIME_ZONE),
    'schedule_interval': '20 5 * * *'
}

dag = DAG(**dag_params)

clean = PythonOperator(task_id='clean', python_callable=executor.clean, dag=dag)
check_connect = PythonOperator(task_id='check_connect', python_callable=executor.check_connect, dag=dag)
backup_docs_json = PythonOperator(task_id='backup_docs_json', python_callable=executor.backup_docs_json, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_trans_json = PythonOperator(task_id='backup_trans_json', python_callable=executor.backup_trans_json, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_performance = PythonOperator(task_id='backup_performance', python_callable=executor.backup_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)


fact_performance = PythonOperator(task_id='fact_performance', python_callable=executor.fact_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
fact_document = PythonOperator(task_id='fact_document', python_callable=executor.fact_document, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)


report = PythonOperator(task_id='report', python_callable=executor.report, dag=dag, trigger_rule=TriggerRule.ALL_DONE)

clean >> check_connect >> [backup_trans_json, backup_docs_json, backup_performance]

fact_performance.set_upstream(backup_performance)
fact_document.set_upstream([backup_trans_json, backup_docs_json])

[fact_performance, fact_document] >> report

In [None]:
trans = pickle.load(open('./backup/trans/' + '5db5c87345052400142992e9' + '.pickle', 'rb'))
docs = pickle.load(open('./backup/docs/' + '5db5c87345052400142992e9' + '.pickle', 'rb'))
performance = pickle.load(open('./backup/performance/' + '5db5c87345052400142992e9' + '.pickle', 'rb'))
# keyed_data
# qc

# pprint(performance)
x = []
for data in docs:
    pprint(data)
    break
    ocr_results = data['records'][0]['system_data'][0]['ocr_data'][0]['ocr_results']
    for ocr_result in ocr_results:
        field_name = ocr_result['field_name']
        if field_name not in x: 
            x.append(field_name)
print(x)
['address', 'birthday', 'expiry', 'home_town', 'id', 'issue_at', 'issue_date', 'name', 'sex']