In [1]:
from pprint import pprint

import pickle
from pymongo import MongoClient

import time
from datetime import date, timedelta
import os
import datetime
from dateutil import tz
import pendulum

import config
import function as func

from schema.fact_document import FactDocumentModel
from schema.fact_performance import FactPerformanceModel
from db_connect import EngineConnect as DatabaseConnect

In [2]:
class MvlStpOcrExecutor:
    """Daily ETL executor for the ``148_191004_124_MVL_STP_OCR`` project.

    The executor snapshots three MongoDB collections (docs, trans and
    performance) into pickle files, turns those snapshots into
    ``FactDocumentModel`` / ``FactPerformanceModel`` rows and hands them to
    ``db.create``.  When ``environment == 'development'`` nothing is read from
    MongoDB: fixtures under ``./backup/<kind>/<project_id>.pickle`` are used
    instead and backups are written to ``./backup_test/``.

    Security note: snapshots are read back with ``pickle.load``, which can
    execute arbitrary code.  Only point this class at files it wrote itself.
    """

    def __init__(
        self,
        *kwargs,
        environment: str,
        uri: str,
        database_name: str,
        docs_collection_name: str,
        trans_collection_name: str,
        performance_collection_name: str,
        db: "DatabaseConnect"
    ):
        """Store connection settings and the project's fixed constants.

        Args:
            *kwargs: Extra positional arguments; accepted and ignored.  Its
                presence makes every following parameter keyword-only.
            environment: ``'development'`` selects the local-fixture code
                paths; any other value selects the MongoDB / storage paths.
            uri: MongoDB connection URI.
            database_name: MongoDB database holding the three collections.
            docs_collection_name: Name of the documents collection.
            trans_collection_name: Name of the transactions collection.
            performance_collection_name: Name of the performance collection.
            db: Warehouse connection; only its ``create(rows, schema, table)``
                method is used by this class.
        """
        self.environment = environment
        self.uri = uri
        self.database_name = database_name
        self.docs_collection_name = docs_collection_name
        self.trans_collection_name = trans_collection_name
        self.performance_collection_name = performance_collection_name
        self.db = db
        self.start_run = time.time()
        # Passed to MongoClient as serverSelectionTimeoutMS in check_connect.
        self.maxSevSelDelay = 20000
        self.start = config.start
        self.query = config.ECLAIMS_QUERY
        self.performance_query = config.MVL_STP_OCR_PERFORMANCE_QUERY
        self.project_id = '5db5c87345052400142992e9'
        self.project_name = '148_191004_124_MVL_STP_OCR'
        self.backup_dir = "/usr/local/airflow/storage/"
        self.project_backup_dir = '148_191004_124_MVL_STP_OCR/'
        self.project_docs_dir = 'docs/'
        self.project_trans_dir = 'trans/'
        self.project_performance_dir = 'performance/'
        self.backup_file_type = '.pickle'
        self.schema = config.DWH_ANALYTIC_SCHEMA
        self.fact_document_table = 'fact_document'
        # The attribute name is misspelled ("performancec"); it is kept as-is
        # because it is part of the instance's visible attributes.
        self.fact_performancec_table = 'fact_performance'

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _stamp(self, day=None) -> str:
        """Return ``day`` (default: ``self.start``) formatted as YYYY-MM-DD."""
        day = self.start if day is None else day
        return str(day.strftime("%Y-%m-%d"))

    def _storage_path(self, sub_dir: str, day=None) -> str:
        """Build the storage path of one snapshot file for ``day``."""
        return (
            self.backup_dir
            + self.project_backup_dir
            + sub_dir
            + self._stamp(day)
            + self.backup_file_type
        )

    def _fixture_path(self, kind: str) -> str:
        """Return the development fixture path for ``kind`` (docs/trans/performance)."""
        return './backup/' + kind + '/' + self.project_id + '.pickle'

    @staticmethod
    def _read_pickle(path: str):
        """Unpickle ``path``, always closing the file handle."""
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _write_pickle(path: str, payload) -> None:
        """Pickle ``payload`` to ``path``, always closing the file handle."""
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def _load_snapshot(self, kind: str, sub_dir: str) -> list:
        """Load the fixture (development) or today's snapshot as a list."""
        if self.environment == 'development':
            path = self._fixture_path(kind)
        else:
            path = self._storage_path(sub_dir)
        return list(self._read_pickle(path))

    def _fetch_collection(self, collection_name: str, query) -> list:
        """Run ``find(query)`` on one collection and return all results."""
        client = MongoClient(self.uri)
        try:
            return list(client[self.database_name][collection_name].find(query))
        finally:
            # Close the client even when the query fails.
            client.close()

    def _backup(self, kind: str, dev_prefix: str, sub_dir: str, collection_name: str, query) -> None:
        """Snapshot one collection to a pickle file.

        In development the fixture is copied to
        ``./backup_test/<dev_prefix><date>.pickle``; otherwise the collection
        is queried and written to the storage directory.
        """
        if self.environment == 'development':
            data_objects = list(self._read_pickle(self._fixture_path(kind)))
            target = './backup_test/' + dev_prefix + self._stamp() + self.backup_file_type
        else:
            data_objects = self._fetch_collection(collection_name, query)
            target = self._storage_path(sub_dir)
        self._write_pickle(target, data_objects)

    # ------------------------------------------------------------------
    # Snapshot readers
    # ------------------------------------------------------------------
    def get_docs_and_trans(self):
        """Return ``(docs, trans)`` as two lists loaded from the snapshots."""
        data_docs = self._load_snapshot('docs', self.project_docs_dir)
        data_trans = self._load_snapshot('trans', self.project_trans_dir)
        return data_docs, data_trans

    def get_performance(self):
        """Return the performance snapshot as a list."""
        return self._load_snapshot('performance', self.project_performance_dir)

    # ------------------------------------------------------------------
    # Fact builders
    # ------------------------------------------------------------------
    def fact_document(self):
        """Build one ``FactDocumentModel`` per transaction and store the rows.

        Transactions without records are skipped.  The import timestamp comes
        from the matching document's ``created_date`` (``None`` when
        ``func.created_date_of_docs_by_id`` finds nothing); the export
        timestamp comes from the transaction's ``last_modified``.  Both are
        shifted by +7 hours before the date/time keys are derived.
        """
        datas = []
        data_docs, data_trans = self.get_docs_and_trans()
        list_created = [
            {'doc_id': func.bson_object_to_string(data['_id']), 'created_date': data['created_date']}
            for data in data_docs
        ]
        for data in data_trans:
            if not data['records']:
                continue
            document_id = func.bson_object_to_string(data['doc_id'])
            created_date = func.created_date_of_docs_by_id(document_id, list_created)
            if created_date is None:
                import_date_key_utc_7 = import_time_key_utc_7 = created_date_utc_7 = None
            else:
                # NOTE(review): presumably the stored timestamps are UTC and
                # this converts them to UTC+7 - confirm.
                created_date_utc_7 = created_date + datetime.timedelta(hours=7)
                import_date_key_utc_7, import_time_key_utc_7 = func.handle_date_to_date_and_time_id(created_date_utc_7)
            last_modified_utc_7 = data['last_modified'] + datetime.timedelta(hours=7)
            export_date_key_utc_7, export_time_key_utc_7 = func.handle_date_to_date_and_time_id(last_modified_utc_7)
            datas.append(
                FactDocumentModel(
                    project_id=self.project_id,
                    document_id=document_id,
                    doc_set_id=func.bson_object_to_string(data['doc_set_id']),
                    remark_code=None,
                    # Only the first record's remark is stored.
                    remark_description=data['records'][0]['REMARK'],
                    import_date_key=import_date_key_utc_7,
                    import_time_key=import_time_key_utc_7,
                    export_date_key=export_date_key_utc_7,
                    export_time_key=export_time_key_utc_7,
                    import_timestamp=created_date_utc_7,
                    export_timestamp=last_modified_utc_7,
                )
            )
        if datas:
            # Show one sample row as a quick sanity check.
            print(datas[0].__dict__)
        self.db.create([item.__dict__ for item in datas], self.schema, self.fact_document_table)

    def fact_performance(self):
        """Build one ``FactPerformanceModel`` per performance entry and store the rows.

        ``captured_date`` is parsed as ``dd/mm/YYYY``; since it carries no time
        of day, ``captured_time_key`` is always 0.
        """
        datas = []
        for performance in self.get_performance():
            captured_date_timestamp = datetime.datetime.strptime(performance['captured_date'], '%d/%m/%Y')
            datas.append(
                FactPerformanceModel(
                    ori_id=func.bson_object_to_string(performance['_id']),
                    project_id=self.project_id,
                    group_id=performance['group_id'],
                    document_id=performance['documentId'],
                    reworked=performance['has_rework'],
                    work_type_id=func.get_working_type_id_by_name(performance['work_type']),
                    process_key=func.get_process_id_performance(performance['type']),
                    number_of_record=performance['records'],
                    user_name=performance['username'],
                    ip=None,
                    captured_date_timestamp=captured_date_timestamp,
                    captured_date_key=func.time_to_date_key(captured_date_timestamp),
                    captured_time_key=0,
                    # NOTE(review): the divisor is 100 although the target
                    # field is named "seconds" - confirm the unit of total_time.
                    total_time_second=performance['total_time'] / 100,
                )
            )
        if datas:
            # Show one sample row as a quick sanity check.
            print(datas[0].__dict__)
        self.db.create([item.__dict__ for item in datas], self.schema, self.fact_performancec_table)

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------
    def check_connect(self):
        """Check that MongoDB is reachable (skipped in development).

        Returns:
            dict: ``{"status": True, "content": "good!", "time": <seconds
            since the executor was constructed>}``.

        Raises:
            Whatever ``MongoClient.server_info`` raises when no server can be
            selected within ``maxSevSelDelay`` milliseconds.
        """
        if self.environment != 'development':
            client = MongoClient(self.uri, serverSelectionTimeoutMS=self.maxSevSelDelay)
            try:
                client.server_info()
            finally:
                # Release the connection even when the server is unreachable.
                client.close()
        result = {"status": True, "content": "good!", "time": time.time() - self.start_run}
        print('check_connect done!')
        return result

    def backup_performance(self):
        """Snapshot the performance collection to a pickle file."""
        # The dedicated performance query is used here.  Previously this
        # method passed self.query, leaving self.performance_query unused.
        self._backup(
            'performance',
            'performance_',
            self.project_performance_dir,
            self.performance_collection_name,
            self.performance_query,
        )
        print('backup_performance done!')

    def backup_docs(self):
        """Snapshot the docs collection to a pickle file."""
        self._backup('docs', 'docs_', self.project_docs_dir, self.docs_collection_name, self.query)
        print('backup_docs done!')

    def backup_trans(self):
        """Snapshot the trans collection to a pickle file."""
        # The development file prefix is 'tran_' (singular), as before.
        self._backup('trans', 'tran_', self.project_trans_dir, self.trans_collection_name, self.query)
        print('backup_trans done!')

    def report(self):
        """Final pipeline step; currently only prints a completion message."""
        print('report done!')

    def clean(self):
        """Delete the previous day's snapshots from the storage directory.

        Runs only when ``environment`` is ``'development'`` or
        ``'production'``.  A missing file is reported with a message and is
        not an error.
        """
        if self.environment in ('development', 'production'):
            previous_day = self.start - timedelta(days=1)
            targets = (
                ("performance_file_path", self.project_performance_dir),
                ("docs_file_path", self.project_docs_dir),
                ("trans_file_path", self.project_trans_dir),
            )
            for label, sub_dir in targets:
                stale_path = self._storage_path(sub_dir, previous_day)
                if os.path.exists(stale_path):
                    os.remove(stale_path)
                else:
                    print("The " + label + " does not exist")
        print('clean done!')
        

In [4]:
# Wire the executor to the warehouse connection and the project's MongoDB settings.
db_connect = DatabaseConnect(uri=config.DWH_SQLALCHEMY_URI)
executor = MvlStpOcrExecutor(
    db=db_connect,
    environment=config.ENVIRONMENT,
    uri=config.ELROND_URI,
    database_name=config.ELROND_DATABASE,
    docs_collection_name=config.MVL_STP_OCR_DOCS_COLLECTION,
    trans_collection_name=config.MVL_STP_OCR_TRANS_COLLECTION,
    performance_collection_name=config.MVL_STP_OCR_PERFORMANCE_COLLECTION,
)

# Run the pipeline once, in order: clean-up, snapshots, fact tables, report.
for _pipeline_step in (
    executor.clean,
    executor.backup_docs,
    executor.backup_trans,
    executor.backup_performance,
    executor.fact_document,
    executor.fact_performance,
    executor.report,
):
    _pipeline_step()

The performance_file_path does not exist
The docs_file_path does not exist
The trans_file_path does not exist
clean done!
backup_docs done!
backup_trans done!
backup_performance done!
{'project_id': '5db5c87345052400142992e9', 'document_id': '5ffa64ed474eb70010c3cc8b', 'doc_set_id': '5ffa64ed474eb70010c3cc8a', 'import_time_key': 92237, 'import_date_key': 20210110, 'export_time_key': 92405, 'export_date_key': 20210110, 'import_timestamp': datetime.datetime(2021, 1, 10, 9, 22, 37, 828000), 'export_timestamp': datetime.datetime(2021, 1, 10, 9, 24, 5, 906000), 'remark_code': None, 'remark_description': ''}
{'ori_id': '5dfccb6652439f0014026014', 'project_id': '5db5c87345052400142992e9', 'group_id': None, 'document_id': '5dfc2ef1691f5300101137cc', 'reworked': False, 'work_type_id': 1, 'process_key': 3, 'number_of_record': 2, 'user_name': 'hiennm', 'ip': None, 'captured_date_timestamp': datetime.datetime(2019, 12, 20, 0, 0), 'captured_date_key': 20191220, 'captured_time_key': 0, 'total_time_s

In [None]:
# Airflow DAG definition for the daily run of the executor defined above.
# NOTE(review): DAG, PythonOperator and TriggerRule are not imported anywhere
# in the visible notebook; they must be imported from Airflow before this cell
# can run.
dag_params = {
    'dag_id': "dwh_mvl_stp_ocr_project_daily_tmp",
    'start_date': datetime.datetime(2021, 1, 6, tzinfo=config.LOCAL_TIME_ZONE),
    'schedule_interval': '20 5 * * *'
}

dag = DAG(**dag_params)

clean = PythonOperator(task_id='clean', python_callable=executor.clean, dag=dag)
check_connect = PythonOperator(task_id='check_connect', python_callable=executor.check_connect, dag=dag)
# The executor exposes backup_docs / backup_trans; it has no *_json methods.
# The task ids keep their "_json" suffix.
backup_docs_json = PythonOperator(task_id='backup_docs_json', python_callable=executor.backup_docs, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_trans_json = PythonOperator(task_id='backup_trans_json', python_callable=executor.backup_trans, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_performance = PythonOperator(task_id='backup_performance', python_callable=executor.backup_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)


fact_performance = PythonOperator(task_id='fact_performance', python_callable=executor.fact_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
fact_document = PythonOperator(task_id='fact_document', python_callable=executor.fact_document, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)


# ALL_DONE: the report task runs once both fact tasks have finished, whether
# or not they succeeded.
report = PythonOperator(task_id='report', python_callable=executor.report, dag=dag, trigger_rule=TriggerRule.ALL_DONE)

# clean -> check_connect -> the three snapshot tasks in parallel.
clean >> check_connect >> [backup_trans_json, backup_docs_json, backup_performance]

# Each fact task waits only for the snapshots it reads.
fact_performance.set_upstream(backup_performance)
fact_document.set_upstream([backup_trans_json, backup_docs_json])

[fact_performance, fact_document] >> report

In [None]:
# Scratch cell: inspect the development fixtures and list the distinct OCR
# field names found in them.
# pickle.load can execute arbitrary code - only load fixtures you trust.
def _load_fixture(kind):
    """Unpickle ./backup/<kind>/<project id>.pickle, closing the file afterwards."""
    with open('./backup/' + kind + '/' + '5db5c87345052400142992e9' + '.pickle', 'rb') as handle:
        return pickle.load(handle)


trans = _load_fixture('trans')
docs = _load_fixture('docs')
performance = _load_fixture('performance')
# keyed_data
# qc

# pprint(performance)

# Peek at the first entry only, to see the shape of the data.
for data in docs:
    pprint(data)
    break

# Collect the distinct OCR field names in order of first appearance.  This
# used to sit after the `break` above and therefore never ran.
x = []
for data in docs:
    try:
        ocr_results = data['records'][0]['system_data'][0]['ocr_data'][0]['ocr_results']
    except (KeyError, IndexError, TypeError):
        # NOTE(review): entries without the nested OCR structure are skipped;
        # confirm whether `docs` or `trans` is the collection that carries it.
        continue
    for ocr_result in ocr_results:
        field_name = ocr_result['field_name']
        if field_name not in x:
            x.append(field_name)
print(x)
['address', 'birthday', 'expiry', 'home_town', 'id', 'issue_at', 'issue_date', 'name', 'sex']