In [1]:
import csv
import difflib
import os
import unicodedata
import json
import pandas as pd 
import numpy as np 
import requests
import re
from operator import itemgetter
from elasticsearch import helpers, Elasticsearch


def preprocess_data(csv_path):
    """Clean a supplier CSV and save the result beside it as ``data_<basename>``.

    The Japanese column headers listed in ``header_map`` are renamed to their
    English equivalents (other columns keep their header), newlines are removed
    from cells, missing cells become empty strings, and every cell is rewritten
    by a per-column rule:

    * ``tel`` / ``fax``: keep digit characters only, NFKC-normalise, then put
      a space between every character.
    * ``company_id``: NFKC-normalise only.
    * ``account``: reverse the string, NFKC-normalise, drop spaces, then put a
      space between every character.
    * any other column: NFKC-normalise, drop spaces, then put a space between
      every character.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(output_path, stem)`` where ``output_path`` is the written CSV and
        ``stem`` is the input file name without its extension.
    """
    header_map = {
        '名称 1': 'company_name_1',
        '名称 2': 'company_name_2',
        '名称 3': 'company_name_3',
        '名称 4': 'company_name_4',
        '地名': 'company_address',
        '電話番号': 'tel',
        'FAX番号': 'fax',
        '仕入先ｺｰﾄﾞ': 'company_id',
        '銀行コード': 'bank_branch_id',
        '口座番号': 'account',
        '種別': 'type_of_account',
        '口座名義人名': 'account_name',
    }

    def to_nfkc(value):
        return unicodedata.normalize('NFKC', str(value))

    def spaced(value):
        # Replacing the empty string inserts a space around every character;
        # strip() then removes the leading and trailing ones.
        return str(value).replace('', ' ').strip()

    def squeezed_then_spaced(value):
        return spaced(to_nfkc(value).replace(' ', ''))

    def phone_like(value):
        digits = ''.join(ch for ch in str(value) if ch.isdigit())
        return spaced(to_nfkc(digits))

    def reversed_account(value):
        # The account number is stored reversed.
        return squeezed_then_spaced(value[::-1])

    column_rules = {
        'tel': phone_like,
        'fax': phone_like,
        'company_id': to_nfkc,
        'account': reversed_account,
    }

    frame = pd.read_csv(
        csv_path,
        converters={'銀行コード': str, '郵便番号': str, '地名': str, '口座番号': str},
    )
    frame = frame.replace('\n', '', regex=True)
    frame = frame.fillna(value='', axis=1)
    frame = frame.rename(columns=header_map)

    for column in frame.columns:
        rule = column_rules.get(column, squeezed_then_spaced)
        frame[column] = frame[column].map(rule)

    folder, base_name = os.path.split(csv_path)
    stem = os.path.splitext(base_name)[0]
    output = os.path.join(folder, 'data_' + base_name)
    frame.to_csv(output, sep=',', encoding='utf-8', index=False)
    return output, stem

def load_json(json_path):
    """Read the JSON document at ``json_path`` and return the decoded object.

    The file is decoded explicitly as UTF-8 so that non-ASCII text (for
    example Japanese company names) loads the same way regardless of the
    platform's default encoding.

    Raises ``OSError`` if the file cannot be opened and
    ``json.JSONDecodeError`` if its content is not valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as json_data:
        return json.load(json_data)

class MizuhoElastic:
    """Fuzzy lookup of company and bank records stored in Elasticsearch.

    Each company field of the input is searched in Elasticsearch, the hits are
    re-scored with ``difflib.SequenceMatcher`` and the company id supported by
    the most fields (ties broken by total score) is used to fetch the stored
    record.

    Query values are spaced out character by character (``normalize_input``)
    and stored values are collapsed again (``normalize_output``) before the
    two are compared.
    """

    # Minimum SequenceMatcher ratio for a hit to be accepted as a match.
    MATCH_THRESHOLD = 0.7

    def __init__(self, debug = False, es = None):
        """Create the matcher.

        ``debug`` enables diagnostic printing of the normalised query.
        ``es`` optionally supplies an existing client object exposing
        ``search``; when omitted a client for ``localhost:9200`` is created.
        """
        self.es = es if es is not None else Elasticsearch([{'host': 'localhost', 'port': 9200}])
        self.company_result = {}
        self.bank_result = {}
        self.company_json = {}
        self.bank_json = {}
        # Set by search_company_fields(); None means "no company resolved yet",
        # which search_bank_fields() relies on.
        self.company_id = None
        self.debug = debug
        self.indexs = {
            'company_information': ['company_name', 'company_address', 'tel', 'fax'],
            'bank_information': ['banks']
        }

    def find_index(self, field):
        """Return the index name whose field list contains ``field``, or ``''``."""
        for index, fields in self.indexs.items():
            if field in fields:
                return index
        return ''

    def normalize_output(self, old_string):
        """Remove spaces from a stored value and NFKC-normalise it."""
        return unicodedata.normalize('NFKC', old_string.replace(' ', '').strip())

    def normalize_input(self, old_string):
        """NFKC-normalise a query value and put a space between every character."""
        new_string = unicodedata.normalize('NFKC', old_string.replace(' ', ''))
        new_string = unicodedata.normalize('NFKC', new_string.replace('', ' ').strip())
        return new_string

    def import_data(self, csv_path):
        """Preprocess ``csv_path`` and bulk-index its rows.

        The index name is the CSV file name without extension, as returned by
        ``preprocess_data``.
        """
        data_path, index = preprocess_data(csv_path)
        # preprocess_data writes UTF-8, so read it back as UTF-8; newline=''
        # is what the csv module expects for files it parses.
        with open(data_path, encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)
            helpers.bulk(self.es, reader, index=index, doc_type='mizuho')

    def search_a_field(self, index, type_field, value_field):
        """Match ``value_field`` against one field and score the hits.

        Returns a dict mapping ``company_id`` to the similarity ratio for
        every hit whose ratio exceeds ``MATCH_THRESHOLD``.
        """
        value_field = self.normalize_input(value_field)
        body = {"query": {"match": {"{}".format(type_field): '{}'.format(value_field)}}}
        res = self.es.search(index=index, doc_type='mizuho', body=body)
        wanted = self.normalize_output(value_field)
        _ids = {}
        for result in res['hits']['hits']:
            source = result['_source']
            candidate = self.normalize_output(source['{}'.format(type_field)])
            _score = difflib.SequenceMatcher(None, wanted, candidate).ratio()
            if _score > self.MATCH_THRESHOLD:
                _ids[source['company_id']] = _score
        return _ids

    def search_multiple_fields(self, index, type_field, value_field):
        """Match ``value_field`` against every field starting with ``type_field``.

        For each hit the best ratio over its matching fields is kept when it
        exceeds ``MATCH_THRESHOLD``. Returns ``{company_id: ratio}``.
        """
        value_field = self.normalize_input(value_field)
        if self.debug:
            print(value_field)
        body =  {"query": {"multi_match":{
            "fields": ["{}*".format(type_field)],
            "query" : "{}".format(value_field)
        }}}
        res = self.es.search(index=index, doc_type='mizuho', body=body)
        wanted = self.normalize_output(value_field)
        prefix = '{}'.format(type_field)
        _ids = {}
        for result in res['hits']['hits']:
            source = result['_source']
            _score = 0.0
            for _field, _value in source.items():
                if not _field.startswith(prefix):
                    continue
                ratio = difflib.SequenceMatcher(None, wanted, self.normalize_output(_value)).ratio()
                if ratio > self.MATCH_THRESHOLD and ratio > _score:
                    _score = ratio
                    _ids[source['company_id']] = _score
        return _ids

    def search_company_fields(self):
        """Replace the per-field candidates with the stored values of the best company.

        Picks the best company id via ``company_true_id``, remembers it in
        ``self.company_id``, fetches its record and overwrites each entry of
        ``self.company_result`` with the stored (normalised) value. For the
        company name the stored ``company_name*`` value closest to the input
        name is chosen. ``self.company_result`` is returned unchanged when
        there are no candidates or the record cannot be fetched.
        """
        if not any(self.company_result.values()):
            return self.company_result
        best = self.company_true_id()
        if best is None:
            return self.company_result
        self.company_id = best[0]
        body = {"query": {"term": {"company_id": "{}".format(self.company_id)}}}
        response = self.es.search(index='company_information', doc_type='mizuho', body= body)
        hits = response['hits']['hits']
        # Guard against an empty hit list instead of indexing [0] blindly.
        source = hits[0].get('_source') if hits else None
        if not source:
            return self.company_result

        # Compare like with like: both sides are normalised.
        wanted_name = self.normalize_output(self.company_json.get('company_name', ''))
        _score_company_name = 0
        # Iterate over a snapshot because entries are overwritten in the loop.
        for field in list(self.company_result):
            if field.startswith('company_name'):
                for field_result in [k for k in source.keys() if k.startswith('company_name')]:
                    candidate = self.normalize_output(source[field_result])
                    _score = difflib.SequenceMatcher(None, candidate, wanted_name).ratio()
                    if _score > _score_company_name:
                        self.company_result['company_name'] = candidate
                        _score_company_name = _score
                continue
            self.company_result[field] = self.normalize_output(source[field])
        return self.company_result

    def company_true_id(self):
        """Return ``(company_id, field_count, total_score)`` of the best candidate.

        Candidates are ranked by the number of fields that matched them, then
        by the sum of their scores. Returns ``None`` when there is no
        candidate. Entries of ``self.company_result`` that are not dicts
        (already resolved to a stored value) are ignored.
        """
        counts = {}
        totals = {}
        for matches in self.company_result.values():
            if not isinstance(matches, dict):
                continue
            for _id, _score in matches.items():
                counts[_id] = counts.get(_id, 0) + 1
                totals[_id] = totals.get(_id, 0) + _score
        if not counts:
            return None
        best_id = max(counts, key=lambda _id: (counts[_id], totals[_id]))
        return (best_id, counts[best_id], totals[best_id])

    def search_bank_fields(self):
        """Collect bank details for the resolved company.

        When ``self.company_id`` is set, every row stored for that company is
        written to ``self.bank_result`` as ``bank1``, ``bank2``, ... and the
        dict is printed. ``bank_branch_id`` is split into its first four
        characters (bank) and the rest (branch); the stored account string is
        reversed back.

        Otherwise each input bank entry's account number is reversed and
        queried.
        """
        if self.company_id:
            body = {"query": {"term": {"company_id": "{}".format(self.company_id)}}}
            result_from_id = self.es.search(index='company_information', doc_type='mizuho', body= body)
            result_from_id = result_from_id['hits']['hits']
            for _index, result in enumerate(result_from_id):
                source = result['_source']
                bank_branch = self.normalize_output(source['bank_branch_id'])
                self.bank_result['bank'+ str(_index+1)] = {
                    'bank': bank_branch[:4],
                    'branch': bank_branch[4:],
                    'type_of_account': self.normalize_output(source['type_of_account']),
                    'account': self.normalize_output(source['account'])[::-1],
                }
            print(self.bank_result)
        else:
            for bank in self.bank_json:
                entry = self.bank_json[bank]
                # Bank entries are dicts (bank/branch/type_of_account/account);
                # the account number is the part to reverse, not the dict.
                account = entry.get('account', '') if isinstance(entry, dict) else str(entry)
                _value_account = account[::-1]
                body = {"query": {"term": {"account": "{}".format(_value_account)}}}
                result_from_id = self.es.search(index='company_information', doc_type='mizuho', body= body)
                # TODO: the response is not consumed yet; this branch is unfinished.

    def run(self,json_input):
        """Split ``json_input`` into company and bank parts and resolve the company.

        Fields known to ``find_index`` as company fields are searched; all
        other entries are kept in ``self.bank_json``. The merged company
        result is printed.
        """
        print('SPLITTING SJON INPUT')
        # Start from a clean slate so values from an earlier run cannot leak
        # into this one.
        self.company_json = {}
        self.bank_json = {}
        self.company_result = {}
        self.bank_result = {}
        self.company_id = None
        for field, value_field in json_input.items():
            index = self.find_index(field)
            if index.startswith('company_information'):
                self.company_json[field] = value_field
                if field.startswith('company_name'):
                    self.company_result[field] = self.search_multiple_fields(index, 'company_name' ,value_field)
                    continue
                self.company_result[field] = self.search_a_field('company_information', field, value_field)
            else:
                self.bank_json[field] = value_field
        print(self.search_company_fields())

    def delete_elastic(self):
        """Issue an HTTP DELETE for ``localhost:9200/mizuho`` via ``curl``."""
        # NOTE(review): import_data() names the index after the CSV file and
        # uses 'mizuho' as the doc type, so this may not delete the imported
        # index - confirm the intended target.
        os.system('curl -XDELETE localhost:9200/mizuho')

In [713]:
# Sample input: four company fields (only the name is filled in) plus two
# bank entries, each with bank, branch, account type and account number.
json_input = {
    'company_name':'芙蓉オートリース株式会社',
    'company_address':'',
    'tel':'',
    'fax':'',
    'bank1': {
                 'bank': 'みずほ銀行',
                 'branch': '東京中央支店',
                 'type_of_account': '',
                 'account': '566',
             },
    'bank2': {
                 'bank': 'みずq',
                 'branch': '東京中央支店',
                 'type_of_account': '',
                 'account': '309',
             }
}
mizuho = MizuhoElastic(debug=True)
# Preprocess the CSV and bulk-index its rows.
# NOTE(review): absolute, machine-specific path - this cell only runs on the
# author's machine; consider a configurable data directory.
mizuho.import_data('/Users/duongthanh/Documents/Mizuho_2/company_information.csv')

In [714]:
# mizuho.search_a_field('company_information', 'tel', '03853304')

In [716]:
# Split the sample input into company and bank parts, search the company
# fields and print the merged company result.
mizuho.run(json_input)

SPLITTING SJON INPUT
芙 蓉 オ ー ト リ ー ス 株 式 会 社
{'company_name': '芙蓉オートリース株式会社', 'company_address': '〒102-0073九段北1―13―5ヒューリック九段ビル11階千代田区', 'tel': '0366852411', 'fax': ''}


In [717]:
# Inspect the company results kept on the instance.
mizuho.company_result

{'bank1': {'bank': 'みずほ銀行',
  'branch': '東京中央支店',
  'type_of_account': '',
  'account': '566'},
 'bank2': {'bank': 'みずq',
  'branch': '東京中央支店',
  'type_of_account': '',
  'account': '309'}}

In [718]:
# Look up bank details; when a company id has been resolved the method fills
# and prints mizuho.bank_result.
mizuho.search_bank_fields()

{'bank1': {'bank': '0001', 'branch': '125', 'type_of_account': '1', 'account': '1395309'}}


In [74]:
# Scratch experiment: a standalone client and a `regexp` query on the
# `account` field with the COMPLEMENT and INTERVAL operators enabled.
# A `wildcard` variant is kept below for comparison.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
body = {"query": {"regexp": {"account": {"value": "9 0 ~ 1","flags": "COMPLEMENT|INTERVAL"}}}}
# body={"query": {"wildcard" : { "account" : "9 0 3*1" }}}

In [75]:
# Run the scratch query held in `body` against the company_information index.
es.search(index='company_information', doc_type='mizuho', body= body)

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 0, 'max_score': None, 'hits': []}}

In [76]:
import re
# No match (the call returns None, hence no output): the pattern allows
# exactly four digits before the trailing '369', but this string has five.
re.match('^[0-9]{4}369$','12334369')

In [55]:
# Build an anchored pattern from (number of leading digits, required suffix).
condition = '^[0-9]{%d}%d$' %(3,1931); condition

'^[0-9]{3}1931$'