## SmoothNLP for Money NER

In [1]:
import json
import os

import smoothnlp
# Initialise SmoothNLP's JVM connection with the path to its bundled jar.
# The jar location is machine-specific, so it can be overridden by setting the
# SMOOTHNLP_JAR environment variable; when unset, the original path is used.
SMOOTHNLP_JAR = os.environ.get(
    "SMOOTHNLP_JAR",
    "/storm/smoothnlp/smoothnlp_maven/target/smoothnlp-0.1-jar-with-dependencies.jar",
)
smoothnlp.initJVMConnection(SMOOTHNLP_JAR)

In [2]:
# Sample Chinese notification-style sentences that each mention one or more
# money amounts. The comment above each assignment is a rough English gloss.

# "Your credit card spent a total of 2350.6 yuan last month."
sample_text1 = "您的信用卡上月总共支出2350.6元"
# "You are applying for a loan of 5000.00 yuan."
sample_text2 = "您正在申请借款5000.00元"
# "Your card ending in 9999 paid 1400.00 yuan online via a payment platform on 2019-01-01."
# NOTE(review): the doubled "的的" and "与" (probably meant "于") look like typos
# in the sample text; left as-is because the recorded outputs below were
# produced from this exact string.
sample_text3 = "您尾号9999的的卡与2019年1月1日支付平台网上支付1400.00元"
# "xxx sent you an IOU extension, amount 2000 yuan."
sample_text4 = "xxx 向您发了一笔欠条展期, 金额2000元"
# "Your savings card ending in 2222 made a card-not-present purchase of RMB 148.00
#  on Jan 1; remaining current-account balance is 475.25 yuan."
sample_text5 = "您尾号2222的储蓄卡在1月1日无卡自主消费支出人民币148.00元,活期余额剩余475.25元"
# "Your savings card ending in 2222 received a transfer of RMB 400.00;
#  remaining current-account balance is 1475.23 yuan."
sample_text6 = "您尾号2222的储蓄卡转账收入人民币400.00元, 活期余额剩余1475.23元"
# "You borrowed 300 yuan last month."
sample_text7 = "您上个月借了300元钱"

In [3]:
# Instantiate the money learner; later cells call m.transform(text) on each
# sample sentence.
m = smoothnlp.learner.MoneyLearner()

In [4]:
# m.transform returns JSON text, so parse it to display the extracted money
# entities for the first sample as Python objects.
json.loads(m.transform(sample_text1))

[{'targetCharStart': '17',
  'sourceCharStart': '9',
  'sourceCharEnd': '11',
  'entityType': 'MONEY',
  'moneyNormalizedAmount': '2350.6',
  'sourceTokenIndex': '5',
  'charEnd': '18',
  'sourceToken': '支出',
  'normalizedEntityTag': '¥2350.6',
  'targetToken': '元',
  'charStart': '11',
  'targetCharEnd': '18',
  'tokens': '[2350.6-8, 元-9]',
  'text': '2350.6元',
  'relationship': '动宾关系',
  'targetTokenIndex': '7'}]

In [5]:
# Run the money learner over every sample sentence. Each parsed entity is
# collected into res_list (consumed by the feature-engineering section) and
# its governing token and normalized amount are printed as a quick summary.
sample_texts = [
    sample_text1,
    sample_text2,
    sample_text3,
    sample_text4,
    sample_text5,
    sample_text6,
    sample_text7,
]

res_list = []
for s in sample_texts:
    # One sentence can yield several entities (e.g. an amount and a balance).
    money_res = json.loads(m.transform(s))
    for mres in money_res:
        res_list.append(mres)
        print(
            "关系动作: ",
            mres['sourceToken'],
            "   结构化金额: ",
            mres['moneyNormalizedAmount'],
        )

关系动作:  支出    结构化金额:  2350.6
关系动作:  申请    结构化金额:  5000
关系动作:  支付    结构化金额:  1400
关系动作:  发    结构化金额:  2000
关系动作:  消费    结构化金额:  148
关系动作:  剩余    结构化金额:  475.25
关系动作:  unknown    结构化金额:  400
关系动作:  剩余    结构化金额:  1475.23
关系动作:  借    结构化金额:  300


## Use Squeeze for Feature Engineering

In [6]:
import squeeze
from squeeze.transformer import MultiSequenceTransformer
# Short alias for the squeeze adaptor used below to convert SmoothNLP money-NER
# results into sequences.
money_ner_adaptor = squeeze.adaptor.smoothnlp_money_ner_adaptor

In [7]:
# Convert the flat list of entity dicts collected above into per-field
# sequences; the displayed result is keyed by field name, each entry holding
# a 'name' and a list of 'value's.
squeezed_seqs = money_ner_adaptor.convert2seqs(res_list)
squeezed_seqs

{'sourceToken': {'name': 'sourceToken',
  'value': ['支出', '申请', '支付', '发', '消费', '剩余', 'unknown', '剩余', '借']},
 'moneyNormalizedAmount': {'name': 'moneyNormalizedAmount',
  'value': [2350.6, 5000, 1400, 2000, 148, 475.25, 400, 1475.23, 300]}}

In [12]:
# Create the transformer that is fitted on, and applied to, the squeezed
# sequences in the next cell.
mst = MultiSequenceTransformer()

In [17]:
# Fit the transformer on the squeezed sequences, then transform the same
# sequences; the displayed result is a flat dict of '<field>.<feature>' values.
mst.fit(squeezed_seqs)
mst.transform(squeezed_seqs)

{'sourceToken.flucturate_rate': 0.0,
 'sourceToken.length': 9,
 'sourceToken.duplicates_count': 1,
 'sourceToken.percentage_of_most_reoccuring_value_to_all_values': 0.125,
 'sourceToken.percentage_of_most_reoocuring_value_to_all_datapoints': 0.2222222222222222,
 'sourceToken.ratio_value_number_to_seq_length': 0.8888888888888888,
 'sourceToken.uniqueCount': 8,
 'moneyNormalizedAmount.fist_location_of_max': 1,
 'moneyNormalizedAmount.fist_location_of_min': 4,
 'moneyNormalizedAmount.freq_of_max': 1,
 'moneyNormalizedAmount.freq_of_min': 1,
 'moneyNormalizedAmount.kurtosis': 3.114739,
 'moneyNormalizedAmount.last_location_of_max': 8,
 'moneyNormalizedAmount.last_location_of_min': 5,
 'moneyNormalizedAmount.length': 9,
 'moneyNormalizedAmount.max': 5000,
 'moneyNormalizedAmount.mean': 1505.4533333333334,
 'moneyNormalizedAmount.median': 1400,
 'moneyNormalizedAmount.median_mean_distance': 0.021733992855180004,
 'moneyNormalizedAmount.min': 148,
 'moneyNormalizedAmount.duplicates_count': 0,