## SmoothNLP for Money NER

In [1]:
import json
import os

import smoothnlp

# Path to the SmoothNLP "jar-with-dependencies" build handed to the JVM bridge.
# The default is the location this notebook was originally run with; set the
# SMOOTHNLP_JAR environment variable to use a different build without editing
# the notebook (an unset or empty variable falls back to the default).
SMOOTHNLP_JAR = (
    os.environ.get("SMOOTHNLP_JAR")
    or "/storm/smoothnlp/smoothnlp_maven/target/smoothnlp-0.1-jar-with-dependencies.jar"
)

# NOTE(review): presumably this must run once before any smoothnlp learner is
# constructed -- confirm against the smoothnlp documentation.
smoothnlp.initJVMConnection(SMOOTHNLP_JAR)

In [2]:
# Six sample Chinese banking / lending notification messages used as input for
# money-entity extraction. The strings are kept verbatim: they are model input,
# so even a one-character change could alter the extracted entities.
(
    sample_text1,
    sample_text2,
    sample_text3,
    sample_text4,
    sample_text5,
    sample_text6,
) = (
    "您的信用卡上月总共支出2350.6元",
    "您正在申请借款5000.00元",
    "您尾号9999的的卡与2019年1月1日支付平台网上支付1400.00元",
    "xxx 向您发了一笔欠条展期, 金额2000元",
    "您尾号2222的储蓄卡在1月1日无卡自主消费支出人民币148.00元,活期余额剩余475.25元",
    "您尾号2222的储蓄卡转账收入人民币400.00元, 活期余额剩余1475.23元",
)

In [3]:
# Money-entity learner from SmoothNLP; the cells below call `m.transform(text)`
# on each sample message.
m = smoothnlp.learner.MoneyLearner()

In [4]:
# `m.transform` returns JSON-encoded text, so decode it before display. For this
# sample the decoded value is a list with one dict per detected money entity
# (character offsets, normalized amount, governing token, relationship, ...).
json.loads(m.transform(sample_text1))

[{'charStart': '11',
  'entityType': 'MONEY',
  'moneyNormalizedAmount': '2350.6',
  'charEnd': '18',
  'sourceToken': '支出',
  'tokens': '[2350.6-8, 元-9]',
  'normalizedEntityTag': '¥2350.6',
  'text': '2350.6元',
  'relationship': '动宾关系'}]

In [5]:
# Run the money extractor over every sample message. Each detected entity is
# accumulated in `res_list` (consumed later by the squeeze adaptor) and echoed
# as "<governing token> / <normalized amount>" for a quick visual check.
sample_texts = [
    sample_text1,
    sample_text2,
    sample_text3,
    sample_text4,
    sample_text5,
    sample_text6,
]

# Rebuilt from scratch on every run so re-executing the cell never duplicates entries.
res_list = []

for s in sample_texts:
    # One message may yield several money entities (or none).
    money_res = json.loads(m.transform(s))
    for mres in money_res:
        res_list.append(mres)
        print(
            "关系动作: ",
            mres['sourceToken'],
            "   结构化金额: ",
            mres['moneyNormalizedAmount'],
        )

关系动作:  支出    结构化金额:  2350.6
关系动作:  申请    结构化金额:  5000
关系动作:  支付    结构化金额:  1400
关系动作:  发    结构化金额:  2000
关系动作:  消费    结构化金额:  148
关系动作:  剩余    结构化金额:  475.25
关系动作:  unknown    结构化金额:  400
关系动作:  剩余    结构化金额:  1475.23


## Use Squeeze for Feature Engineering

In [8]:
import squeeze
from squeeze import SequenceTransformer
# Short alias for the squeeze adaptor that understands SmoothNLP money-NER
# records; its `convert2seqs` method is applied to `res_list` in a later cell.
money_ner_adaptor = squeeze.adaptor.smoothnlp_money_ner_adaptor

In [7]:
# Pivot the list of per-entity dicts into one sequence per field. As the
# displayed result shows, the output is keyed by field name and each entry
# carries a 'name' and a 'value' list ordered like `res_list`.
squeezed_seqs = money_ner_adaptor.convert2seqs(res_list)
squeezed_seqs

{'sourceToken': {'name': 'sourceToken',
  'value': ['支出', '申请', '支付', '发', '消费', '剩余', 'unknown', '剩余']},
 'moneyNormalizedAmount': {'name': 'moneyNormalizedAmount',
  'value': [2350.6, 5000, 1400, 2000, 148, 475.25, 400, 1475.23]}}

In [9]:
# Default-configured transformer, reused below for both the numeric amount
# sequence and the categorical token sequence.
st = SequenceTransformer()

In [10]:
# Summarize the numeric amount sequence into a flat feature dict; every key in
# the displayed output is prefixed with the sequence name.
st.transform(squeezed_seqs['moneyNormalizedAmount'])

{'moneyNormalizedAmount.fist_location_of_max': 1,
 'moneyNormalizedAmount.fist_location_of_min': 4,
 'moneyNormalizedAmount.freq_of_max': 1,
 'moneyNormalizedAmount.freq_of_min': 1,
 'moneyNormalizedAmount.kurtosis': 2.825035,
 'moneyNormalizedAmount.last_location_of_max': 7,
 'moneyNormalizedAmount.last_location_of_min': 4,
 'moneyNormalizedAmount.length': 8,
 'moneyNormalizedAmount.max': 5000,
 'moneyNormalizedAmount.mean': 1656.135,
 'moneyNormalizedAmount.median': 1437.615,
 'moneyNormalizedAmount.median_mean_distance': 0.045037098103874684,
 'moneyNormalizedAmount.min': 148,
 'moneyNormalizedAmount.duplicates_count': 0,
 'moneyNormalizedAmount._number_peaks': 3,
 'moneyNormalizedAmount.percentage_below_mean': 0.625,
 'moneyNormalizedAmount.percentage_of_most_reoccuring_value_to_all_values': 0.125,
 'moneyNormalizedAmount.percentage_of_most_reoocuring_value_to_all_datapoints': 0.125,
 'moneyNormalizedAmount.ratio_value_number_to_seq_length': 1.0,
 'moneyNormalizedAmount.skewness': 

In [11]:
# Same transformer applied to the string-valued token sequence; the displayed
# output contains count/uniqueness-style features rather than numeric statistics.
st.transform(squeezed_seqs['sourceToken'])

{'sourceToken.flucturate_rate': 0.0,
 'sourceToken.length': 8,
 'sourceToken.duplicates_count': 1,
 'sourceToken.percentage_of_most_reoccuring_value_to_all_values': 0.14285714285714285,
 'sourceToken.percentage_of_most_reoocuring_value_to_all_datapoints': 0.25,
 'sourceToken.ratio_value_number_to_seq_length': 0.875,
 'sourceToken.uniqueCount': 7}