# 第一部分

In [1]:
import codecs
import jieba
import jieba.analyse
import jieba.posseg
import pandas as pd
import re

import param
import util

## 定义分词函数

In [2]:
pattern = re.compile('[&]\w*[;]')
def split_word(text, stopwords):
    word_list = jieba.cut(re.sub(pattern,'', text))
    start = True
    result = ''
    for word in word_list:
        word = word.strip()
        if word not in stopwords:
            if start is True:
                result = word
                start = False
            else:
                result += ' ' + word
    return result


## 加载停用词

In [3]:
stopwords = {}
for line in codecs.open(param.data_path+'/input/stop.txt', encoding='utf-8'):
    stopwords[line.rstrip()] = 1
    

## 加载数据 并分词

In [9]:
df_tr = []
for i, line in enumerate(open(param.data_path+'/input/train.txt', encoding='utf-8')):
    if i % 1000 == 1:
        util.log('train data: iter = %d' % i)
    line = re.sub(re.compile('[&]\w*[;]'),'', line)
    segs = line.split('\t')
    if i ==1:
        print(segs[1]) # 看一下数据是否乱码
    row = {}
    row['id'] = segs[0]
    row['content'] = split_word(segs[1].strip(), stopwords)
    row['penalty'] = segs[2]
    row['laws'] = segs[3].strip()
    df_tr.append(row)
#     if i == 1000: # 调试用
#         break
    
df_tr = pd.DataFrame(df_tr)


df_te = []
for i, line in enumerate(open(param.data_path+'/input/test.txt', encoding='utf-8')):
    if i % 1000 == 1:
        util.log('test data: iter = %d ' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['content'] = split_word(segs[1].strip(), stopwords)
    df_te.append(row)
#     if i == 2000: # 调试用
#         break
    
df_te = pd.DataFrame(df_te)

print('train data shape: ', df_tr.shape)
print('test data shape: ', df_te.shape)
print(df_tr.info())
df_tr.head()

2019-04-03 15:08:31 train data: iter = 1
公诉机关北京市昌平区人民检察院。被告人呼，男，27岁（1986年11月10日出生）。因涉嫌犯盗窃罪于2014年6月20日被羁押，同年7月3日被逮捕，现羁押于北京市昌平区看守所。北京市昌平区人民检察院以京昌检公诉刑诉（2014）610号起诉书指控被告人呼犯盗窃罪，于2014年7月22日向本院提起公诉。本院依法适用简易程序，实行独任审判，公开开庭进行了审理。北京市昌平区人民检察院指派检察员夏文广出庭支持公诉。被告人呼到庭参加诉讼。本案现已审理终结。北京市昌平区人民检察院起诉书指控：被告人呼于2014年6月20日16时50分许，进入昌平区东小口镇兴旺地市场南门西侧赵烟酒店内，趁四周无人，将店内货架抽屉内的人民币5700余元及5000余元的手机充值卡窃走。赵发现后随即追赶，被告人呼在逃跑过程中分两次将所窃钱、卡抛撒，后被抓获。赵在追赶过程中捡拾被盗人民币1587元，手机充值卡2960元。上述事实，被告人呼在庭审过程中未提出异议，并有经过庭审质证、认证的被害人赵的陈述，证人李、郭的证言，辨认笔录，公安机关出具的接报案和到案经过，照片，工作说明，身份证明材料，光盘，被告人呼的供述等证据在案佐证，足以认定。
2019-04-03 15:08:38 train data: iter = 1001
2019-04-03 15:08:46 train data: iter = 2001
2019-04-03 15:08:52 train data: iter = 3001
2019-04-03 15:08:59 train data: iter = 4001
2019-04-03 15:09:06 train data: iter = 5001
2019-04-03 15:09:13 train data: iter = 6001
2019-04-03 15:09:20 train data: iter = 7001
2019-04-03 15:09:27 train data: iter = 8001
2019-04-03 15:09:34 train data: iter = 9001
2019-04-03 15:09:40 train data: iter = 10001
2019-04-03 15

Unnamed: 0,content,id,laws,penalty
0,原 公诉 机关 榆阳区 人民检察院 上诉人 （ 原审 被告人 ） 刘某 男 汉族 陕西省 横...,60,2242526275272,7
1,公诉 机关 北京市 昌平区 人民检察院 被告人 呼 男 27 岁 （ 1986 年 11 月...,93,26467525364,1
2,公诉 机关 平湖市 人民检察院 被告人 高某 无业 本案 2012 年 月 12 日 平湖市...,107,26726425696772,4
3,公诉 机关 平湖市 人民检察院 被告人 张某 无业 本案 2011 年 10 月 12 日 ...,114,30325276772,4
4,公诉 机关 平湖市 人民检察院 被告人 闫明云 无业 盗窃 2001 年 月 上海市 劳动教...,126,2642565,7


## 写出数据

In [10]:
df_all = pd.concat([df_tr, df_te]).fillna(0)
df_all.to_csv(param.data_path+'/output/corpus/all_data.csv', index=None)

# 第二部分

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import param
import util


In [2]:
%config ZMQInteractiveShell.ast_node_interactivity = 'all'

## 定义评估函数

In [3]:
def micro_avg_f1(y_true, y_pred):
    # 关于f1_score参数average的解释：https://www.cnblogs.com/techengin/p/8962024.html
#     try:
#         f1_score = f1_score(y_true, y_pred, average='micro')
#     except :
#         print(len(y_true), len(y_pred))
#         return 
    return f1_score(y_true, y_pred, average='micro')

## 加载数据

In [4]:
# 注意all_data.csv文件的编码要转为utf-8。直接用notepad++转码就可以
df_all = pd.read_csv(param.data_path+'/output/corpus/all_data.csv', sep=',')


In [5]:
df_all.head()
# df_all['penalty'] = df_all['penalty'] - 1
df_all.info()

Unnamed: 0,content,id,laws,penalty
0,原 公诉 机关 榆阳区 人民检察院 上诉人 （ 原审 被告人 ） 刘某 男 汉族 陕西省 横...,60,2242526275272,7
1,公诉 机关 北京市 昌平区 人民检察院 被告人 呼 男 27 岁 （ 1986 年 11 月...,93,26467525364,1
2,公诉 机关 平湖市 人民检察院 被告人 高某 无业 本案 2012 年 月 12 日 平湖市...,107,26726425696772,4
3,公诉 机关 平湖市 人民检察院 被告人 张某 无业 本案 2011 年 10 月 12 日 ...,114,30325276772,4
4,公诉 机关 平湖市 人民检察院 被告人 闫明云 无业 盗窃 2001 年 月 上海市 劳动教...,126,2642565,7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 4 columns):
content    70000 non-null object
id         70000 non-null int64
laws       70000 non-null object
penalty    70000 non-null int64
dtypes: int64(2), object(2)
memory usage: 2.1+ MB


In [6]:
df_all['penalty'].unique()

array([7, 1, 4, 8, 2, 6, 5, 3, 0], dtype=int64)

## tf-idf

In [7]:
# 参数解释：https://blog.csdn.net/blmoistawinde/article/details/80816179
tfv = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
x_sp = tfv.fit_transform(df_all['content'])
x_sp.shape


(70000, 114149)

In [10]:
# print(x_sp[0].toarray()) #第1行每个词的tf_idf值。
print(tfv.get_feature_names())

['00', '000', '0000', '000005', '000007', '000008', '00001', '00002', '00003', '00004', '00005', '00006', '00007', '00008', '00009', '0001', '00010', '00011', '00012', '00013', '00014', '00015', '0001508', '00016', '00017', '00018', '00019', '0002', '00020', '00021', '00022', '00023', '00024', '00027', '00028', '00029', '0003', '00030', '00031', '00032', '00033', '00034', '00035', '00036', '00037', '00038', '00039', '0004', '00040', '00041', '00042', '00043', '00044', '00045', '00046', '00047', '00048', '00049', '0005', '00050', '00051', '00052', '00053', '00054', '00055', '00056', '00057', '00059', '0006', '00060', '00061', '00062', '00063', '00065', '00066', '00067', '00068', '00069', '0007', '00070', '00071', '00072', '00073', '00074', '00076', '00077', '00078', '00079', '0008', '00080', '00081', '00084', '00085', '00086', '00087', '00088', '00089', '0009', '00090', '00092', '00093', '00094', '00095', '00096', '00097', '0009729', '00099', '001', '0010', '00100', '00102', '00103', '0

In [13]:
print(x_sp[:5].toarray())


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
df_all['penalty'].value_counts()
# 0为填充的缺失值，数量正好是30000，应该就是测试集的标签全被设置为0了

0    30000
7     9435
2     6782
6     6440
1     5548
3     4878
5     4651
4     2019
8      247
Name: penalty, dtype: int64

## lr stacking

In [9]:
tr_num = param.train_num
num_class = len(df_all['penalty'].value_counts()) - 1 # 我们填充的缺失值为0，相当于多一个填充标签
n = 5

x = x_sp[:tr_num]              # 从tf-idf特征向量中将属于训练集的特征分出来
y = df_all['penalty'][:tr_num] # 标签/实际值
x_te = x_sp[tr_num:]           # 测试集
# y_te = df_all['penalty'][tr_num:]

print(x.shape, y.shape, x_te.shape)

stack_tr = np.zeros((x.shape[0], num_class), dtype=float)     # (40000, 8)
stack_te = np.zeros((x_te.shape[0], num_class), dtype=float) # (30000, 8)

score_va = 0


(40000, 114149) (40000,) (30000, 114149)


In [38]:
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=param.seed)

for i, (tr, va) in enumerate(skf.split(x, y)):
    if i == 0:
        print('(tr_index, va_index): ', (tr, va))
        print('(x[tr_index], x[va_index]): ', (x[tr], x[va]))
        print('=============================================')
    util.log('stack_tr: %d %d' % ((i+1), n))
    clf = LogisticRegression(C=2)
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc: %f' % micro_avg_f1(y[tr], clf.predict(x[tr])))
    
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    print('score_va:', score_va)
    
    stack_tr[va] += y_pred_va  # 因为每一折验证集所取到的样本都是随机的，所以采用这种索引定位的方式累加预测概率值
    stack_te += y_pred_te
    
score_va /= n
util.log('va avg acc :%.f' % score_va)


(tr_index, va_index):  (array([    0,     1,     2, ..., 39997, 39998, 39999]), array([    8,    14,    20, ..., 39988, 39994, 39996]))
(x[tr_index], x[va_index]):  (<31997x114149 sparse matrix of type '<class 'numpy.float64'>'
	with 5361053 stored elements in Compressed Sparse Row format>, <8003x114149 sparse matrix of type '<class 'numpy.float64'>'
	with 1348387 stored elements in Compressed Sparse Row format>)
2019-04-05 10:26:28 stack: 1 5




LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

2019-04-05 10:26:42 va acc: 0.764322
score_va: 0.44433337498438086
2019-04-05 10:26:42 stack: 2 5


LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

2019-04-05 10:26:57 va acc: 0.762110
score_va: 0.8794745896807068
2019-04-05 10:26:57 stack: 3 5


LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

2019-04-05 10:27:10 va acc: 0.763125
score_va: 1.3110995896807067
2019-04-05 10:27:10 stack: 4 5


LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

2019-04-05 10:27:24 va acc: 0.764640
score_va: 1.7463333981328197
2019-04-05 10:27:24 stack: 5 5


LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

2019-04-05 10:27:37 va acc: 0.761866
score_va: 2.1823719125757357
2019-04-05 10:27:38 va avg acc :0


In [40]:
stack_te /= n
stack_all = np.vstack([stack_tr, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all))) # (70000, 8)

for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:, i]
    
df_stack.to_csv(param.data_path+'/output/feature/tfidf/lr_prob_21w.csv', index=None, encoding='utf-8')

## BernoulliNB stack

In [14]:
tr_num = param.train_num
num_class = len(df_all['penalty'].value_counts()) - 1 # 我们填充的缺失值为0，相当于多一个填充标签
n = 5

x = x_sp[:tr_num]              # 从tf-idf特征向量中将属于训练集的特征分出来
y = df_all['penalty'][:tr_num] # 标签/实际值
x_te = x_sp[tr_num:]           # 测试集
# y_te = df_all['penalty'][tr_num:]

print(x.shape, y.shape, x_te.shape)

stack_tr = np.zeros((x.shape[0], num_class), dtype=float)     # (40000, 8)
stack_te = np.zeros((x_te.shape[0], num_class), dtype=float) # (30000, 8)

score_va = 0


(40000, 114149) (40000,) (30000, 114149)


In [16]:
from sklearn.naive_bayes import BernoulliNB

skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=param.seed)
for i, (tr, va) in enumerate(skf.split(x, y)):
    util.log('stack_tr: %d %d' % ((i+1), n))
    clf = BernoulliNB()
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te) # (30000, 8)
    util.log('va acc: %f' % micro_avg_f1(y[va], clf.predict(x[va])))
#     util.log('te acc: %f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    
    stack_tr[va] += y_pred_va  # 因为每一折验证集所取到的样本都是随机的，所以采用这种索引定位的方式累加预测概率值
    stack_te += y_pred_te
    print('===================================================================')
    
score_va /= n

util.log('va avg acc: %f' % score_va)

stack_te /= n

stack_all = np.vstack([stack_tr, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_bnb_{}'.format(i)] = stack_all[:, i]
    
df_stack.to_csv(param.data_path+'/output/feature/tfidf/nbn_prob_21w.csv', index=None, encoding='utf-8')

2019-04-05 15:36:41 stack_tr: 1 5


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

2019-04-05 15:36:42 va acc: 0.329376
2019-04-05 15:36:42 stack_tr: 2 5


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

2019-04-05 15:36:42 va acc: 0.326793
2019-04-05 15:36:42 stack_tr: 3 5


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

2019-04-05 15:36:43 va acc: 0.317000
2019-04-05 15:36:43 stack_tr: 4 5


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

2019-04-05 15:36:44 va acc: 0.330833
2019-04-05 15:36:44 stack_tr: 5 5


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

2019-04-05 15:36:44 va acc: 0.328373
2019-04-05 15:36:44 va avg acc: 0.391770


## MultinomialNB stack

In [17]:
from sklearn.naive_bayes import MultinomialNB

tr_num = param.train_num
num_class = len(df_all['penalty'].value_counts()) - 1 # 我们填充的缺失值为0，相当于多一个填充标签
n = 5

x = x_sp[:tr_num]              # 从tf-idf特征向量中将属于训练集的特征分出来
y = df_all['penalty'][:tr_num] # 标签/实际值
x_te = x_sp[tr_num:]           # 测试集
# y_te = df_all['penalty'][tr_num:]

print(x.shape, y.shape, x_te.shape)

stack_tr = np.zeros((x.shape[0], num_class), dtype=float)     # (40000, 8)
stack_te = np.zeros((x_te.shape[0], num_class), dtype=float) # (30000, 8)

score_va = 0

# ===================================================================================
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=param.seed)
for i, (tr, va) in enumerate(skf.split(x, y)):
    util.log('stack_tr:%d/%d' % ((i+1), n))
    clf = MultinomialNB()
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    util.log('va acc: %f' % micro_avg_f1(y[va], clf.predict(x[va])))
    
    stack_tr[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
util.log('va avg acc:%f' % score_va)
stack_te /= n

stack_all = np.vstack([stack_tr, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_mnb_{}'.format(i)] = stack_all[:, i]
    
df_stack.to_csv(param.data_path+'/output/feature/tfidf/mnb_prob_21w.csv', index=None, encoding='utf-8')

(40000, 114149) (40000,) (30000, 114149)
2019-04-05 15:51:46 stack_tr:1/5


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

2019-04-05 15:51:46 va acc: 0.367987
2019-04-05 15:51:46 stack_tr:2/5


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

2019-04-05 15:51:47 va acc: 0.366908
2019-04-05 15:51:47 stack_tr:3/5


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

2019-04-05 15:51:47 va acc: 0.364875
2019-04-05 15:51:47 stack_tr:4/5


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

2019-04-05 15:51:47 va acc: 0.362591
2019-04-05 15:51:47 stack_tr:5/5


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

2019-04-05 15:51:48 va acc: 0.365137
2019-04-05 15:51:48 va avg acc:0.365500


## LinearSVC stack

# amt特征（amount 金额）

In [2]:
import re
import pandas as pd
import numpy as np
import param
import util

%config ZMQInteractiveShell.ast_node_interactivity='all'

In [5]:
pattern = re.compile('[&]\w*[;]')

df_tr = []
util.log('For train.txt: ')
for i, line in enumerate(open(param.data_path+'/input/train.txt', encoding='utf-8')):
    if i % 10000 ==1:
        util.log('iter = %d' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['raw_content'] = re.sub(pattern, '', segs[1]).strip()
    df_tr.append(row)
df_tr = pd.DataFrame(df_tr)

df_te = []
util.log('For test.txt: ')
for i, line in enumerate(open(param.data_path+'/input/test.txt', encoding='utf-8')):
    if i % 10000 == 1:
        util.log('iter = %d' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['raw_content'] = re.sub(pattern, '', segs[1]).strip()
    df_te.append(row)
df_te = pd.DataFrame(df_te)


# we can use the drop parameter to avoid the old index being added as a column
df_all = pd.concat([df_tr, df_te]).reset_index(drop=True)
df_all.head()
df_all.info()

2019-04-06 08:38:56 For train.txt: 
2019-04-06 08:38:56 iter = 1
2019-04-06 08:38:56 iter = 10001
2019-04-06 08:38:56 iter = 20001
2019-04-06 08:38:56 iter = 30001
2019-04-06 08:38:57 For test.txt: 
2019-04-06 08:38:57 iter = 1
2019-04-06 08:38:57 iter = 10001
2019-04-06 08:38:57 iter = 20001


Unnamed: 0,id,raw_content
0,60,原公诉机关榆阳区人民检察院。上诉人（原审被告人）刘某，男，汉族，陕西省横山县，小学文化，货车...
1,93,公诉机关北京市昌平区人民检察院。被告人呼，男，27岁（1986年11月10日出生）。因涉嫌犯...
2,107,公诉机关平湖市人民检察院。被告人高某，无业。因本案，于2012年8月12日被平湖市公安局取保...
3,114,公诉机关平湖市人民检察院。被告人张某，无业。因本案，于2011年10月12日被本市公安局刑事...
4,126,公诉机关平湖市人民检察院。被告人闫明云，无业。因盗窃，于2001年4月被上海市劳动教养管理委...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 2 columns):
id             70000 non-null object
raw_content    70000 non-null object
dtypes: object(2)
memory usage: 1.1+ MB


In [8]:
amt_list = []
for i, row in df_all.iterrows():
    if i % 10000 == 1:
        util.log('iter = %d' % i)
    amt = re.findall(u'(\d*\.?\d+)元', row['raw_content'])
    amt_tt = re.findall(u'(\d*\.?\d+)万元', row['raw_content'])
    
    for a in amt:
        amt_list.append([row['id'], float(a)])
    for a in amt_tt:
        amt_list.append([row['id'], float(a)*10000])
        
amt_feat = pd.DataFrame(amt_list, columns=['id', 'amount'])
# np.ptp：range of value(max-min)
amt_feat = amt_feat.groupby('id')['amount'].agg([sum, min, max, np.ptp, np.mean, np.std]).reset_index().fillna(0)
# 查看一下数据内容
amt_feat.head()
amt_feat = pd.merge(df_all, amt_feat, how='left', on='id').drop(['id', 'raw_content'], axis=1)
amt_feat.columns = ['amt_'+i for i in amt_feat.columns]

amt_feat.info()
amt_feat.to_csv(param.data_path+'/output/feature/amt/amt_21w.csv', index=None, encoding='utf-8')


2019-04-06 09:24:08 iter = 1
2019-04-06 09:24:12 iter = 10001
2019-04-06 09:24:15 iter = 20001
2019-04-06 09:24:19 iter = 30001
2019-04-06 09:24:23 iter = 40001
2019-04-06 09:24:26 iter = 50001
2019-04-06 09:24:30 iter = 60001


Unnamed: 0,id,sum,min,max,ptp,mean,std
0,100007,3900.0,1900.0,2000.0,100.0,1950.0,70.710678
1,100012,130000.0,65000.0,65000.0,0.0,65000.0,0.0
2,100015,9000.0,4500.0,4500.0,0.0,4500.0,0.0
3,100033,17500.0,3500.0,10500.0,7000.0,5833.333333,4041.451884
4,100050,73800.0,73800.0,73800.0,0.0,73800.0,0.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 0 to 69999
Data columns (total 6 columns):
amt_sum     56427 non-null float64
amt_min     56427 non-null float64
amt_max     56427 non-null float64
amt_ptp     56427 non-null float64
amt_mean    56427 non-null float64
amt_std     56427 non-null float64
dtypes: float64(6)
memory usage: 3.7 MB


70000条数据里面，只有56427条记录涉及到了金额。是不是考虑去掉空的记录。或许空记录起到占位的作用，为了以后用index去比对和匹配。？？？

In [10]:
amt_feat.dropna().shape

(56427, 6)

In [11]:
# 把原数据再读入，看看那些空记录会不会附加进来。
dfin=pd.read_csv(param.data_path+'/output/feature/amt/amt_21w.csv', encoding='utf-8')
dfin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 6 columns):
amt_sum     56427 non-null float64
amt_min     56427 non-null float64
amt_max     56427 non-null float64
amt_ptp     56427 non-null float64
amt_mean    56427 non-null float64
amt_std     56427 non-null float64
dtypes: float64(6)
memory usage: 3.2 MB


# 基于Doc2Vec的特征

In [1]:
import numpy as np
import pandas as pd
from collections import namedtuple
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import subprocess
import codecs

import param
import util


In [2]:
%config ZMQInteractiveShell.ast_node_interactivity='all'

In [6]:
# 加载数据
df_all = pd.read_csv(param.data_path+'/output/corpus/all_data.csv', encoding='utf-8')
# df_all['penalty'] = df_all['penalty'] - 1
df_all.head()
df_all.info()

Unnamed: 0,content,id,laws,penalty
0,原 公诉 机关 榆阳区 人民检察院 上诉人 （ 原审 被告人 ） 刘某 男 汉族 陕西省 横...,60,2242526275272,7
1,公诉 机关 北京市 昌平区 人民检察院 被告人 呼 男 27 岁 （ 1986 年 11 月...,93,26467525364,1
2,公诉 机关 平湖市 人民检察院 被告人 高某 无业 本案 2012 年 月 12 日 平湖市...,107,26726425696772,4
3,公诉 机关 平湖市 人民检察院 被告人 张某 无业 本案 2011 年 10 月 12 日 ...,114,30325276772,4
4,公诉 机关 平湖市 人民检察院 被告人 闫明云 无业 盗窃 2001 年 月 上海市 劳动教...,126,2642565,7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 4 columns):
content    70000 non-null object
id         70000 non-null int64
laws       70000 non-null object
penalty    70000 non-null int64
dtypes: int64(2), object(2)
memory usage: 2.1+ MB


## 定义函数、类、变量

In [7]:
def run_cmd(cmd):
    print(cmd)
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for t, line in enumerate(iter(process.stdout.readline, 'b')):
        line = line.decode('utf8').rstrip()
        print(line)
    process.communicate()
    return process.returncode

SentimentDocument = namedtuple('SentimentDocument', 'words tags')

class Doc_list(object):
    def __init__(self, f):
        self.f = f
    
    def __iter__(self):
        for i, line in enumerate(codecs.open(self.f, encoding='utf8')):
            words = line.strip().split(' ')
            if i == 0:                          # 看一眼读入的行
                print('orgin words: ', words)
            tags = [int(words[0][2:])]
            words = words[1:]
            if i == 0:                          # 看一眼words，tags
                print('words: ', words)
                print('tags: ', tags)
            yield SentimentDocument(words, tags)

## 准备数据