In [1]:
import codecs
import numpy as np
from keras_bert import Tokenizer

# Maximum token length passed to tokenizer.encode by bert_vectorize_data.
SEQUENCE_LENGTH = 128

# Vocabulary file, read one token per line.
VOCABULARY_PATH = './vocab.txt'

# Map every vocabulary token to an integer id. The id is the current size of the
# dictionary when the line is read, so ids follow line order in the file.
token_dict = {}
with codecs.open(VOCABULARY_PATH, mode='r', encoding='utf8') as vocab_file:
    for vocab_line in vocab_file:
        token_dict[vocab_line.strip()] = len(token_dict)

# Tokenizer built on top of the token -> id mapping.
tokenizer = Tokenizer(token_dict)

def bert_vectorize_data(articles_text):
    """Encode texts into token ids and pair them with an all-zero companion array.

    Each text is passed to ``tokenizer.encode`` with ``max_len=SEQUENCE_LENGTH``;
    only the first element of the returned pair (the ids) is kept.

    Args:
        articles_text: iterable of texts to encode.

    Returns:
        ``np.ndarray`` built from ``[ids, zeros_like(ids)]``, i.e. the id matrix
        stacked with a zero matrix of the same shape.
        NOTE(review): the zero matrix is presumably the BERT segment input -- confirm.
    """
    data_X = [
        tokenizer.encode(text, max_len=SEQUENCE_LENGTH)[0]
        for text in articles_text
    ]

    # The original built this array but never returned it, so callers got None.
    return np.array([data_X, np.zeros_like(data_X)])


Using TensorFlow backend.


In [7]:
# Maximum token length, passed as max_len to tokenizer.encode when encoding the staging articles.
SEQ_LEN = 1024

In [8]:
import pickle
import numpy as np
import os

# Reset on every run so re-executing the cell does not append duplicates.
staging_articles = []
staging_x = []

article_list_files = sorted(os.listdir('../../../data/article_list'))

# Fail with an explicit message instead of the opaque TypeError that slicing
# None produced when the directory was empty.
if len(article_list_files) == 0:
    raise FileNotFoundError("no article list files found in '../../../data/article_list'")

# After sorting, the last entry is the lexicographically greatest file name.
latest_article_list_filename = article_list_files[-1]

# Drop the last four characters of the file name
# (presumably a '.pkl' extension -- TODO confirm).
last_updated = latest_article_list_filename[:-4]

# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('../../../data/article_list/{}'.format(latest_article_list_filename), 'rb') as article_list_file:
    article_list = pickle.load(article_list_file)

for article_id in article_list:
    # `with` closes each article file promptly; the original leaked one handle
    # per article.
    with open('../../../data/articles/{}.pkl'.format(article_id), 'rb') as article_file:
        text = pickle.load(article_file)['text']
    # Only the ids are kept; the second value returned by encode is unused here.
    ids, segments = tokenizer.encode(text, max_len=SEQ_LEN)
    staging_x.append(ids)


In [9]:
# Number of encoded staging articles (one id list is appended per article id).
len(staging_x)

23772

In [10]:
# Length of the first encoded article's id list.
len(staging_x[0])

1024

In [11]:
# Token ids of the first staged article.
# NOTE(review): this displays the entire id list; consider slicing (e.g. [:20]) to keep the output short.
staging_x[0]

[101,
 898,
 4212,
 1922,
 7382,
 4496,
 4495,
 4229,
 5543,
 4638,
 100,
 3417,
 5471,
 6365,
 100,
 1353,
 2746,
 1333,
 4415,
 4777,
 4634,
 4638,
 4634,
 7442,
 6172,
 5390,
 100,
 782,
 6863,
 1922,
 7382,
 100,
 8024,
 679,
 852,
 679,
 3298,
 3300,
 3417,
 2450,
 3160,
 4496,
 4495,
 8024,
 5445,
 684,
 7444,
 6206,
 1914,
 2208,
 7442,
 5543,
 2218,
 5543,
 4496,
 4495,
 1914,
 2208,
 7442,
 5543,
 511,
 2792,
 809,
 6206,
 2682,
 1343,
 3417,
 7442,
 5445,
 679,
 3174,
 7442,
 8024,
 2218,
 2553,
 7519,
 6656,
 677,
 686,
 4518,
 5543,
 3975,
 4638,
 3173,
 6638,
 1248,
 4500,
 100,
 6631,
 2206,
 4634,
 7442,
 100,
 511,
 3075,
 3300,
 7770,
 4906,
 2825,
 4638,
 1044,
 6868,
 1751,
 2157,
 4412,
 791,
 6963,
 679,
 3140,
 6738,
 6241,
 2450,
 3417,
 8024,
 5445,
 684,
 5160,
 5160,
 6206,
 6518,
 2549,
 3176,
 6818,
 2407,
 2399,
 5173,
 3632,
 4125,
 1213,
 4634,
 7442,
 2449,
 4638,
 6243,
 1205,
 8024,
 5445,
 1378,
 4124,
 679,
 1072,
 6631,
 2206,
 4634,
 7442,
 722,
 5

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
import json
import pandas as pd

# Directories that hold the labelled JSON articles.
repos = ['raw_data']

# For each repository directory, collect the names of its JSON files.
repo_files = []
for repo in repos:
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of df_data reproducible across runs and machines.
    files = sorted(file for file in os.listdir(repo) if 'json' in file)

    repo_files.append(files)

    print('There are '+ str(len(files)) +' files in '+repo )

# Load tag and text information from the files of the first repository.
repo = repos[0]
files = repo_files[0]

# Number of distinct tags; the column names are derived from it so the two
# cannot drift apart.
tag_type = 17
define_columns = ['id', 'text'] + ['tag_{}'.format(i) for i in range(tag_type)]

data_list = []
for file in files:

    with open(os.path.join(repo, file), 'r', encoding='utf8') as f:
        data = json.load(f)

    # Multi-hot encoding: position `tag` is set to 1 for every tag listed in the file.
    tags_list = [0]*tag_type

    for tag in data['tags']:
        tags_list[tag] = 1

    # TBD: using file name or original id for modeling id
    #data_list.append([data['id'], data['text']] + tags_list)
    data_list.append([file, data['text']] + tags_list)

# One row per file: id (file name), text, then one 0/1 column per tag.
df_data = pd.DataFrame(data_list, columns=define_columns)

There are 14908 files in raw_data


In [79]:
import jieba
# Point jieba at a dictionary file in the working directory
# (presumably the traditional-Chinese "big" dictionary -- confirm).
jieba.set_dictionary('./dict.txt.big.txt')

In [129]:
# NOTE(review): rebinds SEQ_LEN (an earlier cell sets it to 1024); load_data below does not reference it.
SEQ_LEN = 128

def load_data(df_dataset, train_fraction=0.8, seed=None):
    """Turn a tagged-article frame into shuffled TF-IDF train/test sets.

    Every text is segmented with ``jieba.cut(..., cut_all=True)`` and re-joined
    with spaces so ``TfidfVectorizer`` can split it into terms. The label of a
    row is the position of the first maximal value among the columns from
    index 2 onwards, so a row with several tags set collapses to the
    lowest-numbered one and a row with no tag set gets label 0.

    Args:
        df_dataset: DataFrame with a ``'text'`` column and the tag columns
            starting at column position 2.
        train_fraction: share of the shuffled rows used for training
            (default 0.8, the split that was previously hard-coded).
        seed: optional seed for a reproducible shuffle. ``None`` (default)
            shuffles with NumPy's global random state, as before.

    Returns:
        ``(train_x, train_y, test_x, test_y)``: the TF-IDF matrices for the
        train and test texts and the matching label tuples. The vectorizer is
        fitted on the training texts only.

    Raises:
        ValueError: if the split would leave the train or the test part empty.
    """
    # Imported here so the function also works on a fresh kernel; the notebook
    # only imports TfidfVectorizer in a later cell.
    from sklearn.feature_extraction.text import TfidfVectorizer

    tokenized_text = [
        ' '.join(jieba.cut(text, cut_all=True))
        for text in df_dataset['text']
    ]

    # list.index(max(...)) returns the first position holding the maximum.
    labels = [
        tag_row.index(max(tag_row))
        for tag_row in df_dataset.iloc[:, 2:].values.tolist()
    ]

    items = list(zip(tokenized_text, labels))

    if seed is None:
        np.random.shuffle(items)
    else:
        np.random.RandomState(seed).shuffle(items)

    split = int(train_fraction * len(items))
    if split == 0 or split == len(items):
        raise ValueError(
            'cannot split {} rows with train_fraction={}: '
            'train or test part would be empty'.format(len(items), train_fraction)
        )

    text_train, labels_train = zip(*items[:split])
    text_test, labels_test = zip(*items[split:])

    vectorizer = TfidfVectorizer()
    # Fit the vocabulary and IDF weights on the training texts only, then reuse
    # them for the test texts.
    train_matrix = vectorizer.fit_transform(text_train)
    test_matrix = vectorizer.transform(text_test)

    return train_matrix, labels_train, test_matrix, labels_test
    
# Build the TF-IDF train/test features and single-label targets from df_data.
train_x, train_y, test_x, test_y = load_data(df_data)
        
    



#     indices_test = np.array(indices_test)
#     indices_train = np.array(indices_train)

#     
  
# train_x, train_y, test_x, test_y = load_data(df_data)

In [131]:
# Inspect the test feature matrix returned by load_data.
test_x

<2982x89235 sparse matrix of type '<class 'numpy.float64'>'
	with 148116 stored elements in Compressed Sparse Row format>

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Two space-separated sample sentences used to try out TfidfVectorizer in the following cells.
meow = ['全由 觀光 觀光局 支付     相當 可疑', '自 中國 進口 口水 水壺 水壺蓋 壺蓋     太和 工房 被 爆 驗 出 殘渣 值 超標 57 倍']

In [107]:
# Vectorizer with default settings for the toy example.
vectorizer = TfidfVectorizer()

In [108]:
# Fit the vectorizer on the sample sentences and transform them in one step.
X = vectorizer.fit_transform(meow)

In [110]:
# Print the stored entries of the sparse result as (row, column) -> weight.
print(X)

  (0, 4)	0.408248290463863
  (0, 12)	0.408248290463863
  (0, 8)	0.408248290463863
  (0, 14)	0.408248290463863
  (0, 13)	0.408248290463863
  (0, 2)	0.408248290463863
  (1, 0)	0.30151134457776363
  (1, 15)	0.30151134457776363
  (1, 9)	0.30151134457776363
  (1, 7)	0.30151134457776363
  (1, 6)	0.30151134457776363
  (1, 5)	0.30151134457776363
  (1, 11)	0.30151134457776363
  (1, 10)	0.30151134457776363
  (1, 3)	0.30151134457776363
  (1, 16)	0.30151134457776363
  (1, 1)	0.30151134457776363


In [59]:
# NOTE(review): execution count 59 predates the cell that currently defines train_x; the saved
# output (a dense id array) looks like it comes from an earlier version -- re-run to refresh.
train_x

array([[ 101, 3336, 7448, ...,  791, 2399,  102],
       [ 101, 6857,  679, ...,    0,    0,    0],
       [ 101, 3297, 6818, ..., 3596, 3389,  102],
       ...,
       [ 101, 7350, 2209, ...,    0,    0,    0],
       [ 101, 1060, 4934, ...,    0,    0,    0],
       [ 101,  521,  679, ..., 3189, 2405,  102]])

In [60]:
# load_data returns the labels as a tuple, which has no .shape attribute;
# np.shape works for tuples, lists and arrays alike.
np.shape(train_y)

(11926,)

In [119]:
# NOTE(review): the saved output shows 14908 rows, i.e. as many as there are raw files; it appears
# to predate the current train/test split -- re-run to refresh.
train_x

<14908x99547 sparse matrix of type '<class 'numpy.float64'>'
	with 816198 stored elements in Compressed Sparse Row format>

In [144]:
# Baseline random forest: shallow trees with large leaves.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    min_samples_leaf=30,
    max_features=0.1,
)
clf.fit(train_x, train_y)

# Mean accuracy on the training split first, then on the test split.
for features, targets in ((train_x, train_y), (test_x, test_y)):
    print(clf.score(features, targets))

0.6132819050813348
0.5938967136150235


In [104]:
# Segment a sample sentence in full mode and join the pieces with spaces.
' '.join(jieba.cut('全由觀光局支付 相當可疑', cut_all=True))

'全由 觀光 觀光局 支付     相當 可疑'

In [154]:
# Candidate values for each random-forest hyperparameter.
ESTIMATOR_N = [500]
MAX_DEPTH = [80]
MIN_SAMPLES_LEAF = [3]
MAX_FEATURES = [0.1]

best_score = 0
best_params = (0,0,0,0)

# Every combination of the candidates, in the same order nested loops would
# visit them (ESTIMATOR_N outermost, MAX_FEATURES innermost).
param_grid = [
    (n_trees, depth, leaf_size, feature_share)
    for n_trees in ESTIMATOR_N
    for depth in MAX_DEPTH
    for leaf_size in MIN_SAMPLES_LEAF
    for feature_share in MAX_FEATURES
]

# NOTE(review): the best combination is chosen by its score on the test split.
for E, md, msl, mf in param_grid:
    print('n_estimators =', E, 'max_depth =', md, 'min_samples_leaf =', msl, 'max_features =', mf)
    clf = RandomForestClassifier(
        n_estimators=E,
        max_depth=md,
        min_samples_leaf=msl,
        max_features=mf,
        verbose=True,
        n_jobs=4,
    )
    clf.fit(train_x, train_y)
    print(clf.score(train_x, train_y))
    test_score = clf.score(test_x, test_y)
    print(test_score)
    # Only a strictly better test score replaces the current best.
    if not (test_score > best_score):
        continue
    best_score = test_score
    best_params = (E, md, msl, mf)
    print('new best!', test_score)

n_estimators = 500 max_depth = 80 min_samples_leaf = 3 max_features = 0.1


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   33.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


0.8117558276035552


[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished


0.6505700871898055
new best! 0.6505700871898055


In [None]:
# Show the individual full-mode segments of the sample sentence as a list.
list(jieba.cut('全由觀光局支付 相當可疑', cut_all=True))

In [155]:
# Refit with the best combination recorded by the grid search. The original
# used the loop variables E/md/msl/mf left over from that cell, which hold the
# last combination tried, not necessarily the best one.
best_n_estimators, best_max_depth, best_min_samples_leaf, best_max_features = best_params

clf = RandomForestClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_leaf=best_min_samples_leaf,
    max_features=best_max_features,
    verbose=True,
    n_jobs=4,
)
clf.fit(train_x, train_y)

# Predictions on both splits, used by the classification reports.
train_predicts = clf.predict(train_x)
test_predicts = clf.predict(test_x)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   33.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished


In [156]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import classification_report

# Per-class precision/recall/F1: training split first, then the test split.
for true_labels, predicted_labels in ((train_y, train_predicts), (test_y, test_predicts)):
    print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.86      0.73      0.79       438
           1       0.94      0.65      0.77       114
           2       1.00      0.43      0.60        49
           3       0.88      0.87      0.88      2739
           4       1.00      0.10      0.18        10
           5       1.00      0.17      0.29        41
           6       0.98      0.67      0.79        78
           7       1.00      0.12      0.22        74
           8       0.94      0.63      0.76       771
           9       0.97      0.42      0.59       285
          10       0.88      0.90      0.89       674
          11       0.63      0.89      0.74      1403
          12       0.88      0.26      0.40       522
          13       0.92      0.62      0.74       336
          14       0.80      0.95      0.86      3322
          15       0.81      0.82      0.82       969
          16       0.94      0.61      0.74       101

    accuracy              

  'precision', 'predicted', average, warn_for)


In [105]:
# Segment the text of the second row of df_data in full mode and join the pieces with spaces.
' '.join(jieba.cut(df_data.iloc[1]['text'], cut_all=True))

'自 中國 進口 口水 水壺 水壺蓋 壺蓋     太和 工房 被 爆 驗 出 殘渣 值 超標 57 倍'

In [45]:
# Mean accuracy of clf on the training split.
# NOTE(review): execution count 45 predates the latest fit of clf; the saved output is likely stale.
clf.score(train_x, train_y)

0.8903236625859466

In [46]:
# Mean accuracy of clf on the test split.
# NOTE(review): execution count 46 predates the latest fit of clf; the saved output is likely stale.
clf.score(test_x, test_y)

0.3936955063715627

In [49]:
# Predicted class labels for the training split.
clf.predict(train_x)

array([14,  8,  3, ..., 11, 14,  3])