In [1]:
import os
import shutil
import math
import pandas as pd
import sqlite3
import pymongo
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn import cross_validation, preprocessing, decomposition
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, recall_score, confusion_matrix, precision_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE



In [2]:
receipts = [
    'MED_CM_TBL_2016',
    'MED_CO_TBL_2016',
    'MED_GR_TBL_2016',
    'MED_HOKO_TBL_2016',
    'MED_IY_TBL_2016',
    'MED_SI_TBL_2016',
    'MED_SJ_TBL_2016',
    'MED_SY_TBL_2016',
    'MED_TO_TBL_2016',
    'PHA_CM_TBL_2016',
    'PHA_CO_TBL_2016',
    'PHA_HOKO_TBL_2016',
    'PHA_IY_TBL_2016',
    'PHA_TO_TBL_2016']

In [3]:
client = pymongo.MongoClient('localhost', 27017)
db = client.kikin

In [4]:
dbname = 'kikin.sqlite3'
con = sqlite3.connect(dbname)
c = con.cursor()
dfs = {}
for receipt in receipts:
    q = 'select * from {}'.format(receipt)
    df = pd.io.sql.read_sql(q,con)
    dfs[receipt] = df
c.close()

### 傷病、診療、医薬から説明変数を取り出す

In [10]:
df = dfs['MED_SY_TBL_2016']
sy = df['shobyo_code'].drop_duplicates()
n_sy = df['shobyo_code'].drop_duplicates().count()
n_sy

2930

In [11]:
df = dfs['MED_SI_TBL_2016']
si = df['s_tekiyo_code'].drop_duplicates()
n_si = df['s_tekiyo_code'].drop_duplicates().count()
n_si

1371

In [12]:
df = dfs['MED_IY_TBL_2016']
iy = df['s_tekiyo_code'].drop_duplicates()
n_iy = df['s_tekiyo_code'].drop_duplicates().count()
n_iy

2908

In [13]:
dd = pd.concat([sy,iy,si]).dropna()
dd.count()

7209

### 傷病コード、診療コード、医薬コードに重複なし

In [14]:
dd.drop_duplicates().count()

7209

In [15]:
dd.count()

7209

In [16]:
x_values = dd.values#.tolist()
x_size = x_values.size
x_dic = { v:k for (k,v) in enumerate(x_values)}

### 素性をつくる

In [17]:
xs = []
ys = []
doc = db.med.find()
for row in doc:
    x = np.zeros(x_size)
    
    # 該当する傷病があれば、その傷病に対応するindexの値を1, なければ0
    if 'MED_SY_TBL_2016' in row:
        for d in row['MED_SY_TBL_2016']:
            idx = x_dic[d['shobyo_code']]
            x[idx] = 1
            
    # 医薬品に対応するindexに点数を挿入
    if 'MED_IY_TBL_2016' in row:
        for d in row['MED_IY_TBL_2016']:
            s_code = d['s_tekiyo_code']
            k_code = d['k_tekiyo_code']
            if math.isnan(s_code) and math.isnan(k_code):
                continue
            
            code = s_code if not math.isnan(s_code) else k_code
            idx = x_dic[code]
            s = d['s_tensu']
            k = d['k_tensu']
            x[idx] =\
                s if not math.isnan(s) else\
                k if not math.isnan(k) else\
                0
    
    # 診療に対応するindexに点数を挿入
    if 'MED_SI_TBL_2016' in row:
        for d in row['MED_SI_TBL_2016']:
            s_code = d['s_tekiyo_code']
            k_code = d['k_tekiyo_code']
            if math.isnan(s_code) and math.isnan(k_code):
                continue
            
            code = s_code if not math.isnan(s_code) else k_code
            idx = x_dic[code]
            s = d['s_tensu']
            k = d['k_tensu']
            x[idx] =\
                s if not math.isnan(s) else\
                k if not math.isnan(k) else\
                0
    

    # 保険者レコードか公費レコード、どちらかに請求点数と決定点数に差があれば異常として1
    diff =  sum([d['diff_tensu'] for d in row['MED_HOKO_TBL_2016']])
    y = 1 if diff else 0
    
    xs.append(x)
    ys.append(y)
x_data = np.array(xs)
y_data = np.array(ys)

### PCAのち、ブースティング

In [18]:
sc= preprocessing.StandardScaler()
sc.fit(x_data)
X = sc.transform(x_data)
pca = decomposition.PCA(n_components=100)
X_transformed = pca.fit_transform(X)

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_data, test_size=0.3, random_state=666)

# resampling
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train)

In [19]:
# modeling & evaluation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

pipe_gb = Pipeline([('scl',StandardScaler()),
                    ('est',GradientBoostingClassifier(random_state=1))])


X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


Oversampling
f1 score Train: 0.9925265881
f1 score Test: 0.276923076923
recall score Train: 1.0
recall score Test: 0.409090909091

SMOTE
f1 score Train: 0.978028249394
f1 score Test: 0.194444444444
recall score Train: 0.992615117289
recall score Test: 0.477272727273



In [23]:
xx = pipe_gb.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test,xx).ravel()
r = recall_score(y_test,xx)
f = f1_score(y_test,xx)
print(r)
print(f)
confusion_matrix(y_test,xx),tn, fp, fn, tp

0.477272727273
0.194444444444


(array([[2805,  151],
        [  23,   21]]), 2805, 151, 23, 21)

In [53]:
tp/(tp+fp), precision_score(y_test,xx)

(0.10457516339869281, 0.10457516339869281)

In [49]:
xx = pipe_gb.predict(X_test)
confusion_matrix(y_test, xx)

array([[2819,  137],
       [  28,   16]])

In [26]:
import inspect

In [42]:
confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0])

array([[0, 2],
       [1, 1]])

In [29]:
print(inspect.getsource(confusion_matrix))

def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None):
    """Compute confusion matrix to evaluate the accuracy of a classification

    By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
    is equal to the number of observations known to be in group :math:`i` but
    predicted to be in group :math:`j`.

    Thus in binary classification, the count of true negatives is
    :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
    :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

    Read more in the :ref:`User Guide <confusion_matrix>`.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (correct) target values.

    y_pred : array, shape = [n_samples]
        Estimated targets as returned by a classifier.

    labels : array, shape = [n_classes], optional
        List of labels to index the matrix. This may be used to reorder
        or select a subset of labels.
        If none is g

In [21]:
precision_score(y_test, xx)

0.99120028159098905

In [22]:
recall_score(y_test, xx)

0.95263870094722602

In [52]:
# modeling & evaluation
pipe_gb = Pipeline([('scl',StandardScaler()),('est',KNeighborsClassifier(n_neighbors=3))])

X_train, y_train = X_train, y_train
pipe_gb.fit(X_train, y_train)
print('Original')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_under, y_train_under
pipe_gb.fit(X_train, y_train)
print('Undersampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


Original
f1 score Train: 0.99101599591
f1 score Test: 0.979996580612
recall score Train: 0.982334202143
recall score Test: 0.969553450609

Undersampling
f1 score Train: 0.885572139303
f1 score Test: 0.969075747047
recall score Train: 0.946808510638
recall score Test: 0.94350473613

Oversampling
f1 score Train: 0.997168372904
f1 score Test: 0.988665200474
recall score Train: 0.994352736751
recall score Test: 0.98849797023

SMOTE
f1 score Train: 0.99101599591
f1 score Test: 0.979996580612
recall score Train: 0.982334202143
recall score Test: 0.969553450609



In [53]:
# modeling & evaluation
pipe_gb = Pipeline([('scl',StandardScaler()),('est',KNeighborsClassifier(n_neighbors=20))])

X_train, y_train = X_train, y_train
pipe_gb.fit(X_train, y_train)
print('Original')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_under, y_train_under
pipe_gb.fit(X_train, y_train)
print('Undersampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


Original
f1 score Train: 0.964681233164
f1 score Test: 0.960866526904
recall score Train: 0.933391253982
recall score Test: 0.930311231394

Undersampling
f1 score Train: 0.738095238095
f1 score Test: 0.990518117169
recall score Train: 0.989361702128
recall score Test: 0.98951285521

Oversampling
f1 score Train: 0.980965028774
f1 score Test: 0.976264189886
recall score Train: 0.962641181581
recall score Test: 0.960081190798

SMOTE
f1 score Train: 0.964681233164
f1 score Test: 0.960866526904
recall score Train: 0.933391253982
recall score Test: 0.930311231394



### グリッドサーチする

In [None]:
from sklearn.model_selection import GridSearchCV

svc_param_grid = {
    'n_neighbors': [1,3,5,10,20,30]
}

grid_search = GridSearchCV(KNeighborsClassifier(), svc_param_grid, cv=10)


X_train, y_train = X_train_smote, y_train_smote
grid_search.fit(X_train, y_train)



### PCA200でk近傍法k=20

In [54]:
sc= preprocessing.StandardScaler()
sc.fit(x_data)
X = sc.transform(x_data)
pca = decomposition.PCA(n_components=200)
X_transformed = pca.fit_transform(X)

In [55]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_data, test_size=0.3, random_state=666)

# resampling
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train)

In [56]:
# modeling & evaluation
pipe_gb = Pipeline([('scl',StandardScaler()),('est',KNeighborsClassifier(n_neighbors=1))])

X_train, y_train = X_train, y_train
pipe_gb.fit(X_train, y_train)
print('Original')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_under, y_train_under
pipe_gb.fit(X_train, y_train)
print('Undersampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


Original
f1 score Train: 1.0
f1 score Test: 0.991582491582
recall score Train: 1.0
recall score Test: 0.996278755074

Undersampling
f1 score Train: 1.0
f1 score Test: 0.960770577933
recall score Train: 1.0
recall score Test: 0.927943166441

Oversampling
f1 score Train: 1.0
f1 score Test: 0.991755005889
recall score Train: 1.0
recall score Test: 0.996955345061

SMOTE
f1 score Train: 1.0
f1 score Test: 0.985600542097
recall score Train: 1.0
recall score Test: 0.984100135318



In [59]:
xx = pipe_gb.predict(X_test)
confusion_matrix(y_test, xx)

array([[   6,   38],
       [  47, 2909]])

In [60]:
xx

array([1, 1, 1, ..., 1, 1, 1])

### PCA1000

In [23]:
sc= preprocessing.StandardScaler()
sc.fit(x_data)
X = sc.transform(x_data)
pca = decomposition.PCA(n_components=1000)
X_transformed = pca.fit_transform(X)

In [24]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_data, test_size=0.3, random_state=666)

# resampling
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train)

In [26]:
# resampling
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train)

In [29]:
# modeling & evaluation
pipe_gb = Pipeline([('scl',StandardScaler()),
                    ('est',KNeighborsClassifier(n_neighbors=20))])

X_train, y_train = X_train, y_train
pipe_gb.fit(X_train, y_train)
print('Original')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_under, y_train_under
pipe_gb.fit(X_train, y_train)
print('Undersampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


Original


  'precision', 'predicted', average, warn_for)


f1 score Train: 0.0
f1 score Test: 0.0
recall score Train: 0.0
recall score Test: 0.0

Undersampling
f1 score Train: 0.0


  'precision', 'predicted', average, warn_for)


f1 score Test: 0.0
recall score Train: 0.0
recall score Test: 0.0

Oversampling
f1 score Train: 0.99452764977
f1 score Test: 0.112359550562
recall score Train: 1.0
recall score Test: 0.113636363636

SMOTE
f1 score Train: 0.986853386682
f1 score Test: 0.14598540146
recall score Train: 1.0
recall score Test: 0.227272727273



In [35]:
xxx = pipe_gb.predict(X_test)

In [36]:
xxx

array([0, 0, 0, ..., 1, 0, 0])

In [44]:
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
confusion_matrix(y_true, y_pred)

array([[2, 0, 0],
       [0, 0, 1],
       [1, 0, 2]])

In [46]:
y_true = [1, 0, 1, 1, 0, 1,1,1,1,1]
y_pred = [0, 0, 1, 1, 0, 1,1,1,1,1]
confusion_matrix(y_true, y_pred)

array([[2, 0],
       [1, 7]])

In [39]:
confusion_matrix(y_test, xxx)

array([[2873,   83],
       [  34,   10]])

In [38]:
confusion_matrix(xxx, y_test)

array([[2873,   34],
       [  83,   10]])

In [27]:
def recall(y_true, y_pred):
    from keras import backend as K
    # Calculates the recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    from keras import backend as K
    # Calculates the precision
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * p * r / (p+r)

In [None]:
y_train = np_utils.to_categorical(y_train, nb_class)
y_test = np_utils.to_categorical(y_test, nb_class)

In [None]:
import keras