In [1]:
import os
import shutil
import math
import pandas as pd
import sqlite3
import pymongo
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn import cross_validation, preprocessing, decomposition
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, fbeta_score, recall_score, precision_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE



In [2]:
receipts = [
    'MED_CM_TBL_2016',
    'MED_CO_TBL_2016',
    'MED_GR_TBL_2016',
    'MED_HOKO_TBL_2016',
    'MED_IY_TBL_2016',
    'MED_SI_TBL_2016',
    'MED_SJ_TBL_2016',
    'MED_SY_TBL_2016',
    'MED_TO_TBL_2016',
    'PHA_CM_TBL_2016',
    'PHA_CO_TBL_2016',
    'PHA_HOKO_TBL_2016',
    'PHA_IY_TBL_2016',
    'PHA_TO_TBL_2016']

In [3]:
client = pymongo.MongoClient('localhost', 27017)
db = client.kikin

In [4]:
dbname = 'kikin.sqlite3'
con = sqlite3.connect(dbname)
c = con.cursor()
dfs = {}
for receipt in receipts:
    q = 'select * from {}'.format(receipt)
    df = pd.io.sql.read_sql(q,con)
    dfs[receipt] = df
c.close()

### 傷病、診療、医薬から説明変数を取り出す

In [5]:
df = dfs['MED_SY_TBL_2016']
sy = df['shobyo_code'].drop_duplicates()
n_sy = df['shobyo_code'].drop_duplicates().count()
n_sy

2930

In [6]:
df = dfs['MED_SI_TBL_2016']
si = df['s_tekiyo_code'].drop_duplicates()
n_si = df['s_tekiyo_code'].drop_duplicates().count()
n_si

1371

In [7]:
df = dfs['MED_IY_TBL_2016']
iy = df['s_tekiyo_code'].drop_duplicates()
n_iy = df['s_tekiyo_code'].drop_duplicates().count()
n_iy

2908

In [8]:
dd = pd.concat([sy,iy,si]).dropna()
dd.count()

7209

### 傷病コード、診療コード、医薬コードに重複なし

In [9]:
dd.drop_duplicates().count()

7209

In [10]:
dd.count()

7209

In [9]:
x_values = dd.values#.tolist()
x_size = x_values.size
x_dic = { v:k for (k,v) in enumerate(x_values)}

### 素性をつくる

In [10]:
xs = []
ys = []
doc = db.med.find()
for row in doc:
    x = np.zeros(x_size)
    
    # 該当する傷病があれば、その傷病に対応するindexの値を1, なければ0
    if 'MED_SY_TBL_2016' in row:
        for d in row['MED_SY_TBL_2016']:
            idx = x_dic[d['shobyo_code']]
            x[idx] = 1
            
    # 医薬品に対応するindexに点数を挿入
    if 'MED_IY_TBL_2016' in row:
        for d in row['MED_IY_TBL_2016']:
            s_code = d['s_tekiyo_code']
            k_code = d['k_tekiyo_code']
            if math.isnan(s_code) and math.isnan(k_code):
                continue
            
            code = s_code if not math.isnan(s_code) else k_code
            idx = x_dic[code]
            s = d['s_tensu']
            k = d['k_tensu']
            x[idx] =\
                s if not math.isnan(s) else\
                k if not math.isnan(k) else\
                0
    
    # 診療に対応するindexに点数を挿入
    if 'MED_SI_TBL_2016' in row:
        for d in row['MED_SI_TBL_2016']:
            s_code = d['s_tekiyo_code']
            k_code = d['k_tekiyo_code']
            if math.isnan(s_code) and math.isnan(k_code):
                continue
            
            code = s_code if not math.isnan(s_code) else k_code
            idx = x_dic[code]
            s = d['s_tensu']
            k = d['k_tensu']
            x[idx] =\
                s if not math.isnan(s) else\
                k if not math.isnan(k) else\
                0
    

    # 保険者レコードか公費レコード、どちらかに請求点数と決定点数に差があれば異常とし1
    diff =  sum([d['diff_tensu'] for d in row['MED_HOKO_TBL_2016']])
    y = 1 if diff else 0
    
    xs.append(x)
    ys.append(y)
x_data = np.array(xs)
y_data = np.array(ys)

### PCAのち、近傍法

In [11]:
sc= preprocessing.StandardScaler()
sc.fit(x_data)
X = sc.transform(x_data)
pca = decomposition.PCA(n_components=100)
X_transformed = pca.fit_transform(X)

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_data, test_size=0.3, random_state=666)

# resampling
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train)

In [15]:
# modeling & evaluation
from sklearn import datasets
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

pipe_gb = Pipeline([('scl',StandardScaler()),
                    ('est',BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5))])

X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


Oversampling
f1 score Train: 0.991742658146
f1 score Test: 0.362068965517
recall score Train: 1.0
recall score Test: 0.477272727273

SMOTE
f1 score Train: 0.97962371317
f1 score Test: 0.254901960784
recall score Train: 0.998986388648
recall score Test: 0.590909090909



In [16]:
A = pipe_gb.predict(X_test)#*np.array([1,1])
tn, fp, fn, tp = confusion_matrix(y_test, A).ravel()
print('f1:',f1_score(y_test, A))
print('recall:',recall_score(y_test, A))
confusion_matrix(y_test, A),tn, fp, fn, tp

f1: 0.254901960784
recall: 0.590909090909


(array([[2822,  134],
        [  18,   26]]), 2822, 134, 18, 26)