In [1]:
import os
import shutil
import math
import pandas as pd
import sqlite3
import pymongo
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn import cross_validation, preprocessing, decomposition
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, recall_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE



In [2]:
receipts = [
    'MED_CM_TBL_2016',
    'MED_CO_TBL_2016',
    'MED_GR_TBL_2016',
    'MED_HOKO_TBL_2016',
    'MED_IY_TBL_2016',
    'MED_SI_TBL_2016',
    'MED_SJ_TBL_2016',
    'MED_SY_TBL_2016',
    'MED_TO_TBL_2016',
    'PHA_CM_TBL_2016',
    'PHA_CO_TBL_2016',
    'PHA_HOKO_TBL_2016',
    'PHA_IY_TBL_2016',
    'PHA_TO_TBL_2016']

In [3]:
client = pymongo.MongoClient('localhost', 27017)
db = client.kikin

In [4]:
dbname = 'kikin.sqlite3'
con = sqlite3.connect(dbname)
c = con.cursor()
dfs = {}
for receipt in receipts:
    q = 'select * from {}'.format(receipt)
    df = pd.io.sql.read_sql(q,con)
    dfs[receipt] = df
c.close()

### 傷病、診療、医薬から説明変数を取り出す

In [5]:
df = dfs['MED_SY_TBL_2016']
sy = df['shobyo_code'].drop_duplicates()
n_sy = df['shobyo_code'].drop_duplicates().count()
n_sy

2930

In [6]:
df = dfs['MED_SI_TBL_2016']
si = df['s_tekiyo_code'].drop_duplicates()
n_si = df['s_tekiyo_code'].drop_duplicates().count()
n_si

1371

In [7]:
df = dfs['MED_IY_TBL_2016']
iy = df['s_tekiyo_code'].drop_duplicates()
n_iy = df['s_tekiyo_code'].drop_duplicates().count()
n_iy

2908

In [8]:
dd = pd.concat([sy,iy,si])
dd.count()

7209

### 傷病コード、診療コード、医薬コードに重複なし

In [9]:
dd.drop_duplicates().count()

7209

In [10]:
dd.count()

7209

In [11]:
x_values = dd.values#.tolist()
x_size = x_values.size
x_dic = { v:k for (k,v) in enumerate(x_values)}
x_kv = { k:v for (k,v) in enumerate(x_values)}

### 素性をつくる

In [12]:
xs = []
ys = []
doc = db.med.find()
for iii, row in enumerate(doc):
    x = np.zeros(x_size)
    
    # 該当する傷病があれば、その傷病に対応するindexの値を1, なければ0
    if 'MED_SY_TBL_2016' in row:
        for d in row['MED_SY_TBL_2016']:
            idx = x_dic[d['shobyo_code']]
            x[idx] = 1
            
    # 医薬品に対応するindexに点数を挿入
    if 'MED_IY_TBL_2016' in row:
        for d in row['MED_IY_TBL_2016']:
            s_code = d['s_tekiyo_code']
            k_code = d['k_tekiyo_code']
            if math.isnan(s_code) and math.isnan(k_code):
                continue
            
            code = s_code if not math.isnan(s_code) else k_code
            idx = x_dic[code]
            s = d['s_tensu']
            k = d['k_tensu']

            # 点数が負のとき、math.nanのときは 0
            x[idx] =\
                s if s>=0 else\
                k if k>=0 else\
                0

    # 診療に対応するindexに点数を挿入
    if 'MED_SI_TBL_2016' in row:
        for d in row['MED_SI_TBL_2016']:
            s_code = d['s_tekiyo_code']
            k_code = d['k_tekiyo_code']
            if math.isnan(s_code) and math.isnan(k_code):
                continue
            
            code = s_code if not math.isnan(s_code) else k_code
            idx = x_dic[code]
            s = d['s_tensu']
            k = d['k_tensu']
            
            # 点数が負のとき、math.nanのときは 0
            x[idx] =\
                s if s>=0 else\
                k if k>=0 else\
                0
    

    # 保険者レコードか公費レコード、どちらかに請求点数と決定点数に差があれば異常として1
    diff =  sum([d['diff_tensu'] for d in row['MED_HOKO_TBL_2016']])
    y = 1 if diff else 0
    
    xs.append(x)
    ys.append(y)
x_data = np.array(xs)
y_data = np.array(ys)

In [13]:
x_data.shape

(10000, 7211)

In [14]:
y_data.sum()

138

### ランダムフォレスト

In [15]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=666)

# resampling
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train)

In [16]:
# modeling & evaluation
import sklearn
from sklearn.ensemble import RandomForestClassifier

select = sklearn.feature_selection.SelectKBest(k=100)
clf = sklearn.ensemble.RandomForestClassifier()

pipe_gb = Pipeline([('feature_selection', select),
        ('random_forest', clf)])


X_train, y_train = X_train_over, y_train_over
pipe_gb.fit(X_train, y_train)
print('Oversampling')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()

X_train, y_train = X_train_smote, y_train_smote
pipe_gb.fit(X_train, y_train)
print('SMOTE')
print('f1 score Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('f1 score Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print('recall score Train:', recall_score(y_train, pipe_gb.predict(X_train)))
print('recall score Test:', recall_score(y_test, pipe_gb.predict(X_test)))
print()


  406  409  459  482  483  498  500  504  506  508  509  520  526  542  551
  562  574  614  648  661  662  688  719  724  730  737  742  755  759  761
  763  764  797  799  802  803  804  806  824  828  830  831  840  877  891
  895  896  899  912  914  915  970  978 1005 1007 1009 1040 1047 1048 1102
 1108 1114 1154 1155 1167 1180 1192 1216 1218 1220 1245 1246 1264 1271 1281
 1283 1284 1297 1300 1301 1310 1311 1312 1313 1314 1320 1321 1323 1324 1332
 1342 1343 1344 1345 1346 1347 1348 1352 1367 1368 1371 1373 1391 1402 1406
 1409 1421 1425 1444 1445 1449 1451 1469 1474 1479 1481 1485 1488 1498 1506
 1518 1522 1528 1529 1531 1533 1536 1538 1579 1584 1610 1627 1657 1658 1669
 1670 1671 1672 1674 1677 1681 1682 1683 1684 1691 1701 1702 1704 1706 1707
 1709 1711 1716 1742 1743 1747 1774 1779 1785 1790 1792 1810 1811 1812 1813
 1815 1819 1820 1821 1823 1825 1829 1834 1837 1838 1846 1855 1856 1857 1858
 1864 1866 1867 1871 1879 1882 1885 1897 1899 1901 1903 1907 1908 1910 1924
 1928 1941 1

Oversampling
f1 score Train: 0.968593335299
f1 score Test: 0.126315789474
recall score Train: 0.951201853461
recall score Test: 0.136363636364



  406  409  459  482  483  498  500  504  506  508  509  520  526  542  551
  562  574  614  648  661  662  688  719  724  730  737  742  755  759  761
  763  764  797  799  802  803  804  806  824  828  830  831  840  877  891
  895  896  899  912  914  915  970  978 1005 1007 1009 1040 1047 1048 1102
 1108 1114 1154 1155 1167 1180 1192 1216 1218 1220 1245 1246 1264 1271 1281
 1283 1284 1297 1300 1301 1310 1311 1312 1313 1314 1320 1321 1323 1324 1332
 1342 1343 1344 1345 1346 1347 1348 1352 1367 1368 1371 1373 1391 1402 1406
 1409 1421 1425 1444 1445 1449 1451 1469 1474 1479 1481 1485 1488 1498 1506
 1518 1522 1528 1529 1531 1533 1536 1538 1579 1584 1610 1627 1657 1658 1669
 1670 1671 1672 1674 1677 1681 1682 1683 1684 1691 1701 1702 1704 1706 1707
 1709 1711 1716 1742 1743 1747 1774 1779 1785 1790 1792 1810 1811 1812 1813
 1815 1819 1820 1821 1823 1825 1829 1834 1837 1838 1846 1855 1856 1857 1858
 1864 1866 1867 1871 1879 1882 1885 1897 1899 1901 1903 1907 1908 1910 1924
 1928 1941 1

SMOTE
f1 score Train: 0.991971974894
f1 score Test: 0.0416666666667
recall score Train: 0.984071821604
recall score Test: 0.0227272727273



In [17]:
A = pipe_gb.predict(X_test)#*np.array([1,1])
tn, fp, fn, tp = confusion_matrix(y_test, A).ravel()
print('f1:',f1_score(y_test, A))
print('recall:',recall_score(y_test, A))
confusion_matrix(y_test, A),tn, fp, fn, tp

f1: 0.0416666666667
recall: 0.0227272727273


(array([[2953,    3],
        [  43,    1]]), 2953, 3, 43, 1)