# Project: ICD-AIS conversion using Deep Learning

This script examines the TF-IDF of the different terms in the data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import icd9cms
import icd10
import math

## Load data

In [2]:
# Relative path of the CSV file that is loaded in the next step.
icd_obs_file = "../Data/test_icd_pre_I9_A05.csv"

In [3]:
# The file has no header row: read every line into a single column named
# "ICD9CODE".
icd = pd.read_csv(
    icd_obs_file,
    names=["ICD9CODE"],
    header=None,
)

## Calc TF

In [4]:
# Split each record on single spaces into one term per column, then pad the
# shorter rows with the placeholder string 'NA' so every cell holds a string.
icd_split = icd["ICD9CODE"].str.split(" ", expand=True)
icd_split = icd_split.fillna("NA")

In [5]:
# Term frequency: count every occurrence of each distinct value over all
# cells of the split table. The 'NA' padding value is counted like any term.
idx, count = np.unique(icd_split.to_numpy(), return_counts=True)
tf = pd.Series(data=count, index=idx)

In [6]:
# Display the term-frequency Series (index: term, value: occurrence count).
tf

A18       2222
A19       2480
A20       2474
A21       2848
A22       2801
          ... 
P99.84       2
P99.88       1
P99.91       1
P99.95       1
P99.99    1062
Length: 6439, dtype: int64

## Calc DF

In [7]:
# Flag repeat occurrences of a value within the same row (the first
# occurrence stays False). The astype(bool) factor keeps falsy cells, such as
# empty strings, from being flagged.
mask = icd_split.apply(lambda row: row.duplicated(), axis=1) & icd_split.astype(bool)

In [8]:
# Keep only the first occurrence of each term per row; flagged repeats are
# overwritten with the placeholder "NA".
icd_unique = icd_split.where(~mask, other="NA")

In [9]:
# Document frequency: with per-row repeats replaced by "NA", each remaining
# occurrence of a term corresponds to one distinct row containing it.
# NOTE(review): the "NA" count here also includes the overwritten repeats.
idx_u, count_u = np.unique(icd_unique.to_numpy(), return_counts=True)
df = pd.Series(data=count_u, index=idx_u)

## Merge data

In [10]:
# Start the result table from the term frequencies, as a single column 'tf'
# indexed by term.
tfidf = tf.to_frame(name="tf")

In [11]:
# Attach the document frequencies as column 'df', aligned on the term index.
# The left join keeps every term already present in the table.
tfidf = tfidf.join(df.to_frame(name="df"), how="left")

## Calc TF-IDF

In [12]:
# The 'idf' column is the plain reciprocal of the document frequency.
# NOTE(review): this is not the log-scaled log(N / df) form often used for
# IDF - confirm that the reciprocal is intended.
tfidf["idf"] = 1 / tfidf["df"]

In [13]:
# Combine both factors; with idf = 1 / df this equals tf / df for each term.
tfidf["tf_idf"] = tfidf["tf"] * tfidf["idf"]

In [14]:
# Show the terms ordered from the highest to the lowest tf_idf score.
tfidf.sort_values(by="tf_idf", ascending=False)

Unnamed: 0,tf,df,idf,tf_idf
P84.44,3,1,1.000000e+00,3.000000
P89.16,3,1,1.000000e+00,3.000000
P97.16,221,89,1.123596e-02,2.483146
P96.53,12,5,2.000000e-01,2.400000
P23.49,7,3,3.333333e-01,2.333333
...,...,...,...,...
E885.0,49,49,2.040816e-02,1.000000
D921.9,237,237,4.219409e-03,1.000000
D920,10962,10962,9.122423e-05,1.000000
D881.01,1381,1381,7.241130e-04,1.000000


## Categorize terms

In [15]:
# Copy the index (the term strings) into a regular column so that string
# operations can be applied to it.
tfidf["term"] = tfidf.index

In [16]:
# The category of a term is its first character.
tfidf["cat"] = tfidf["term"].str.get(0)

In [17]:
# Display the table including the added 'term' and 'cat' columns.
tfidf

Unnamed: 0,tf,df,idf,tf_idf,term,cat
A18,2222,2222,0.000450,1.000000,A18,A
A19,2480,2480,0.000403,1.000000,A19,A
A20,2474,2474,0.000404,1.000000,A20,A
A21,2848,2848,0.000351,1.000000,A21,A
A22,2801,2801,0.000357,1.000000,A22,A
...,...,...,...,...,...,...
P99.84,2,2,0.500000,1.000000,P99.84,P
P99.88,1,1,1.000000,1.000000,P99.88,P
P99.91,1,1,1.000000,1.000000,P99.91,P
P99.95,1,1,1.000000,1.000000,P99.95,P


## Get stats on categories

In [18]:
# Per-category mean of every numeric column. The first positional parameter
# of GroupBy.mean is `numeric_only`, so the former mean('tf_idf') did not
# select the tf_idf column; the string only acted as a truthy flag. Spell the
# flag out instead. For the tf_idf mean alone, use
# tfidf.groupby('cat')['tf_idf'].mean().
tfidf.groupby('cat').mean(numeric_only=True)

Unnamed: 0_level_0,tf,df,idf,tf_idf
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,1699.639,1699.639,0.000619553,1.0
D,124.3091,124.3091,0.3572017,1.0
E,220.8917,220.8917,0.313695,1.0
N,14224100.0,14269990.0,7.007714e-08,0.996784
P,278.8416,258.5887,0.3348286,1.035491


In [19]:
# Per-category mean of the numeric columns. numeric_only=True is required
# because tfidf also holds the string column 'term'; pandas >= 2.0 raises a
# TypeError on it instead of silently dropping it.
tfidf.groupby('cat').mean(numeric_only=True)

Unnamed: 0_level_0,tf,df,idf,tf_idf
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,1699.639,1699.639,0.000619553,1.0
D,124.3091,124.3091,0.3572017,1.0
E,220.8917,220.8917,0.313695,1.0
N,14224100.0,14269990.0,7.007714e-08,0.996784
P,278.8416,258.5887,0.3348286,1.035491
