# Project: ICD-AIS conversion using Deep Learning

This script creates sparse matrices for the input and output files

## Setup

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import random
import math
import torch
import sys

In [47]:
ais_train_file = "../Data/train_ais_agecat_I9_A05.csv"
icd_train_file ="../Data/train_icd_agecat_I9_A05.csv"
ais_val_file = "../Data/val_ais_agecat_I9_A05.csv"
icd_val_file ="../Data/val_icd_agecat_I9_A05.csv"
ais_test_file = "../Data/test_ais_agecat_I9_A05.csv"
icd_test_file ="../Data/test_icd_agecat_I9_A05.csv"
ais_codes_file = "../Data/AIS08_codes.csv"

## Load data

In [58]:
icd_train = pd.read_csv(icd_train_file, header=None, names=["icd_code"])
icd_val = pd.read_csv(icd_val_file, header=None, names=["icd_code"])
icd_test = pd.read_csv(icd_test_file, header=None, names=["icd_code"])
ais_train = pd.read_csv(ais_train_file, header=None, names=["icd_code"])
ais_val = pd.read_csv(ais_val_file, header=None, names=["icd_code"])
ais_test = pd.read_csv(ais_test_file, header=None, names=["icd_code"])

# load AIS codes
ais_map = pd.read_csv(ais_codes_file, header=0, encoding='iso-8859-1')

## Extract all unique patient ICD and AIS codes

In [27]:
def get_unique_codes(dfs):
    
    # empty list for codes
    unique_codes = []
    
    # loop through all dataframes
    for df in dfs:
        
        # loop through all rows
        for i in range(0,len(df)):
          
             # extract line and split into 
            line = df.iloc[i].str.split(" ").values[0]
            
            # loop through terms
            for j in line:
                
                # check if terms in already in dictionary
                if j not in unique_codes:
    
                    # add term to dictionary
                    unique_codes.append(j)

    # sort values
    unique_codes.sort()
    
    return unique_codes
            

In [39]:
%%time
# get unique icd codes from all sets
icd_codes = get_unique_codes([icd_train, icd_val, icd_test])

CPU times: user 6min 35s, sys: 4.85 s, total: 6min 40s
Wall time: 6min 37s


In [42]:
# create dictionary of ICD codes
icd_dict = dict(zip(icd_codes, list(range(len(icd_codes)))))

In [64]:
%%time
# get unique ais codes from all sets
ais_codes = get_unique_codes([ais_train, ais_val, ais_test])

CPU times: user 4min 31s, sys: 4.28 s, total: 4min 36s
Wall time: 4min 32s


In [77]:
# create dictionary of AIS codes
ais_dict = dict(zip(ais_codes, list(range(len(ais_codes)))))

## Create sparse matrix

In [80]:
def decode_df_coo(df, dic):
        
    # decoded array
    row = []
    col = []
    data = []
    
    # loop through all rows
    for i in range(0,len(df)):
        
        # extract line and split into 
        line = df.iloc[i].str.split(" ").values[0]
        
        # decoded line
        line_d = []
        
        # loop through terms
        for j in line:
                  
            # create new decoded line
            row.append(i)
            col.append(dic[j])
            data.append(1)
    
    # return dictionary and decoded array
    return dic, row, col, data

In [92]:
%%time
dic, row, col, data = decode_df_coo(icd_train, icd_dict)

CPU times: user 3min 40s, sys: 3.43 s, total: 3min 44s
Wall time: 3min 41s


In [93]:
%%time
dic, row, col, data = decode_df_coo(ais_train, ais_dict)

CPU times: user 3min 34s, sys: 3.22 s, total: 3min 37s
Wall time: 3min 34s


In [95]:
s = torch.sparse_coo_tensor([row,col], data)

In [96]:
len(icd_codes)

10362

In [97]:
s

tensor(indices=tensor([[      0,       0,       0,  ..., 1099358, 1099359,
                        1099360],
                       [     13,     750,    1237,  ...,      12,    1562,
                           1114]]),
       values=tensor([1, 1, 1,  ..., 1, 1, 1]),
       size=(1099361, 1986), nnz=3585129, layout=torch.sparse_coo)