In [3]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns


# Read the NDJSON file and, in the same chain, turn empty strings into nulls.
df = pl.read_ndjson(
    '/home/gnoblit/takehome/codametrix/data/clean/train_data.ndjson'
).with_columns(
    # The '' -> null mapping is applied to every column.
    pl.all().replace({'': None})
)

In [5]:
def join_dfs(df, cols, join_term):
    """
    Build de-duplicated pairs of rows that share the same `join_term` value.

    Self-joins `df[cols]` on `join_term`, keeps only pairs whose descriptions
    differ, and collapses mirrored pairs (A, B) / (B, A) into a single row.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing at least the columns listed in `cols`.
    cols : list[str]
        Columns carried into the join; must include 'code', 'description'
        and `join_term`.
    join_term : str
        Column to self-join on. It is dropped from the result.

    Returns
    -------
    pl.DataFrame
        The non-key columns, their `_right` counterparts, and `codes` (the
        sorted two-element list of the paired codes). Rows are ordered by
        ('code', 'code_right') and, for every pair, the orientation that sorts
        first is the one that is kept, so the output is deterministic.
    """
    subset = df.select(cols)

    pairs = (
        subset.join(subset, how='left', on=join_term)
        # Drops self-pairs (identical description on both sides). When the
        # right side is null the comparison is null and the row is dropped too.
        .filter(~(pl.col('description').eq(pl.col('description_right'))))
        .drop(join_term)
        # Order-independent key: the same list for (A, B) and (B, A).
        .with_columns(
            codes=pl.concat_list('code', 'code_right').list.sort()
        )
    )

    # Sort first, then keep the first row per key while preserving order.
    # The default keep='any' gives no guarantee about which of the two
    # mirrored rows survives, which made the result vary between runs.
    return (
        pairs
        .sort(['code', 'code_right'])
        .unique('codes', keep='first', maintain_order=True)
    )

# Pair up codes that share the same 'up_to_laterality' value.
laterality_cols = ['code', 'up_to_laterality', 'description']
laterality_df = join_dfs(
    df=df,
    cols=laterality_cols,
    join_term='up_to_laterality',
)


In [23]:
# Preview the first rows of the laterality pairs.
laterality_df.head()

code,description,code_right,description_right,codes
str,str,str,str,list[str]
"""C44102""","""Unspecified malignant neoplasm…","""C441021""","""Unspecified malignant neoplasm…","[""C44102"", ""C441021""]"
"""C44102""","""Unspecified malignant neoplasm…","""C441022""","""Unspecified malignant neoplasm…","[""C44102"", ""C441022""]"
"""C441021""","""Unspecified malignant neoplasm…","""C441022""","""Unspecified malignant neoplasm…","[""C441021"", ""C441022""]"
"""C44109""","""Unspecified malignant neoplasm…","""C441091""","""Unspecified malignant neoplasm…","[""C44109"", ""C441091""]"
"""C44109""","""Unspecified malignant neoplasm…","""C441092""","""Unspecified malignant neoplasm…","[""C44109"", ""C441092""]"


In [45]:
# Look at the first value of the 'code' column.
df['code'][0]

'A00'

In [76]:
def gen_negatives(df, master_df, n: int, seed=None):
    """
    Draw `n` random negative partners for every row of `df`.

    A negative for an anchor row is any row of `master_df` whose 'section'
    differs from the first character of the anchor's 'code'.

    Parameters
    ----------
    df : pl.DataFrame
        Anchor rows; must provide 'code' and 'description'.
    master_df : pl.DataFrame
        Pool to sample from; must provide 'code', 'section' and 'description'.
    n : int
        Number of negatives drawn per anchor row. Sampling is without
        replacement, so a pool with fewer than `n` rows makes polars raise.
    seed : int, optional
        When given, anchor row i is sampled with seed `seed + i`, which makes
        the output reproducible while still varying between anchors. The
        default (None) keeps the unseeded behaviour.

    Returns
    -------
    list[pl.DataFrame]
        One frame per anchor row with the columns code_right,
        description_right, positive (always False), code and description.
    """
    # The candidate pool depends only on the anchor's section letter, so each
    # pool is built once instead of re-filtering master_df for every row.
    pools = {}
    negatives = []

    for idx, anchor in enumerate(df.iter_rows(named=True)):
        section = anchor['code'][:1]
        if section not in pools:
            pools[section] = master_df.filter(
                pl.col('section') != section
            ).select(
                pl.col('code').alias('code_right'),
                pl.col('description').alias('description_right'),
            )

        row_seed = None if seed is None else seed + idx
        sampled = pools[section].sample(n, seed=row_seed)

        # Attach the anchor's own code/description and the negative label.
        negatives.append(
            sampled.with_columns(
                positive=pl.lit(False),
                code=pl.lit(anchor['code']),
                description=pl.lit(anchor['description']),
            )
        )
    return negatives

In [77]:
# Smoke-test the sampler on the first three pairs, two negatives each.
t = gen_negatives(
    df=laterality_df.head(3),
    master_df=df,
    n=2,
)

In [78]:
# Display the list of negative-sample frames returned above.
t

[shape: (2, 5)
 ┌────────────┬─────────────────────────────┬──────────┬────────┬───────────────────────────────────┐
 │ code_right ┆ description_right           ┆ positive ┆ code   ┆ description                       │
 │ ---        ┆ ---                         ┆ ---      ┆ ---    ┆ ---                               │
 │ str        ┆ str                         ┆ bool     ┆ str    ┆ str                               │
 ╞════════════╪═════════════════════════════╪══════════╪════════╪═══════════════════════════════════╡
 │ S63497     ┆ Traumatic rupture of other  ┆ false    ┆ C44102 ┆ Unspecified malignant neoplasm…   │
 │            ┆ lig…                        ┆          ┆        ┆                                   │
 │ T5492XS    ┆ Toxic effect of unspecified ┆ false    ┆ C44102 ┆ Unspecified malignant neoplasm…   │
 │            ┆ co…                         ┆          ┆        ┆                                   │
 └────────────┴─────────────────────────────┴──────────┴────────┴──

In [7]:
# Preview the first rows of df.
df.head()

path,code,category,details,section,part,root_operation,etiology,location,laterality,extension,up_to_etiology,up_to_location,up_to_laterality,description,ancestors
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""A-A00""","""A00""","""A00""",,"""A""","""0""","""0""",,,,,"""A00""",,,"""Cholera""",
"""A-A00-A000""","""A000""","""A00""","""0""","""A""","""0""","""0""","""0""",,,,"""A000""",,,"""Cholera due to Vibrio cholerae…","""A00"""
"""A-A00-A001""","""A001""","""A00""","""1""","""A""","""0""","""0""","""1""",,,,"""A001""",,,"""Cholera due to Vibrio cholerae…","""A00"""
"""A-A00-A009""","""A009""","""A00""","""9""","""A""","""0""","""0""","""9""",,,,"""A009""",,,"""Cholera, unspecified""","""A00"""
"""A-A01""","""A01""","""A01""",,"""A""","""0""","""1""",,,,,"""A01""",,,"""Typhoid and paratyphoid fevers""",


In [13]:
# Show df with 'section' set to the first character of 'code'.
# The result is only displayed; df itself is not reassigned.
df.with_columns(
    section=pl.col('code').str.slice(0, 1)
)

path,code,category,details,section,part,root_operation,etiology,location,laterality,extension,up_to_etiology,up_to_location,up_to_laterality,description,ancestors
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""A-A00""","""A00""","""A00""",,"""A""","""0""","""0""",,,,,"""A00""",,,"""Cholera""",
"""A-A00-A000""","""A000""","""A00""","""0""","""A""","""0""","""0""","""0""",,,,"""A000""",,,"""Cholera due to Vibrio cholerae…","""A00"""
"""A-A00-A001""","""A001""","""A00""","""1""","""A""","""0""","""0""","""1""",,,,"""A001""",,,"""Cholera due to Vibrio cholerae…","""A00"""
"""A-A00-A009""","""A009""","""A00""","""9""","""A""","""0""","""0""","""9""",,,,"""A009""",,,"""Cholera, unspecified""","""A00"""
"""A-A01""","""A01""","""A01""",,"""A""","""0""","""1""",,,,,"""A01""",,,"""Typhoid and paratyphoid fevers""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Z-Z99-Z998-Z9981""","""Z9981""","""Z99""","""81""","""Z""","""9""","""9""","""8""","""1""",,,"""Z998""","""Z9981""",,"""Dependence on supplemental oxy…","""Z99-Z998"""
"""Z-Z99-Z998-Z9989""","""Z9989""","""Z99""","""89""","""Z""","""9""","""9""","""8""","""9""",,,"""Z998""","""Z9989""",,"""Dependence on other enabling m…","""Z99-Z998"""
"""U-U07""","""U07""","""U07""",,"""U""","""0""","""7""",,,,,"""U07""",,,"""Emergency use of U07""",
"""U-U07-U070""","""U070""","""U07""","""0""","""U""","""0""","""7""","""0""",,,,"""U070""",,,"""Vaping-related disorder""","""U07"""


In [3]:
# Preview the first rows of train_df.
# NOTE(review): train_df is not created in any visible cell — presumably it
# comes from a cell that is not shown here; confirm before a fresh-kernel run.
train_df.head()

code,description,code_right,description_right
str,str,str,str
"""A000""","""Cholera due to Vibrio cholerae…","""A00""","""Cholera"""
"""A001""","""Cholera due to Vibrio cholerae…","""A00""","""Cholera"""
"""A009""","""Cholera, unspecified""","""A00""","""Cholera"""
"""A00""","""Cholera""","""A000""","""Cholera due to Vibrio cholerae…"
"""A001""","""Cholera due to Vibrio cholerae…","""A000""","""Cholera due to Vibrio cholerae…"


In [3]:
# 'test' holds "<code> <description>" for the left-hand side of each pair.
train_df = train_df.with_columns(
    test=pl.concat_str(
        [pl.col('code'), pl.col('description')],
        separator=' ',
    )
)

In [4]:
# 'test_right' holds "<code_right> <description_right>" for the right-hand side.
train_df = train_df.with_columns(
    test_right=pl.concat_str(
        [pl.col('code_right'), pl.col('description_right')],
        separator=' ',
    )
)

: 

In [3]:
# Preview the first rows of train_df.
train_df.head()

code,description,code_right,description_right
str,str,str,str
"""A00""","""Cholera""","""A000""","""Cholera due to Vibrio cholerae…"
"""A00""","""Cholera""","""A001""","""Cholera due to Vibrio cholerae…"
"""A00""","""Cholera""","""A009""","""Cholera, unspecified"""
"""A000""","""Cholera due to Vibrio cholerae…","""A00""","""Cholera"""
"""A000""","""Cholera due to Vibrio cholerae…","""A001""","""Cholera due to Vibrio cholerae…"


In [16]:

# Read the NDJSON file and replace empty strings with nulls in the string columns.
test = (
    pl.read_ndjson('/home/gnoblit/takehome/codametrix/data/clean/raw_icd10.ndjson')
    .with_columns(pl.col(pl.String).replace('', None))
)
# Inspect five rows starting at row 42448.
test.slice(42448, 5)

path,code,category,details,section,part,root_operation,etiology,location,laterality,extension,up_to_etiology,up_to_location,up_to_laterality,description,ancestors
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""S-S45-S453-S4531-S45319""","""S45319""","""S45""","""319""","""S""","""4""","""5""","""3""","""1""","""9""",,"""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531"""
"""S-S45-S453-S4531-S45319-S45319…","""S45319A""","""S45""","""319A""","""S""","""4""","""5""","""3""","""1""","""9""","""A""","""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531-S45319"""
"""S-S45-S453-S4531-S45319-S45319…","""S45319D""","""S45""","""319D""","""S""","""4""","""5""","""3""","""1""","""9""","""D""","""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531-S45319"""
"""S-S45-S453-S4531-S45319-S45319…","""S45319S""","""S45""","""319S""","""S""","""4""","""5""","""3""","""1""","""9""","""S""","""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531-S45319"""
"""S-S45-S453-S4539""","""S4539""","""S45""","""39""","""S""","""4""","""5""","""3""","""9""",,,"""S453""","""S4539""","""S4539""","""Other specified injury of supe…","""S45-S453"""


In [32]:
# 'test' is null where up_to_laterality equals up_to_location, and
# up_to_laterality otherwise (including rows where the comparison is null).
test2 = test.with_columns(
    test=pl.when(pl.col('up_to_laterality') == pl.col('up_to_location'))
    .then(None)
    .otherwise(pl.col('up_to_laterality'))
)

# Same five-row window as before, now with the derived column.
test2.slice(42448, 5)

path,code,category,details,section,part,root_operation,etiology,location,laterality,extension,up_to_etiology,up_to_location,up_to_laterality,description,ancestors,test
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""S-S45-S453-S4531-S45319""","""S45319""","""S45""","""319""","""S""","""4""","""5""","""3""","""1""","""9""",,"""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531""","""S45319"""
"""S-S45-S453-S4531-S45319-S45319…","""S45319A""","""S45""","""319A""","""S""","""4""","""5""","""3""","""1""","""9""","""A""","""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531-S45319""","""S45319"""
"""S-S45-S453-S4531-S45319-S45319…","""S45319D""","""S45""","""319D""","""S""","""4""","""5""","""3""","""1""","""9""","""D""","""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531-S45319""","""S45319"""
"""S-S45-S453-S4531-S45319-S45319…","""S45319S""","""S45""","""319S""","""S""","""4""","""5""","""3""","""1""","""9""","""S""","""S453""","""S4531""","""S45319""","""Laceration of superficial vein…","""S45-S453-S4531-S45319""","""S45319"""
"""S-S45-S453-S4539""","""S4539""","""S45""","""39""","""S""","""4""","""5""","""3""","""9""",,,"""S453""","""S4539""","""S4539""","""Other specified injury of supe…","""S45-S453""",


In [9]:
# Self-join on 'up_to_location' and keep only pairs whose descriptions differ.
location_cols = ['code', 'up_to_location', 'description']
up_to_location_df = (
    df.select(location_cols)
    .join(df.select(location_cols), on='up_to_location', how='left')
    .filter(pl.col('description').ne(pl.col('description_right')))
)
up_to_location_df

code,up_to_location,description,code_right,description_right
str,str,str,str,str
"""B0801""","""B0801""","""Cowpox and vaccinia not from v…","""B08010""","""Cowpox"""
"""B0801""","""B0801""","""Cowpox and vaccinia not from v…","""B08011""","""Vaccinia not from vaccine"""
"""B08010""","""B0801""","""Cowpox""","""B0801""","""Cowpox and vaccinia not from v…"
"""B08010""","""B0801""","""Cowpox""","""B08011""","""Vaccinia not from vaccine"""
"""B08011""","""B0801""","""Vaccinia not from vaccine""","""B0801""","""Cowpox and vaccinia not from v…"
…,…,…,…,…
"""Z98871""","""Z9887""","""Personal history of in utero p…","""Z98870""","""Personal history of in utero p…"
"""Z9889""","""Z9889""","""Other specified postprocedural…","""Z98891""","""History of uterine scar from p…"
"""Z98890""","""Z9889""","""Other specified postprocedural…","""Z98891""","""History of uterine scar from p…"
"""Z98891""","""Z9889""","""History of uterine scar from p…","""Z9889""","""Other specified postprocedural…"


NameError: name 'pl' is not defined

In [11]:
# List the distinct values present in 'up_to_laterality'.
df['up_to_laterality'].unique()

up_to_laterality
null
""


In [6]:
# Self-join on 'up_to_laterality' and keep only pairs whose descriptions differ.
laterality_cols = ['code', 'up_to_laterality', 'description']
up_to_laterality_df = (
    df.select(laterality_cols)
    .join(df.select(laterality_cols), on='up_to_laterality', how='left')
    .filter(pl.col('description').ne(pl.col('description_right')))
)
up_to_laterality_df

code,up_to_laterality,description,code_right,description_right
str,null,str,str,str


In [3]:
# Self-join on 'category' and keep only pairs whose descriptions differ.
category_cols = ['code', 'category', 'description']
category_df = (
    df.select(category_cols)
    .join(df.select(category_cols), on='category', how='left')
    .filter(pl.col('description').ne(pl.col('description_right')))
)
category_df

code,category,description,code_right,description_right
str,str,str,str,str
"""A00""","""A00""","""Cholera""","""A000""","""Cholera due to Vibrio cholerae…"
"""A00""","""A00""","""Cholera""","""A001""","""Cholera due to Vibrio cholerae…"
"""A00""","""A00""","""Cholera""","""A009""","""Cholera, unspecified"""
"""A000""","""A00""","""Cholera due to Vibrio cholerae…","""A00""","""Cholera"""
"""A000""","""A00""","""Cholera due to Vibrio cholerae…","""A001""","""Cholera due to Vibrio cholerae…"
…,…,…,…,…
"""U07""","""U07""","""Emergency use of U07""","""U071""","""COVID-19"""
"""U070""","""U07""","""Vaping-related disorder""","""U07""","""Emergency use of U07"""
"""U070""","""U07""","""Vaping-related disorder""","""U071""","""COVID-19"""
"""U071""","""U07""","""COVID-19""","""U07""","""Emergency use of U07"""


In [4]:
# Split the '-'-delimited 'path' string into a list column named 'path_split'.
df = df.with_columns(
    pl.col('path').str.split('-').alias('path_split')
)
# NOTE(review): this subscripts the expression itself with [:-1]. If the goal
# is to drop the last component of each list, list-level slicing through the
# `.list` namespace is probably what is needed — confirm intent and behaviour.
df = df.with_columns(
    pl.col('path_split')[:-1]
)

In [5]:
# Preview the first rows of df (the attribute access was truncated to `df.h`).
df.head()

path,code,category,details,section,part,root_operation,etiology,location,laterality,extension,description,path_split
str,str,str,i64,str,i64,i64,i64,i64,i64,str,str,list[str]
"""A-A00""","""A00""","""A00""",,"""A""",0,0,,,,,"""Cholera""","[""A"", ""A00""]"
"""A-A00-A000""","""A000""","""A00""",0.0,"""A""",0,0,0.0,,,,"""Cholera due to Vibrio cholerae…","[""A"", ""A00"", ""A000""]"
"""A-A00-A001""","""A001""","""A00""",1.0,"""A""",0,0,1.0,,,,"""Cholera due to Vibrio cholerae…","[""A"", ""A00"", ""A001""]"
"""A-A00-A009""","""A009""","""A00""",9.0,"""A""",0,0,9.0,,,,"""Cholera, unspecified""","[""A"", ""A00"", ""A009""]"
"""A-A01""","""A01""","""A01""",,"""A""",0,1,,,,,"""Typhoid and paratyphoid fevers""","[""A"", ""A01""]"


In [8]:
# Show only the three columns used for the category self-join.
df.select(['code', 'category', 'description'])

code,category,description
str,str,str
"""A00""","""A00""","""Cholera"""
"""A000""","""A00""","""Cholera due to Vibrio cholerae…"
"""A001""","""A00""","""Cholera due to Vibrio cholerae…"
"""A009""","""A00""","""Cholera, unspecified"""
"""A01""","""A01""","""Typhoid and paratyphoid fevers"""
…,…,…
"""Z9981""","""Z99""","""Dependence on supplemental oxy…"
"""Z9989""","""Z99""","""Dependence on other enabling m…"
"""U07""","""U07""","""Emergency use of U07"""
"""U070""","""U07""","""Vaping-related disorder"""


In [10]:
# Self-join on 'category' with no filtering afterwards, so every code is also
# paired with itself in the displayed result.
df.select(['code', 'category', 'description']).join(
    df.select(['code', 'category', 'description']),
    how='left',
    on='category'
)

code,category,description,code_right,description_right
str,str,str,str,str
"""A00""","""A00""","""Cholera""","""A00""","""Cholera"""
"""A00""","""A00""","""Cholera""","""A000""","""Cholera due to Vibrio cholerae…"
"""A00""","""A00""","""Cholera""","""A001""","""Cholera due to Vibrio cholerae…"
"""A00""","""A00""","""Cholera""","""A009""","""Cholera, unspecified"""
"""A000""","""A00""","""Cholera due to Vibrio cholerae…","""A00""","""Cholera"""
…,…,…,…,…
"""U070""","""U07""","""Vaping-related disorder""","""U070""","""Vaping-related disorder"""
"""U070""","""U07""","""Vaping-related disorder""","""U071""","""COVID-19"""
"""U071""","""U07""","""COVID-19""","""U07""","""Emergency use of U07"""
"""U071""","""U07""","""COVID-19""","""U070""","""Vaping-related disorder"""


In [126]:
# Look up the description(s) stored for code 'Z189'.
df.filter(pl.col('code').eq('Z189')).get_column('description').to_list()

['Retained foreign body fragments, unspecified material']