In [1]:
## Data analysis and pre-processing of the raw data

In [2]:

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils import encoders

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv

In [4]:
# Load the raw export into a DataFrame.
# NOTE(review): on_bad_lines='skip' discards malformed rows without any
# warning, so the frame may hold fewer rows than the file has lines.
data = pd.read_csv("../data/raw/asts.csv", on_bad_lines='skip')
# The 'BioSample' column is removed straight away; nothing below uses it.
data = data.drop(columns='BioSample')
print(data.head())

        Organism group                                    Scientific name  \
0  Salmonella enterica  Salmonella enterica subsp. enterica serovar 4,...   
1  Salmonella enterica  Salmonella enterica subsp. enterica serovar Ke...   
2  Salmonella enterica  Salmonella enterica subsp. enterica serovar Ke...   
3  Salmonella enterica  Salmonella enterica subsp. enterica serovar Hadar   
4  Salmonella enterica                Salmonella enterica subsp. enterica   

        Isolation type Location Isolation source         Isolate Antibiotic  \
0  environmental/other      USA   chicken breast  PDT000003687.3   amikacin   
1  environmental/other      USA   chicken breast  PDT000003688.4   amikacin   
2  environmental/other      USA   chicken breast  PDT000003689.4   amikacin   
3  environmental/other      USA   chicken breast  PDT000003690.3   amikacin   
4  environmental/other      USA        pork chop  PDT000003691.3   amikacin   

  Resistance phenotype Measurement sign  MIC (mg/L)  Disk diff

In [5]:
# Normalise the column names: lowercase them and replace spaces and '/' with
# underscores, e.g. 'MIC (mg/L)' -> 'mic_(mg_l)'. regex=False makes the
# replacements literal regardless of the pandas version's default.
data.columns = (
    data.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457155 entries, 0 to 457154
Data columns (total 16 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   organism_group                               457155 non-null  object 
 1   scientific_name                              457155 non-null  object 
 2   isolation_type                               455418 non-null  object 
 3   location                                     436152 non-null  object 
 4   isolation_source                             428448 non-null  object 
 5   isolate                                      457155 non-null  object 
 6   antibiotic                                   457155 non-null  object 
 7   resistance_phenotype                         457155 non-null  object 
 8   measurement_sign                             451768 non-null  object 
 9   mic_(mg_l)                                   447153 non-nul

In [6]:
# Survey the text-valued columns to decide how each one should be encoded.
# Columns stored with the "object" dtype are treated as categorical here.
categorical_cols = data.select_dtypes(include="object").columns

# Number of distinct values per categorical column, largest first.
unique_counts = (
    data.loc[:, categorical_cols]
    .nunique()
    .sort_values(ascending=False)
)

print(unique_counts)

isolate                                        32605
create_date                                    13976
isolation_source                                1116
scientific_name                                  429
location                                         338
antibiotic                                       138
organism_group                                    44
laboratory_typing_method_version_or_reagent       23
laboratory_typing_platform                        19
vendor                                            14
resistance_phenotype                               6
measurement_sign                                   6
testing_standard                                   5
isolation_type                                     2
dtype: int64


## Data Encoding

| Column Name                                      | Unique Values | Encoding Recommendation |
|-------------------------------------------------|--------------|-------------------------|
| **`isolate`**                                   | 32,605       | **Drop / Frequency Encoding** (Likely an ID, so might not be useful) |
| **`create_date`**                               | 13,976       | **Drop or Convert to Date Features** (Extract year, month, etc.) |
| **`isolation_source`**                          | 1,116        | **Label Encoding / Target Encoding** |
| **`scientific_name`**                           | 429          | **Label Encoding / Target Encoding** |
| **`location`**                                  | 338          | **Label Encoding / Frequency Encoding** |
| **`antibiotic`**                                | 138          | **Label Encoding / Target Encoding** |
| **`organism_group`**                            | 44           | **One-Hot Encoding / Label Encoding** |
| **`laboratory_typing_method_version_or_reagent`** | 23           | **One-Hot Encoding / Label Encoding** |
| **`laboratory_typing_platform`**                | 19           | **One-Hot Encoding / Label Encoding** |
| **`vendor`**                                    | 14           | **One-Hot Encoding / Label Encoding** |
| **`resistance_phenotype`**                      | 6            | **One-Hot Encoding** |
| **`measurement_sign`**                          | 6            | **One-Hot Encoding** |
| **`testing_standard`**                          | 5            | **One-Hot Encoding** |
| **`isolation_type`**                            | 2            | **One-Hot Encoding** |


In [7]:
# Drop the unimportant 'isolate' column.
data = data.drop(columns=['isolate'])

In [8]:
# Report the number of missing values in each column.
# A blanket fill with 0 is left disabled: data = data.fillna(0)
print(data.isna().sum())

organism_group                                      0
scientific_name                                     0
isolation_type                                   1737
location                                        21003
isolation_source                                28707
antibiotic                                          0
resistance_phenotype                                0
measurement_sign                                 5387
mic_(mg_l)                                      10002
disk_diffusion_(mm)                            452540
laboratory_typing_platform                     140429
vendor                                         164458
laboratory_typing_method_version_or_reagent    200454
testing_standard                                11420
create_date                                         0
dtype: int64


In [9]:
# Create the encoder helper that the following cells reuse, then one-hot
# encode 'isolation_type' and append the result to the frame column-wise.
cat_enc = encoders.CategoricalEncoders()

isolation_type_encoded = cat_enc.one_hot_encode(data['isolation_type'])
data = pd.concat(
    objs=[data, isolation_type_encoded],
    axis="columns",
)

print(data.head())

        organism_group                                    scientific_name  \
0  Salmonella enterica  Salmonella enterica subsp. enterica serovar 4,...   
1  Salmonella enterica  Salmonella enterica subsp. enterica serovar Ke...   
2  Salmonella enterica  Salmonella enterica subsp. enterica serovar Ke...   
3  Salmonella enterica  Salmonella enterica subsp. enterica serovar Hadar   
4  Salmonella enterica                Salmonella enterica subsp. enterica   

        isolation_type location isolation_source antibiotic  \
0  environmental/other      USA   chicken breast   amikacin   
1  environmental/other      USA   chicken breast   amikacin   
2  environmental/other      USA   chicken breast   amikacin   
3  environmental/other      USA   chicken breast   amikacin   
4  environmental/other      USA        pork chop   amikacin   

  resistance_phenotype measurement_sign  mic_(mg_l)  disk_diffusion_(mm)  \
0          susceptible               ==         1.0                  NaN   
1     

In [10]:
# The raw 'isolation_type' column is no longer needed once it has been encoded.
data = data.drop(columns=['isolation_type'])

In [11]:
# One-hot encode the three low-cardinality columns. The remaining categorical
# columns (antibiotic, organism_group, laboratory_typing_method_version_or_reagent,
# laboratory_typing_platform, vendor) are encoded in later cells.
resistance_phenotype, measurement_sign, testing_standard = (
    cat_enc.one_hot_encode(data[column_name])
    for column_name in ('resistance_phenotype', 'measurement_sign', 'testing_standard')
)

In [12]:
# Append the three one-hot encoded blocks to the frame, column-wise.
data = pd.concat(
    objs=[data, resistance_phenotype, measurement_sign, testing_standard],
    axis="columns",
)

In [13]:
# Keep the original phenotype labels before their column is removed.
target_phenotypes = data['resistance_phenotype']

# The source columns of the one-hot encoding are now redundant.
data = data.drop(
    columns=['resistance_phenotype', 'measurement_sign', 'testing_standard']
)

In [14]:
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(data.info()) would add.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457155 entries, 0 to 457154
Data columns (total 30 columns):
 #   Column                                           Non-Null Count   Dtype  
---  ------                                           --------------   -----  
 0   organism_group                                   457155 non-null  object 
 1   scientific_name                                  457155 non-null  object 
 2   location                                         436152 non-null  object 
 3   isolation_source                                 428448 non-null  object 
 4   antibiotic                                       457155 non-null  object 
 5   mic_(mg_l)                                       447153 non-null  float64
 6   disk_diffusion_(mm)                              4615 non-null    float64
 7   laboratory_typing_platform                       316726 non-null  object 
 8   vendor                                           292697 non-null  object 
 9   laboratory_typi

In [15]:
# Label-encode the medium-cardinality columns in place: each listed column is
# replaced by its integer codes, and the fitted classes are printed per column.
label_encoded_headings = ["organism_group","laboratory_typing_method_version_or_reagent","laboratory_typing_platform","vendor"]

for heading in label_encoded_headings:
    # Encode the column named by `heading`. Encoding data['organism_group'] on
    # every pass would overwrite all four columns with organism-group codes.
    # NOTE(review): every column here except organism_group has missing values
    # (see the null counts above) — confirm label_encode handles NaN.
    encoded, encoder = cat_enc.label_encode(data[heading])
    print(encoder.classes_, " : ", heading)
    data[heading] = encoded

['Acinetobacter baumannii' 'Aeromonas salmonicida' 'Bacillus cereus group'
 'Burkholderia cepacia complex' 'Campylobacter jejuni' 'Candidozyma auris'
 'Citrobacter freundii' 'Clostridioides difficile'
 'Corynebacterium striatum' 'Cronobacter' 'E.coli and Shigella'
 'Enterobacter asburiae' 'Enterobacter bugandensis' 'Enterobacter cloacae'
 'Enterobacter hormaechei' 'Enterobacter kobei' 'Enterobacter ludwigii'
 'Enterobacter mori' 'Enterobacter roggenkampii' 'Enterococcus faecalis'
 'Enterococcus faecium' 'Klebsiella oxytoca' 'Klebsiella pneumoniae'
 'Kluyvera_intermedia' 'Listeria monocytogenes' 'Morganella morganii'
 'Mycobacterium tuberculosis' 'Neisseria gonorrhoeae'
 'Neisseria meningitidis' 'Pasteurella multocida'
 'Pluralibacter gergoviae' 'Providencia alcalifaciens'
 'Pseudomonas aeruginosa' 'Pseudomonas putida' 'Salmonella enterica'
 'Serratia marcescens' 'Staphylococcus aureus'
 'Staphylococcus pseudintermedius' 'Stenotrophomonas maltophilia'
 'Streptococcus agalactiae' 'Strept

In [16]:
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(data.info()) would add.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457155 entries, 0 to 457154
Data columns (total 30 columns):
 #   Column                                           Non-Null Count   Dtype  
---  ------                                           --------------   -----  
 0   organism_group                                   457155 non-null  int64  
 1   scientific_name                                  457155 non-null  object 
 2   location                                         436152 non-null  object 
 3   isolation_source                                 428448 non-null  object 
 4   antibiotic                                       457155 non-null  object 
 5   mic_(mg_l)                                       447153 non-null  float64
 6   disk_diffusion_(mm)                              4615 non-null    float64
 7   laboratory_typing_platform                       457155 non-null  int64  
 8   vendor                                           457155 non-null  int64  
 9   laboratory_typi

In [17]:
# Remove the 'create_date' column from the frame.
data = data.drop(columns="create_date")

In [18]:
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(data.info()) would add.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457155 entries, 0 to 457154
Data columns (total 29 columns):
 #   Column                                           Non-Null Count   Dtype  
---  ------                                           --------------   -----  
 0   organism_group                                   457155 non-null  int64  
 1   scientific_name                                  457155 non-null  object 
 2   location                                         436152 non-null  object 
 3   isolation_source                                 428448 non-null  object 
 4   antibiotic                                       457155 non-null  object 
 5   mic_(mg_l)                                       447153 non-null  float64
 6   disk_diffusion_(mm)                              4615 non-null    float64
 7   laboratory_typing_platform                       457155 non-null  int64  
 8   vendor                                           457155 non-null  int64  
 9   laboratory_typi

In [22]:
# Copy out the one-hot phenotype indicator columns; they are the target that
# is passed to the target encoder below.
resistance_target_encoded = data.loc[:, [
    'resistance_phenotype_intermediate',
    'resistance_phenotype_nonsusceptible',
    'resistance_phenotype_not defined',
    'resistance_phenotype_resistant',
    'resistance_phenotype_susceptible',
    'resistance_phenotype_susceptible-dose dependent',
]].copy()


def _target_encode_column(column_name):
    """Target-encode one column of `data` against the phenotype indicators."""
    return cat_enc.target_encode(data[column_name], resistance_target_encoded)


# Each call returns the encoded values together with a second object, stored
# here under the *_dictionary names. 'location' is frequency-encoded instead.
encoded_scientific_name, encoder_scientific_name_dictionary = _target_encode_column('scientific_name')

encoded_isolation_source, encoder_isolation_source_dictionary = _target_encode_column('isolation_source')

encoded_location, encoder_location_dictionary = cat_enc.frequency_encode(data['location'])

encoded_antibiotic, encoder_antibiotic_dictionary = _target_encode_column('antibiotic')



In [23]:
# Attach the encoded versions of the four columns to the frame, column-wise.
data = pd.concat(
    objs=[
        data,
        encoded_scientific_name,
        encoded_isolation_source,
        encoded_location,
        encoded_antibiotic,
    ],
    axis="columns",
)

In [24]:
# Remove the raw columns now that their encoded counterparts are in the frame.
data = data.drop(
    columns=['scientific_name', 'isolation_source', 'location', 'antibiotic']
)

In [26]:
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(data.info()) would add.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457155 entries, 0 to 457154
Data columns (total 43 columns):
 #   Column                                                               Non-Null Count   Dtype  
---  ------                                                               --------------   -----  
 0   organism_group                                                       457155 non-null  int64  
 1   mic_(mg_l)                                                           447153 non-null  float64
 2   disk_diffusion_(mm)                                                  4615 non-null    float64
 3   laboratory_typing_platform                                           457155 non-null  int64  
 4   vendor                                                               457155 non-null  int64  
 5   laboratory_typing_method_version_or_reagent                          457155 non-null  int64  
 6   isolation_type_clinical                                              457155 non-null  int64 

In [27]:
# Write the encoded frame to a CSV file; index=False leaves the row index out.
processed_path = "../data/processed/asts.csv"

# Create the output directory if it is missing so the write cannot fail on a
# fresh checkout; exist_ok=True makes this a no-op when it already exists.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
data.to_csv(processed_path, index=False)