# Data Import

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('microbiology_events_codes_3.csv')

In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'chartdate', 'charttime', 'spec_type_desc',
       'test_name', 'org_name', 'ab_name', 'dilution_text',
       'dilution_comparison', 'dilution_value', 'interpretation',
       'technician_id', 'qc_flag'],
      dtype='object')

In [4]:
df

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,technician_id,qc_flag
0,10000980,26913865.0,2189-06-27 00:00:00,2189-06-27 10:52:00,MRSA SCREEN,MRSA SCREEN,,,,,,,TECH_063,QC_OK
1,10002155,23822395.0,2129-08-04 00:00:00,2129-08-04 17:04:00,MRSA SCREEN,MRSA SCREEN,,,,,,,TECH_095,QC_OK
2,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 15:54:00,URINE,Legionella Urinary Antigen,,,,,,,TECH_083,QC_OK
3,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,GRAM STAIN,,,,,,,TECH_095,QC_OK
4,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,RESPIRATORY CULTURE,,,,,,,TECH_077,QC_OK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15582,19997665,26052266.0,2173-11-09 00:00:00,2173-11-09 18:09:00,Staph aureus swab,Staph aureus Screen,,,,,,,TECH_051,QC_OK
15583,19997666,24256422.0,2168-08-03 00:00:00,2168-08-03 22:00:00,BLOOD CULTURE,"Blood Culture, Routine",,,,,,,TECH_086,QC_OK
15584,19997667,20372003.0,2144-09-21 00:00:00,2144-09-21 20:14:00,URINE,URINE CULTURE,PSEUDOMONAS AERUGINOSA,MEROPENEM,8,=,8.0,I,TECH_115,QC_FAIL
15585,19997668,20329436.0,2129-08-12 00:00:00,2129-08-12 15:56:00,CATHETER TIP-IV,WOUND CULTURE,,,,,,,TECH_061,QC_OK


### Drop Column

In [5]:
# dropped_df = df.drop(columns=['subject_id', 'hadm_id', 'chartdate', 'charttime', 'technician_id'])
dropped_df = df.drop(columns=['dilution_text', 'dilution_comparison'])

In [6]:
# dropped_df.describe()

### Data Cleaning

##### Duplicates: Only 1 Duplicate found.

In [7]:
removed = dropped_df[dropped_df.duplicated(keep='first')]
df_cleaned = dropped_df.drop_duplicates(keep='first')

In [8]:
removed

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
2455,11823798,23491105.0,2186-07-16 00:00:00,2186-07-16 00:25:00,BLOOD CULTURE,"Blood Culture, Routine",,,,,TECH_120,QC_OK


In [9]:
df_cleaned

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
0,10000980,26913865.0,2189-06-27 00:00:00,2189-06-27 10:52:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_063,QC_OK
1,10002155,23822395.0,2129-08-04 00:00:00,2129-08-04 17:04:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_095,QC_OK
2,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 15:54:00,URINE,Legionella Urinary Antigen,,,,,TECH_083,QC_OK
3,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,GRAM STAIN,,,,,TECH_095,QC_OK
4,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,RESPIRATORY CULTURE,,,,,TECH_077,QC_OK
...,...,...,...,...,...,...,...,...,...,...,...,...
15582,19997665,26052266.0,2173-11-09 00:00:00,2173-11-09 18:09:00,Staph aureus swab,Staph aureus Screen,,,,,TECH_051,QC_OK
15583,19997666,24256422.0,2168-08-03 00:00:00,2168-08-03 22:00:00,BLOOD CULTURE,"Blood Culture, Routine",,,,,TECH_086,QC_OK
15584,19997667,20372003.0,2144-09-21 00:00:00,2144-09-21 20:14:00,URINE,URINE CULTURE,PSEUDOMONAS AERUGINOSA,MEROPENEM,8.0,I,TECH_115,QC_FAIL
15585,19997668,20329436.0,2129-08-12 00:00:00,2129-08-12 15:56:00,CATHETER TIP-IV,WOUND CULTURE,,,,,TECH_061,QC_OK


##### Check for Whitespaces, Removing Whitespaces

In [10]:
# detect string columns
str_cols = df_cleaned.select_dtypes(include=['object']).columns
str_cols

Index(['chartdate', 'charttime', 'spec_type_desc', 'test_name', 'org_name',
       'ab_name', 'interpretation', 'technician_id', 'qc_flag'],
      dtype='object')

In [11]:
# show rows that will change
changed = df_cleaned[df_cleaned[str_cols].apply(lambda x: x != x.str.strip()).any(axis=1)]
# Modified rows before cleanup:
changed

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
0,10000980,26913865.0,2189-06-27 00:00:00,2189-06-27 10:52:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_063,QC_OK
1,10002155,23822395.0,2129-08-04 00:00:00,2129-08-04 17:04:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_095,QC_OK
2,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 15:54:00,URINE,Legionella Urinary Antigen,,,,,TECH_083,QC_OK
3,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,GRAM STAIN,,,,,TECH_095,QC_OK
4,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,RESPIRATORY CULTURE,,,,,TECH_077,QC_OK
...,...,...,...,...,...,...,...,...,...,...,...,...
15579,19997662,29338106.0,2182-08-22 00:00:00,2182-08-22 09:09:00,URINE,Legionella Urinary Antigen,,,,,TECH_082,QC_OK
15581,19997664,25289714.0,2186-01-02 00:00:00,2186-01-02 03:40:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_049,QC_OK
15582,19997665,26052266.0,2173-11-09 00:00:00,2173-11-09 18:09:00,Staph aureus swab,Staph aureus Screen,,,,,TECH_051,QC_OK
15583,19997666,24256422.0,2168-08-03 00:00:00,2168-08-03 22:00:00,BLOOD CULTURE,"Blood Culture, Routine",,,,,TECH_086,QC_OK


In [12]:
# clean whitespace
df_cleaned[str_cols] = df_cleaned[str_cols].apply(lambda x: x.str.strip())
df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[str_cols] = df_cleaned[str_cols].apply(lambda x: x.str.strip())


Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
0,10000980,26913865.0,2189-06-27 00:00:00,2189-06-27 10:52:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_063,QC_OK
1,10002155,23822395.0,2129-08-04 00:00:00,2129-08-04 17:04:00,MRSA SCREEN,MRSA SCREEN,,,,,TECH_095,QC_OK
2,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 15:54:00,URINE,Legionella Urinary Antigen,,,,,TECH_083,QC_OK
3,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,GRAM STAIN,,,,,TECH_095,QC_OK
4,10002155,23822395.0,2129-08-05 00:00:00,2129-08-05 18:43:00,SPUTUM,RESPIRATORY CULTURE,,,,,TECH_077,QC_OK
...,...,...,...,...,...,...,...,...,...,...,...,...
15582,19997665,26052266.0,2173-11-09 00:00:00,2173-11-09 18:09:00,Staph aureus swab,Staph aureus Screen,,,,,TECH_051,QC_OK
15583,19997666,24256422.0,2168-08-03 00:00:00,2168-08-03 22:00:00,BLOOD CULTURE,"Blood Culture, Routine",,,,,TECH_086,QC_OK
15584,19997667,20372003.0,2144-09-21 00:00:00,2144-09-21 20:14:00,URINE,URINE CULTURE,PSEUDOMONAS AERUGINOSA,MEROPENEM,8.0,I,TECH_115,QC_FAIL
15585,19997668,20329436.0,2129-08-12 00:00:00,2129-08-12 15:56:00,CATHETER TIP-IV,WOUND CULTURE,,,,,TECH_061,QC_OK


#### Fix data types

##### Convert chartdate, charttime → datetime

In [13]:
df_cleaned['chartdate'] = pd.to_datetime(df_cleaned['chartdate'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['chartdate'] = pd.to_datetime(df_cleaned['chartdate'], errors='coerce')


##### Convert dilution_value → numeric (float)

In [14]:
df_cleaned['dilution_value'] = pd.to_numeric(df_cleaned['dilution_value'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['dilution_value'] = pd.to_numeric(df_cleaned['dilution_value'], errors='coerce')


##### Ensure categorical columns (spec_type_desc, test_name, interpretation) are strings.

In [15]:
import numpy as np

# Apply cleaning only to object (string) columns
for col in df_cleaned.select_dtypes(include=['object']).columns:
    df_cleaned[col] = (
        df_cleaned[col]
        .astype(str)  # ensure it's string
        .str.replace(r'\\xa0', ' ', regex=True)  # literal "\xa0"
        .str.replace(u'\xa0', ' ', regex=False)  # actual unicode non-breaking space
        .str.replace(r'[\t\n\r]+', ' ', regex=True)  # remove tabs/newlines
        .str.replace(r'[^\x00-\x7F]+', '', regex=True)  # remove non-ASCII
        .replace({'': np.nan})  # replace empty strings with NaN
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

##### Convert Them All to Strings

In [16]:
categorical_cols = ['spec_type_desc', 'test_name', 'interpretation', 'org_name', 'technician_id', 'qc_flag']

for col in categorical_cols:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].astype('string').str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = df_cleaned[col].astype('string').str.strip()


##### Normalizing Case for Consistency

In [17]:
df_cleaned[categorical_cols] = df_cleaned[categorical_cols].apply(lambda x: x.str.upper())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[categorical_cols] = df_cleaned[categorical_cols].apply(lambda x: x.str.upper())


## Rough Work

### Analysis Done

In [18]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15586 entries, 0 to 15586
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   subject_id      15586 non-null  int64         
 1   hadm_id         15586 non-null  float64       
 2   chartdate       15586 non-null  datetime64[ns]
 3   charttime       15586 non-null  object        
 4   spec_type_desc  15586 non-null  string        
 5   test_name       15586 non-null  string        
 6   org_name        15586 non-null  string        
 7   ab_name         15586 non-null  object        
 8   dilution_value  4711 non-null   float64       
 9   interpretation  15586 non-null  string        
 10  technician_id   15586 non-null  string        
 11  qc_flag         15586 non-null  string        
dtypes: datetime64[ns](1), float64(2), int64(1), object(2), string(6)
memory usage: 1.5+ MB


In [19]:
df_cleaned.describe(include='all')

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
count,15586.0,15586.0,15586,15586,15586,15586,15586,15586.0,4711.0,15586,15586,15586
unique,,,,7709,42,90,83,28.0,,4,120,3
top,,,,2185-01-09 10:55:00,URINE,URINE CULTURE,NAN,,,NAN,TECH_037,QC_OK
freq,,,,34,4951,4795,10195,10766.0,,10766,172,13235
mean,15221200.0,25034790.0,2155-02-04 12:15:25.753882624,,,,,,7.06242,,,
min,10000980.0,20007900.0,2110-02-15 00:00:00,,,,,,0.06,,,
25%,12662770.0,22662300.0,2135-12-02 00:00:00,,,,,,0.5,,,
50%,15198230.0,25146490.0,2155-08-21 00:00:00,,,,,,1.0,,,
75%,17721890.0,27431160.0,2174-10-09 18:00:00,,,,,,4.0,,,
max,19997670.0,29999670.0,2209-12-20 00:00:00,,,,,,512.0,,,


##### Check for all string-looking empties across all columns

In [20]:
df_cleaned

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
0,10000980,26913865.0,2189-06-27,2189-06-27 10:52:00,MRSA SCREEN,MRSA SCREEN,NAN,,,NAN,TECH_063,QC_OK
1,10002155,23822395.0,2129-08-04,2129-08-04 17:04:00,MRSA SCREEN,MRSA SCREEN,NAN,,,NAN,TECH_095,QC_OK
2,10002155,23822395.0,2129-08-05,2129-08-05 15:54:00,URINE,LEGIONELLA URINARY ANTIGEN,NAN,,,NAN,TECH_083,QC_OK
3,10002155,23822395.0,2129-08-05,2129-08-05 18:43:00,SPUTUM,GRAM STAIN,NAN,,,NAN,TECH_095,QC_OK
4,10002155,23822395.0,2129-08-05,2129-08-05 18:43:00,SPUTUM,RESPIRATORY CULTURE,NAN,,,NAN,TECH_077,QC_OK
...,...,...,...,...,...,...,...,...,...,...,...,...
15582,19997665,26052266.0,2173-11-09,2173-11-09 18:09:00,STAPH AUREUS SWAB,STAPH AUREUS SCREEN,NAN,,,NAN,TECH_051,QC_OK
15583,19997666,24256422.0,2168-08-03,2168-08-03 22:00:00,BLOOD CULTURE,"BLOOD CULTURE, ROUTINE",NAN,,,NAN,TECH_086,QC_OK
15584,19997667,20372003.0,2144-09-21,2144-09-21 20:14:00,URINE,URINE CULTURE,PSEUDOMONAS AERUGINOSA,MEROPENEM,8.0,I,TECH_115,QC_FAIL
15585,19997668,20329436.0,2129-08-12,2129-08-12 15:56:00,CATHETER TIP-IV,WOUND CULTURE,NAN,,,NAN,TECH_061,QC_OK


In [35]:
df_cleaned.isnull().sum()

subject_id                    0
hadm_id                       0
chartdate                     0
charttime                     0
spec_type_desc                0
test_name                     0
org_name                      0
ab_name                       0
dilution_value            10875
interpretation                0
technician_id                 0
qc_flag                       0
interpretation_encoded        0
dtype: int64

##### Checking Subject id Duplicates

In [23]:
dic = {}
for i in df_cleaned['subject_id'].unique():
    if len(df_cleaned[df_cleaned['subject_id']==i])>1:
        # print(df[df['subject_id']==i].iloc[:, :3])
        dic[i] = len(df_cleaned[df_cleaned['subject_id']==i])

In [24]:
dict(sorted(dic.items(), key=lambda item: item[1], reverse= True))

{12492737: 122,
 13135946: 107,
 11204646: 89,
 15016682: 83,
 16571922: 73,
 12612603: 65,
 15094687: 64,
 19058876: 63,
 14383658: 61,
 17347569: 60,
 10274145: 56,
 19235707: 55,
 10297774: 54,
 10684958: 54,
 19615440: 53,
 13086918: 52,
 14861499: 52,
 18914188: 52,
 14023270: 51,
 16148712: 51,
 13408370: 50,
 13243522: 49,
 18257010: 48,
 19075045: 46,
 11504038: 44,
 13021449: 44,
 17660889: 44,
 12012265: 40,
 17857718: 39,
 19367944: 39,
 17725424: 38,
 18230098: 37,
 19270175: 36,
 14033331: 35,
 14363820: 35,
 16221250: 35,
 16762739: 35,
 19495094: 35,
 11684446: 34,
 15056680: 34,
 16261478: 34,
 19528617: 34,
 14640197: 33,
 15954569: 33,
 18912334: 33,
 10108435: 32,
 12172562: 32,
 13269747: 32,
 13537167: 32,
 15730033: 32,
 16218892: 32,
 16291750: 32,
 17738453: 32,
 15762152: 31,
 17486231: 31,
 18703566: 31,
 19324712: 31,
 10337761: 30,
 12654170: 30,
 15112972: 30,
 16047946: 30,
 17890530: 30,
 11281603: 29,
 11655439: 29,
 13297426: 29,
 17200755: 29,
 1827132

In [25]:
df_cleaned[df_cleaned['subject_id']==11823798]

Unnamed: 0,subject_id,hadm_id,chartdate,charttime,spec_type_desc,test_name,org_name,ab_name,dilution_value,interpretation,technician_id,qc_flag
2452,11823798,23491105.0,2186-07-14,2186-07-14 16:54:00,URINE,URINE CULTURE,NAN,,,NAN,TECH_005,QC_OK
2453,11823798,23491105.0,2186-07-14,2186-07-14 16:55:00,STAPH AUREUS SWAB,STAPH AUREUS SCREEN,NAN,,,NAN,TECH_050,QC_OK
2454,11823798,23491105.0,2186-07-16,2186-07-16 00:25:00,BLOOD CULTURE,"BLOOD CULTURE, ROUTINE",NAN,,,NAN,TECH_120,QC_OK
2456,11823798,23491105.0,2186-07-16,2186-07-16 00:47:00,URINE,URINE CULTURE,NAN,,,NAN,TECH_058,QC_OK
2457,11823798,23491105.0,2186-07-20,2186-07-20 17:56:00,URINE,URINE CULTURE,NAN,,,NAN,TECH_065,QC_OK
2458,11823798,23491105.0,2186-07-24,2186-07-24 12:08:00,URINE,URINE CULTURE,ENTEROCOCCUS SP.,AMPICILLIN,2.0,S,TECH_008,QC_OK
2459,11823798,23491105.0,2186-07-24,2186-07-24 12:08:00,URINE,URINE CULTURE,ENTEROCOCCUS SP.,NITROFURANTOIN,16.0,S,TECH_069,QC_OK
2460,11823798,23491105.0,2186-07-24,2186-07-24 12:08:00,URINE,URINE CULTURE,ENTEROCOCCUS SP.,TETRACYCLINE,16.0,R,TECH_033,QC_WARN
2461,11823798,23491105.0,2186-07-24,2186-07-24 12:08:00,URINE,URINE CULTURE,ENTEROCOCCUS SP.,VANCOMYCIN,1.0,S,TECH_023,QC_OK
2462,11823798,23491105.0,2186-07-24,2186-07-24 13:00:00,BLOOD CULTURE,"BLOOD CULTURE, ROUTINE",NAN,,,NAN,TECH_031,QC_OK


In [26]:
df_cleaned['spec_type_desc'].value_counts()

spec_type_desc
URINE                                                       4951
BLOOD CULTURE                                               3375
SPUTUM                                                      1847
MRSA SCREEN                                                 1241
STAPH AUREUS SWAB                                            645
PLEURAL FLUID                                                618
STOOL                                                        581
FLUID,OTHER                                                  449
SWAB                                                         344
TISSUE                                                       206
SEROLOGY/BLOOD                                               161
FLUID RECEIVED IN BLOOD CULTURE BOTTLES                      123
CATHETER TIP-IV                                              113
BLOOD (EBV)                                                  112
BRONCHOALVEOLAR LAVAGE                                       107
RAPID RESP

In [27]:
df_cleaned['ab_name'].value_counts()

ab_name
nan                     10766
GENTAMICIN                453
TRIMETHOPRIM/SULFA        389
TOBRAMYCIN                287
CEFTAZIDIME               285
CIPROFLOXACIN             284
MEROPENEM                 283
CEFEPIME                  282
NITROFURANTOIN            261
CEFTRIAXONE               248
PIPERACILLIN/TAZO         223
AMPICILLIN                206
AMPICILLIN/SULBACTAM      201
CEFAZOLIN                 197
TETRACYCLINE              186
LEVOFLOXACIN              173
OXACILLIN                 171
VANCOMYCIN                169
ERYTHROMYCIN              153
CLINDAMYCIN               146
RIFAMPIN                   57
PIPERACILLIN               41
CEFUROXIME                 36
PENICILLIN G               35
AMIKACIN                   27
LINEZOLID                  14
IMIPENEM                    9
DAPTOMYCIN                  4
Name: count, dtype: int64

In [28]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Fit and transform the 'color' column
df_cleaned['interpretation_encoded'] = encoder.fit_transform(df_cleaned['interpretation'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['interpretation_encoded'] = encoder.fit_transform(df_cleaned['interpretation'])


In [29]:
df_cleaned['interpretation'].dropna()

0        NAN
1        NAN
2        NAN
3        NAN
4        NAN
        ... 
15582    NAN
15583    NAN
15584      I
15585    NAN
15586      S
Name: interpretation, Length: 15586, dtype: string

In [30]:
df_cleaned['interpretation_encoded'].dropna()

0        1
1        1
2        1
3        1
4        1
        ..
15582    1
15583    1
15584    0
15585    1
15586    3
Name: interpretation_encoded, Length: 15586, dtype: int32

In [31]:
df_cleaned['dilution_value'].corr(df_cleaned['interpretation_encoded'])

-0.3733898243108889