Kernel: aidocs

In [None]:
from pymongo import MongoClient
import pandas as pd

# Open the local MongoDB server and pick the database / collection to read
client = MongoClient('mongodb://localhost:27017/')
db = client['MIMIC-IV']
collection = db['NLP-EXPANDED-prammed-postprocessed_translation']

# Fetch every document, restricted to the fields used in this notebook
docs = collection.find(
    {},
    projection={
        '_id': 1,
        'stay_id': 1,
        'json_data_used': 1,
        'dialogue': 1,
        'final_dialogue': 1,
        'final_to_english': 1,
    },
)

# Turn each document into a flat record; fields absent from a document fall
# back to an empty string (or an empty dict for json_data_used)
data = []
for doc in docs:
    row = {
        '_id': str(doc.get('_id', '')),
        'stay_id': doc.get('stay_id', ''),
        'json_data_used': doc.get('json_data_used', {}),
    }
    row.update({
        name: doc.get(name, '')
        for name in ('dialogue', 'final_dialogue', 'final_to_english')
    })
    data.append(row)

# Build the frame from the collected records in one go
df = pd.DataFrame(data)

# Print a preview of the first rows
print(df.head())

In [None]:
# Show the frame built from the MongoDB documents (rendered as the cell output)
df

In [None]:
import pandas as pd

def modify_json_data(row):
    """Reduce a row's ``json_data_used`` to a whitelist of sections and fields.

    Args:
        row: A mapping-like record (a dict or a DataFrame row) that is read
            with ``row.get('json_data_used', {})``.

    Returns:
        dict: One entry per whitelisted section that is present and not
        ``None`` in ``json_data_used``. Each entry maps every whitelisted
        field of that section to its value, or to ``None`` when the section
        lacks that field. An empty dict is returned when ``json_data_used``
        is missing or is not a dict (for example ``None`` or NaN).
    """
    # Sections to keep, and within each section the fields to keep
    keys_to_keep = {
        "ED-Triage-prammed": ["temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp", "pain", "chiefcomplaint"],
        "ED-VitalSigns-prammed": ["temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp"],
        "ED-Diagnosis-prammed": ["icd_title"],
        "ED-Pyxis-prammed": ["name"],
        "ED-Medrecon-prammed": ["name"]
    }

    json_data_used = row.get('json_data_used', {})

    # The .get() default only covers an absent key. A value that is present
    # but null (None) or NaN cannot be searched with ``in`` and used to raise
    # TypeError, so treat any non-dict payload as "nothing to keep".
    if not isinstance(json_data_used, dict):
        return {}

    filtered_json = {}
    for key, fields in keys_to_keep.items():
        section = json_data_used.get(key)
        # Sections that are absent or explicitly None are left out entirely
        if section is not None:
            filtered_json[key] = {field: section.get(field, None) for field in fields}

    return filtered_json

In [None]:
# Replace each row's json_data_used with its filtered version (row-wise apply)
df['json_data_used'] = df.apply(modify_json_data, axis=1)

In [None]:
# Show the frame after json_data_used has been filtered
df

In [None]:

# Inspect the json_data_used value of the row labelled 0
df.loc[0, 'json_data_used']

In [None]:
import re
import pandas as pd

def clean_text(text):
    """Normalize text for matching: lower-case it and strip punctuation.

    Args:
        text: The string to normalize. Non-string values (for example
            ``None`` or NaN) are treated as empty text.

    Returns:
        str: ``text`` lower-cased, with every character that is neither a
        word character nor whitespace removed; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    return re.sub(r'[^\w\s]', '', text.lower())


def check_all_words_presence(row, fields):
    """Flag, per field, whether every word of the row's ICD title occurs in it.

    The title is read from
    ``row['json_data_used']['ED-Diagnosis-prammed']['icd_title']``. Title and
    field text are both normalized with ``clean_text``. Matching is by
    substring, not whole word: the title word ``pain`` also matches
    ``painful``.

    Args:
        row: A mapping-like record (a dict or a DataFrame row) holding
            ``'json_data_used'`` and the text fields named in ``fields``.
        fields: Names of the text fields to search.

    Returns:
        pd.Series: One integer entry ``"<field>_count"`` per field, 1 when
        all title words occur in that field's text and 0 otherwise. All
        entries are 0 when the title is missing, is not a string, or contains
        no words after normalization. A Series is returned on every path so
        that ``DataFrame.apply(..., axis=1)`` always yields a DataFrame.

    Raises:
        KeyError: If ``row`` has no ``'json_data_used'`` entry.
    """
    # Walk down to the title defensively: either level may be absent or None
    json_data = row['json_data_used']
    diagnosis = json_data.get('ED-Diagnosis-prammed') if isinstance(json_data, dict) else None
    icd_title = diagnosis.get('icd_title') if isinstance(diagnosis, dict) else None

    # str.split() drops empty tokens, so a blank or punctuation-only title
    # yields no words instead of [''] (which would match every text)
    words = clean_text(icd_title).split()
    if not words:
        return pd.Series({f"{field}_count": 0 for field in fields}, dtype='int64')

    results = {}
    for field in fields:
        text = clean_text(row.get(field, ''))
        # Substring containment test for every title word
        all_words_present = all(word in text for word in words)
        results[f"{field}_count"] = 1 if all_words_present else 0

    return pd.Series(results, dtype='int64')

# Dialogue columns to search, and the flag column produced for each of them
dialogue_fields = ['dialogue', 'final_dialogue', 'final_to_english']
count_columns = [f"{field}_count" for field in dialogue_fields]

# Compute the per-row '<field>_count' flags for every dialogue field
new_columns = df.apply(check_all_words_presence, axis=1, args=(dialogue_fields,))

# Drop flag columns left by an earlier run of this cell before attaching the
# fresh ones; otherwise re-running the cell would append duplicate '_count'
# columns and the sums below would no longer be scalars
df = pd.concat([df.drop(columns=count_columns, errors='ignore'), new_columns], axis=1)

# Print summary of hits for each field
for field in dialogue_fields:
    print(f"Total hits in '{field}_count': {df[f'{field}_count'].sum()}")


In [None]:
# Show the frame with the '_count' columns appended
df