### Downloading and merging all the CSV files stored in S3

In [None]:
import boto3
import pandas as pd
from io import StringIO

# Initialize S3 client
s3 = boto3.client('s3')

# S3 bucket and prefix/folder name
bucket_name = 'lambdasam-bc866238ef-us-east-1'
prefix = 'mimic-iii-clinical-database-demo-1.4/'  # Folder containing CSV files

# Local file name and S3 key of the merged result. `prefix` already ends with
# '/', so strip it before joining; concatenating it with '/combined_output/...'
# would put '//' in the key and create an empty-named level in the bucket.
output_csv = 'merged_output.csv'
output_key = f"{prefix.rstrip('/')}/combined_output/{output_csv}"

# List all CSV files in the S3 bucket under the specified prefix.
# list_objects_v2 returns at most 1000 keys per call, so walk every page.
# Anything under a combined_output/ level is a previous merge result and is
# skipped so that re-running the cell does not merge the output into itself.
paginator = s3.get_paginator('list_objects_v2')
csv_files = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for listed in page.get('Contents', []):
        key = listed['Key']
        if key.endswith('.csv') and '/combined_output/' not in key:
            csv_files.append(key)

# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies the growing frame on every iteration.
frames = []
seen_columns = set()

for csv_file in csv_files:
    # Read each CSV file from S3 into a dataframe
    obj = s3.get_object(Bucket=bucket_name, Key=csv_file)
    df = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))

    # Drop columns already contributed by an earlier file, except for 'row_id'
    duplicate_columns = [col for col in df.columns if col in seen_columns and col != 'row_id']
    df = df.drop(columns=duplicate_columns)

    seen_columns.update(df.columns)
    frames.append(df)

# Concatenate horizontally.
# NOTE(review): axis=1 lines rows up by their positional index, not by any
# key column, so row N of one table sits next to row N of every other table.
merged_df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

print("merged")

# Save the merged dataframe to a CSV file
merged_df.to_csv(output_csv, index=False)

# Upload the merged CSV back to S3
s3.upload_file(output_csv, bucket_name, output_key)

### Creating a filtered DataFrame with limited columns and rows with fewer than 10 NaN values

In [None]:
import pandas as pd

# A row is kept only when it has fewer than this many missing values among
# the selected columns.
MAX_MISSING_VALUES = 10

# Columns carried over from the merged table into the text export.
SELECTED_COLUMNS = [
    "admittime", "dischtime", "deathtime", "admission_type", "discharge_location", "language", "ethnicity",
    "diagnosis", "has_chartevents_data", "callout_service", "callout_status", "callout_outcome", "acknowledge_status",
    "label", "description", "value", "resultstatus", "costcenter", "cpt_suffix", "sectionheader", "subsectionheader",
    "drg_type", "drg_code", "drg_severity", "codesuffix", "short_title", "long_title", "abbreviation", "dbsource",
    "linksto", "param_type", "fluid", "dilution_text", "dilution_comparison", "interpretation", "text", "gender",
    "dob", "dod", "startdate", "enddate", "drug_type", "drug", "drug_name_poe", "drug_name_generic", "formulary_drug_cd",
    "prod_strength", "location",
]

merged_df = pd.read_csv("merged_output.csv")

# Indexing with a list raises KeyError if any listed column is absent from
# the merged file, which surfaces a schema mismatch immediately.
merged_df_selected = merged_df[SELECTED_COLUMNS]

# Count the NaN cells in each row and keep the sufficiently complete rows.
missing_per_row = merged_df_selected.isna().sum(axis=1)
filtered_df = merged_df_selected[missing_per_row < MAX_MISSING_VALUES]

# A bare `filtered_df.shape` in the middle of a cell is evaluated and thrown
# away; print it so the resulting size is actually reported.
print(f"Filtered DataFrame shape: {filtered_df.shape}")

filtered_df.to_csv("csv_to_text.csv", index=False)

### Saving rows from filtered DataFrame as txt files

In [None]:
import csv
import os

def convert_csv_rows_to_text(csv_file_path, output_directory):
    """Write every data row of a CSV file to its own text file.

    Each row becomes ``row_<n>.txt`` inside ``output_directory`` (numbered
    from 1 in file order) and contains one ``column: value`` line per column.

    Args:
        csv_file_path: Path of the UTF-8 CSV file to read. The first line is
            used as the header row.
        output_directory: Directory that receives the text files. It is
            created, including parents, when it does not exist.

    Returns:
        None. Progress is reported on stdout, one line per row written.
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten before the reader sees them.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file:
        reader = csv.DictReader(csv_file)

        # Number rows from 1 so the file names match the printed messages
        for row_number, row in enumerate(reader, start=1):
            # Combine the data in the row into a single string
            row_data = "\n".join(f"{key}: {value}" for key, value in row.items())

            # Define the output file path
            output_file_path = os.path.join(output_directory, f"row_{row_number}.txt")

            # Write the row data to a text file
            with open(output_file_path, mode='w', encoding='utf-8') as text_file:
                text_file.write(row_data)

            print(f"Row {row_number} written to {output_file_path}")

# Explode the filtered CSV into one text file per row under ./output_s3
csv_file_path, output_directory = 'csv_to_text.csv', 'output_s3'
convert_csv_rows_to_text(
    csv_file_path=csv_file_path,
    output_directory=output_directory,
)

### Uploading patient text files to S3 bucket for AWS Comprehend Medical

In [None]:
import boto3
import os

# Initialize S3 client
s3 = boto3.client('s3')

# Define the S3 bucket that receives the text files
bucket_name = 'lambdasam-bc866238ef-us-east-1'

# Local directory holding the text files to upload
directory = "output_s3"

for file_name in os.listdir(directory):
    # Match the real ".txt" extension; a bare "txt" suffix would also accept
    # names such as "report_txt" that merely end in those letters.
    if file_name.endswith(".txt"):
        path = os.path.join(directory, file_name)
        # Every file lands under the text_files/ prefix, keeping its local name
        s3.upload_file(path, bucket_name, f'text_files/{file_name}')
        print(file_name)
    else:
        print("not txt")

print("done")

### AWS Comprehend Medical

In [None]:
import boto3
import os

# Module-level AWS clients; process_files below reads both as globals.
s3 = boto3.client('s3')
comprehend_medical = boto3.client('comprehendmedical')

def process_files(input_bucket, output_bucket, input_prefix, output_prefix):
    """Run Comprehend Medical entity detection over the objects under an S3 prefix.

    Each object under ``input_prefix`` is downloaded, read as UTF-8 text and
    sent to ``detect_entities``. The detected entities are written one per
    line as ``<Type>: <Text> (Confidence: <Score>)`` to a file whose name is
    the input name with ``.txt`` replaced by ``_entities.txt``, and that file
    is uploaded to ``<output_prefix>/<name>`` in ``output_bucket``.

    Relies on the module-level ``s3`` and ``comprehend_medical`` clients.

    Args:
        input_bucket: Name of the bucket holding the text files.
        output_bucket: Name of the bucket that receives the entity files.
        input_prefix: Key prefix to list in ``input_bucket``.
        output_prefix: Key prefix for results; joined to the file name with
            ``/``, so it should not end with a slash.

    Returns:
        None. One progress line per processed object is printed.

    NOTE(review): AWS documents DetectEntities as deprecated in favour of
    DetectEntitiesV2; the call is left unchanged here because switching may
    alter the reported entity types. Confirm before migrating.
    """
    # Local import: the surrounding cell does not import tempfile.
    import tempfile

    # list_objects_v2 returns at most 1000 keys per call; the paginator
    # follows the continuation tokens so nothing past the first page is lost.
    paginator = s3.get_paginator('list_objects_v2')

    # Work in a throw-away directory instead of leaving files behind in /tmp.
    with tempfile.TemporaryDirectory() as work_dir:
        for page in paginator.paginate(Bucket=input_bucket, Prefix=input_prefix):
            for obj in page.get('Contents', []):
                file_key = obj['Key']
                file_name = os.path.basename(file_key)

                # Keys ending in '/' are folder placeholders: they have no
                # base name and cannot be downloaded to a file.
                if not file_name:
                    continue

                # Download the file
                local_file_path = os.path.join(work_dir, file_name)
                s3.download_file(input_bucket, file_key, local_file_path)

                # Read the file content; be explicit about UTF-8 rather than
                # depending on the platform's default encoding.
                with open(local_file_path, 'r', encoding='utf-8') as file:
                    text = file.read()

                # Use Comprehend Medical to detect entities. The API requires
                # non-empty text, so an empty file yields an empty result
                # instead of a request error.
                if text:
                    entities = comprehend_medical.detect_entities(Text=text)['Entities']
                else:
                    entities = []

                # Prepare the output content, one entity per line
                output_content = "".join(
                    f"{entity['Type']}: {entity['Text']} (Confidence: {entity['Score']})\n"
                    for entity in entities
                )

                # Save the output to a new file
                output_file_name = file_name.replace('.txt', '_entities.txt')
                output_file_path = os.path.join(work_dir, output_file_name)
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    output_file.write(output_content)

                # Upload the result to the output S3 directory
                s3.upload_file(output_file_path, output_bucket, f"{output_prefix}/{output_file_name}")
                print(f"Processed {file_key} and saved entities to {output_prefix}/{output_file_name}")

# Placeholder S3 locations: substitute real bucket names and prefixes before running
input_bucket, output_bucket = 'your-input-bucket-name', 'your-output-bucket-name'
input_prefix, output_prefix = 'input-directory/', 'output-directory'

# Extract entities from the objects listed under the input prefix
process_files(
    input_bucket=input_bucket,
    output_bucket=output_bucket,
    input_prefix=input_prefix,
    output_prefix=output_prefix,
)

### Downloading and formatting output from Comprehend Medical

In [None]:
import boto3
import os

# Initialize S3 client
s3 = boto3.client('s3')

# Define the S3 bucket and prefix (folder path)
bucket_name = 'lambdasam-bc866238ef-us-east-1'
bucket_prefix = 'extracted_entities'
directory = "output_cm"

# download_file does not create missing parent directories, so make sure the
# local target directory exists before the first download.
os.makedirs(directory, exist_ok=True)

# list_objects_v2 returns at most 1000 keys per call; page through them all.
paginator = s3.get_paginator('list_objects_v2')
out_file_count = 0

for page in paginator.paginate(Bucket=bucket_name, Prefix=bucket_prefix):
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.out'):
            continue
        out_file_count += 1

        # Download the .out file.
        # NOTE(review): only the base name is kept, so two keys with the same
        # file name in different "folders" overwrite each other locally.
        local_file_path = os.path.join(directory, os.path.basename(key))
        s3.download_file(bucket_name, key, local_file_path)

        # Rename <name>.txt.out to <name>.txt
        if local_file_path.endswith('.txt.out'):
            # Strip only the trailing '.out'; str.replace would also rewrite
            # any earlier '.txt.out' occurrence in the path.
            new_file_path = local_file_path[:-len('.out')]
            # os.replace overwrites an existing target on every platform,
            # so the cell can be re-run safely.
            os.replace(local_file_path, new_file_path)
            print(f'Converted: {local_file_path} -> {new_file_path}')

# Report when nothing matched, including the case where the prefix holds
# objects but none of them ends in .out.
if out_file_count == 0:
    print("No .out files found in the S3 bucket.")

#### References:
https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html

https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-s3-bucket.html

https://docs.aws.amazon.com/prescriptive-guidance/latest/patterns/successfully-import-an-s3-bucket-as-an-aws-cloudformation-stack.html

https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazoncloudfront.html

https://docs.aws.amazon.com/service-authorization/latest/reference/list_awscloudformation.html

https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazoncomprehendmedical.html

https://docs.aws.amazon.com/comprehend-medical/latest/dev/comprehendmedical-gettingstarted.html

https://docs.aws.amazon.com/service-authorization/latest/reference/list_awsidentityandaccessmanagementiam.html

https://docs.aws.amazon.com/bedrock/latest/userguide/getting-started.html

https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazonbedrock.html

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudformation.html

https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html