In [None]:
import concurrent.futures
import os
import re

import openai
import pandas as pd

# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be pasted into (and saved with) the notebook. The original placeholder is
# kept as the fallback, so nothing changes when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_PERSONAL_API_KEY')

def extract_icd_code(text):
    """Pull ICD-10-looking codes out of free text.

    A code is one uppercase letter followed by two digits, optionally followed
    by a dot and one or two digits (for example ``I21`` or ``J18.9``).

    Args:
        text: Text to scan. ``None`` or an empty string is treated as
            containing no codes.

    Returns:
        The distinct codes in order of first appearance, joined with ", ",
        or the string "No ICD-10 code found" when nothing matches.
    """
    if not text:
        return "No ICD-10 code found"
    # The look-behind keeps a match from starting in the middle of a longer
    # alphanumeric token (e.g. the "D19" inside "COVID19").
    pattern = r"(?<![A-Za-z0-9])[A-Z]\d{2}(?:\.\d{1,2})?"
    # dict.fromkeys drops repeated mentions of a code while preserving order.
    codes = list(dict.fromkeys(re.findall(pattern, text)))
    return ", ".join(codes) if codes else "No ICD-10 code found"

def classify_cause_of_death(cause):
    """Ask the chat model for an ICD-10 code for each part of a cause-of-death string.

    The input is split on commas that are not inside parentheses. Each part is
    sent to the model in its own request, and the codes extracted from each
    reply are collected.

    Args:
        cause: Cause-of-death text, possibly listing several comma-separated
            causes.

    Returns:
        A tuple ``(cause, codes)`` where ``cause`` is the original, unsplit
        input and ``codes`` is the per-part output of ``extract_icd_code``
        joined with ", ".
    """
    # Split the string on commas that are not inside parentheses
    # (raw string so the regex escapes are not treated as string escapes).
    parts = re.split(r',\s*(?![^()]*\))', cause)
    icd_codes = []
    # The loop variable is deliberately NOT named `cause`: reusing that name
    # would overwrite the parameter and make the function return the last
    # fragment instead of the text the caller passed in.
    for part in parts:
        response = openai.ChatCompletion.create(
          model="gpt-3.5-turbo",
          messages=[
                {"role": "system", "content": "You are a knowledgeable model trained to classify causes of death according to ICD-10 categories."},
                {"role": "user", "content": f"Provide the ICD-10 code for the following cause of death: {part.strip()}."}
            ]
        )
        icd_code = extract_icd_code(response['choices'][0]['message']['content'])
        icd_codes.append(icd_code)
    return cause, ", ".join(icd_codes)


from tqdm.notebook import tqdm
import concurrent.futures

def process_batches(causes_of_death, batch_size, classification_dict):
    """Classify every cause of death concurrently and collect the results.

    Each cause is submitted to a thread pool running
    ``classify_cause_of_death``; results are stored as the futures complete
    while a progress bar tracks how many have finished.

    Args:
        causes_of_death: Sequence of cause-of-death strings to classify.
        batch_size: Accepted for backward compatibility; this implementation
            does not use it (all causes are submitted at once).
        classification_dict: Dict updated in place, mapping each submitted
            cause to the ICD-10 code string returned for it.

    Returns:
        None. A cause whose classification raises is reported with ``print``
        and left out of ``classification_dict``.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to the cause it was created for
        future_to_cause = {executor.submit(classify_cause_of_death, cause): cause for cause in causes_of_death}

        # Using the bar as a context manager closes it even when the loop is
        # interrupted (e.g. by KeyboardInterrupt).
        with tqdm(total=len(causes_of_death)) as pbar:
            for future in concurrent.futures.as_completed(future_to_cause):
                cause = future_to_cause[future]
                try:
                    # Key the result by the cause that was submitted rather
                    # than by the text the worker hands back, so the entry
                    # always matches the input.
                    _returned_cause, icd_code = future.result()
                    classification_dict[cause] = icd_code
                    print(f"Processed cause: {cause}, ICD-10 code: {icd_code}")
                except Exception as exc:
                    print('%r generated an exception: %s' % (cause, exc))

                # Count the item as done whether it succeeded or failed
                pbar.update(1)

# Results accumulate here: cause-of-death text -> ICD-10 code string.
classification_dict = dict()

# Every entry of the 'deathcauses' column, as a plain Python list.
causes_of_death = finaldata['deathcauses'].tolist()

# Classify them all; process_batches fills classification_dict in place.
process_batches(
    causes_of_death,
    batch_size=5,
    classification_dict=classification_dict,
)

In [None]:
import csv
# The csv module requires files to be opened with newline='' so that it can
# control line endings itself; without it, extra blank rows appear on
# platforms that translate "\n" on write.
with open('classification_dict.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # One row per classified cause: [cause text, ICD-10 code string].
    writer.writerows(classification_dict.items())

In [None]:

# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be pasted into (and saved with) the notebook. The original placeholder is
# kept as the fallback, so nothing changes when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_PERSONAL_API_KEY')

def extract_icd_code(text):
    """Pull ICD-10-looking codes out of free text.

    A code is one uppercase letter followed by two digits, optionally followed
    by a dot and one or two digits (for example ``I21`` or ``J18.9``).

    Args:
        text: Text to scan. ``None`` or an empty string is treated as
            containing no codes.

    Returns:
        The distinct codes in order of first appearance, joined with ", ",
        or the string "No ICD-10 code found" when nothing matches.
    """
    if not text:
        return "No ICD-10 code found"
    # The look-behind keeps a match from starting in the middle of a longer
    # alphanumeric token (e.g. the "D19" inside "COVID19").
    pattern = r"(?<![A-Za-z0-9])[A-Z]\d{2}(?:\.\d{1,2})?"
    # dict.fromkeys drops repeated mentions of a code while preserving order.
    codes = list(dict.fromkeys(re.findall(pattern, text)))
    return ", ".join(codes) if codes else "No ICD-10 code found"

def classify_cause_of_death(cause):
    """Request an ICD-10 code for one cause-of-death string, treated as a whole.

    The text is sent to the chat model in a single request (no splitting on
    commas) and the reply is scanned with ``extract_icd_code``.

    Args:
        cause: Cause-of-death text; surrounding whitespace is stripped only
            for the prompt.

    Returns:
        A tuple ``(cause, icd_code)`` with the input exactly as received and
        the string produced by ``extract_icd_code`` for the model's reply.
    """
    system_message = {
        "role": "system",
        "content": "You are a knowledgeable model trained to classify causes of death according to ICD-10 categories.",
    }
    user_message = {
        "role": "user",
        "content": f"Provide the ICD-10 code for the following cause of death: {cause.strip()}.",
    }

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )

    # Only the text of the first choice is inspected for codes.
    reply_text = completion['choices'][0]['message']['content']
    return cause, extract_icd_code(reply_text)


def process_batches(causes_of_death, batch_size, classification_dict):
    """Classify every cause of death concurrently and collect the results.

    Each cause is submitted to a thread pool running
    ``classify_cause_of_death``; results are stored as the futures complete
    while a progress bar tracks how many have finished.

    Args:
        causes_of_death: Sequence of cause-of-death strings to classify.
        batch_size: Accepted for backward compatibility; this implementation
            does not use it (all causes are submitted at once).
        classification_dict: Dict updated in place, mapping each submitted
            cause to the ICD-10 code string returned for it.

    Returns:
        None. A cause whose classification raises is reported with ``print``
        and left out of ``classification_dict``.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to the cause it was created for
        future_to_cause = {executor.submit(classify_cause_of_death, cause): cause for cause in causes_of_death}

        # Using the bar as a context manager closes it even when the loop is
        # interrupted (e.g. by KeyboardInterrupt).
        with tqdm(total=len(causes_of_death)) as pbar:
            for future in concurrent.futures.as_completed(future_to_cause):
                cause = future_to_cause[future]
                try:
                    # Key the result by the cause that was submitted rather
                    # than by the text the worker hands back, so the entry
                    # always matches the input.
                    _returned_cause, icd_code = future.result()
                    classification_dict[cause] = icd_code
                    print(f"Processed cause: {cause}, ICD-10 code: {icd_code}")
                except Exception as exc:
                    print('%r generated an exception: %s' % (cause, exc))

                # Count the item as done whether it succeeded or failed
                pbar.update(1)

# Work on a copy so that finaldata2 itself is left as it is.
finaldata3 = finaldata2.copy()

# Keep only the rows that still lack a usable code: the "ICD-10 Code" cell is
# either NaN or the literal string 'nonefound'.
missing_data = finaldata3.loc[
    finaldata3['ICD-10 Code'].isna() | finaldata3['ICD-10 Code'].eq('nonefound')
]

# Cause-of-death text for exactly those rows, as a plain list.
causes_of_death_missing = missing_data['deathcauses'].tolist()

# Fresh dict for this pass; process_batches fills it in place.
missing_classification_dict = dict()

process_batches(
    causes_of_death_missing,
    batch_size=5,
    classification_dict=missing_classification_dict,
)


In [None]:
import pandas as pd

# Read the .txt file; each usable line has the form "<cause of death>: <ICD-10 code>"
with open('ndoc3.txt', 'r') as file:
    lines = file.readlines()

# Initialize lists to store cause of death and ICD-10 code
cause_of_death_list = []
icd_10_code_list = []

# Process each line and extract cause of death and ICD-10 code
for line in lines:
    if ':' in line:
        # Split on the FIRST colon only: the text before it is the cause and
        # everything after it is kept as the code. Splitting on every colon
        # and taking only the second piece silently discarded anything that
        # followed a later colon.
        cause_of_death, separator, icd_10_code = line.strip().partition(':')
        cause_of_death = cause_of_death.strip()
        icd_10_code = icd_10_code.strip()
        cause_of_death_list.append(cause_of_death)
        icd_10_code_list.append(icd_10_code)
    else:
        # Lines without any colon (including blank lines) are reported, not parsed.
        print(f"Line does not follow the expected format: {line}")

# Create a DataFrame with one row per parsed line
data = {
    'Cause of Death': cause_of_death_list,
    'ICD-10 Code': icd_10_code_list
}
classdf1 = pd.DataFrame(data)

# Display the DataFrame
print(classdf1)