Found data under 'OTHER' in 'Interventions' have been largely misclassified. Adjust manually.

In [1]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('/Users/ellia.hsieh/Documents/GitHub/strategic-partner-identification/data/02-processed/processed-sleep-apnea.csv')

In [5]:
intv_types_exploded = df['Intervention Types'].str.split(', ').explode()
intv_types_counts = intv_types_exploded.value_counts(ascending=False)

print(intv_types_counts) # clinical trials numbers for each interverence

Intervention Types
DEVICE                 919
DRUG                   309
OTHER                  246
BEHAVIORAL             222
PROCEDURE              168
DIAGNOSTIC_TEST         88
DIETARY_SUPPLEMENT      12
COMBINATION_PRODUCT      6
BIOLOGICAL               3
RADIATION                1
GENETIC                  1
Name: count, dtype: int64


In [6]:
# See what's classified as 'OTHER'

# Set display options to show all rows

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full column width

df[df['Intervention Types'].str.contains('OTHER')]['Interventions'].str.extract(r'OTHER:\s*([^|]+)')

Unnamed: 0,0
6,placebo
12,Placebo
34,Standard care for OSA
42,Resistance training
45,Training
47,Sound stimulation
50,Standard of Care
55,Administration of gas mixtures
56,Induction of flow limitation
57,Induced central apneas


In [7]:
# Step 1: Create a mapping dictionary with correct categories and their associated terms

mapping_dict = {
    'DEVICE': ['CPAP', 'PAP', 'Nasal dilator', 'full face masks'],
    'BEHAVIORAL': ['exercise', 'lifestyle', 'exercises', 'activity', 'positioning',
                   'diet', 'yoga', 'Mouth closure', 'positioning'],
    'PROCEDURE': ['SPA treatment'],
    'DIAGNOSTIC_TEST': ['Polygraphy', 'Apneascan TM', 'Polysomnography', 'PSG', 'monitor',
                        'monitoring', 'polysomnogram', 'Sleep study', 'diagnostic test'],
}

# Step 2: Replace 'OTHER' entries in the 'Interventions' column (case-insensitive)
def replace_other_entries(intervention):
    if pd.isna(intervention):  # Check for NaN
        return intervention  # Return NaN unchanged
    parts = intervention.split('|')  # Split by '|'
    for i, part in enumerate(parts):
        if 'OTHER:' in part:
            # Get the content after 'OTHER:'
            content = part.split('OTHER: ')[1].strip().lower()  # Convert content to lower case
            # Check against mapping_dict
            for category, terms in mapping_dict.items():
                for term in terms:
                    if term.lower() in content:  # Check if term (in lower case) is in the content
                        # Replace 'OTHER:' with the correct category
                        parts[i] = part.replace('OTHER:', f'{category}:')
                        break  # Exit the loop once a match is found
    return '|'.join(parts)  # Join back the parts

# Apply the replacement function
df['Interventions'] = df['Interventions'].apply(replace_other_entries)

In [8]:
# Extract intervention types with the pattern

import re

def extract_intervention_types(interventions_str):
    matches = re.findall(r'(\w+):', interventions_str)
    
    return list(set(matches)) # remove duplicates and return as a list

df['Intervention Types'] = df['Interventions'].apply(lambda intvs: extract_intervention_types(intvs))

In [9]:
# Convert the list of intervention types in each row to a single string

df['Intervention Types'] = df['Intervention Types'].apply(lambda intv_types: ', '.join(intv_types) if intv_types else '')
df['Intervention Types'].unique()

array(['BEHAVIORAL', 'DIETARY_SUPPLEMENT', 'DEVICE', 'DIAGNOSTIC_TEST',
       'DRUG', 'DIETARY_SUPPLEMENT, OTHER', 'DEVICE, DRUG',
       'DEVICE, PROCEDURE', 'DEVICE, OTHER', 'PROCEDURE',
       'DEVICE, Device', 'OTHER, BEHAVIORAL', 'OTHER',
       'DEVICE, OTHER, DRUG', 'DIAGNOSTIC_TEST, OTHER',
       'PROCEDURE, BEHAVIORAL', 'DEVICE, DIETARY_SUPPLEMENT',
       'OTHER, DRUG', 'DIETARY_SUPPLEMENT, PROCEDURE',
       'DEVICE, DRUG, BEHAVIORAL', 'DRUG, PROCEDURE, BEHAVIORAL',
       'DEVICE, BEHAVIORAL', 'DEVICE, DIAGNOSTIC_TEST', 'PROCEDURE, DRUG',
       'COMBINATION_PRODUCT',
       'DEVICE, COMBINATION_PRODUCT, PROCEDURE, BEHAVIORAL',
       'DEVICE, COMBINATION_PRODUCT, DRUG', 'COMBINATION_PRODUCT, OTHER',
       'DIAGNOSTIC_TEST, DRUG',
       'COMBINATION_PRODUCT, PROCEDURE, BEHAVIORAL', '1, DRUG',
       'DEVICE, DIETARY_SUPPLEMENT, OTHER, DRUG', 'BEHAVIORAL, DRUG',
       'PROCEDURE, OTHER', 'DEVICE, DIAGNOSTIC_TEST, OTHER',
       'DEVICE, DIAGNOSTIC_TEST, PROCEDURE', 'DEV

In [21]:
# Clean data with strange format

df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'Device, DEVICE' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'DEVICE, Device' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DRUG' if x == '1, DRUG' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DRUG' if x == 'DRUG, 1' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'F, A, DEVICE, D, C, H, B, G, E' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'F, A, G, B, E, D, DEVICE, H, C' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'DEVICE, D, C, E, A, F, H, G, B' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'G, DEVICE, H, A, D, E, B, F, C' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'A, C, H, DEVICE, E, F, G, B, D' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'OTHER, DEVICE' if x == 'OTHER, devices' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'USA, DEVICE, Europe' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DRUG' if x == 'Comparator, DRUG' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'Europe, DEVICE, USA' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'Europe, USA, DEVICE' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DEVICE' if x == 'DEVICE, Europe, USA' else x)
df['Intervention Types'] = df['Intervention Types'].apply(lambda x: 'DIAGNOSTIC_TEST, DEVICE' if x == 'DIAGNOSTIC_TEST, devices' else x)

In [22]:
df['Intervention Types'].unique()

array(['BEHAVIORAL', 'DIETARY_SUPPLEMENT', 'DEVICE', 'DIAGNOSTIC_TEST',
       'DRUG', 'DIETARY_SUPPLEMENT, OTHER', 'DEVICE, DRUG',
       'DEVICE, PROCEDURE', 'DEVICE, OTHER', 'PROCEDURE',
       'OTHER, BEHAVIORAL', 'OTHER', 'DEVICE, OTHER, DRUG',
       'DIAGNOSTIC_TEST, OTHER', 'PROCEDURE, BEHAVIORAL',
       'DEVICE, DIETARY_SUPPLEMENT', 'OTHER, DRUG',
       'DIETARY_SUPPLEMENT, PROCEDURE', 'DEVICE, DRUG, BEHAVIORAL',
       'DRUG, PROCEDURE, BEHAVIORAL', 'DEVICE, BEHAVIORAL',
       'DEVICE, DIAGNOSTIC_TEST', 'PROCEDURE, DRUG',
       'COMBINATION_PRODUCT',
       'DEVICE, COMBINATION_PRODUCT, PROCEDURE, BEHAVIORAL',
       'DEVICE, COMBINATION_PRODUCT, DRUG', 'COMBINATION_PRODUCT, OTHER',
       'DIAGNOSTIC_TEST, DRUG',
       'COMBINATION_PRODUCT, PROCEDURE, BEHAVIORAL',
       'DEVICE, DIETARY_SUPPLEMENT, OTHER, DRUG', 'BEHAVIORAL, DRUG',
       'PROCEDURE, OTHER', 'DEVICE, DIAGNOSTIC_TEST, OTHER',
       'DEVICE, DIAGNOSTIC_TEST, PROCEDURE', 'DEVICE, OTHER, BEHAVIORAL',
    

In [23]:
intv_types_exploded = df['Intervention Types'].str.split(', ').explode()
intv_types_counts = intv_types_exploded.value_counts(ascending=False)

print(intv_types_counts) # clinical trials numbers for each interverence

Intervention Types
DEVICE                 919
DRUG                   309
OTHER                  246
BEHAVIORAL             222
PROCEDURE              168
DIAGNOSTIC_TEST         88
DIETARY_SUPPLEMENT      12
COMBINATION_PRODUCT      6
BIOLOGICAL               3
RADIATION                1
GENETIC                  1
Name: count, dtype: int64


In [25]:
# import os

# path = os.path.join('/Users/ellia.hsieh/Documents/GitHub/strategic-partner-identification/data/02-processed', 'processed-sleep-apnea-1.csv')
# df.to_csv(path, index=False)