# Predicting Patients’ Attrition Percentage in Clinical Trials

---


Project 3 - Group 3B



---

### **Secondary Goal:** Geographically map clinical trial facilities to visualize how attrition rates differ between urban and rural areas

This notebook covers the steps for **Map Creation**

### Step 1. Create the Dataset to be used for Mapping

In [1]:
# Install necessary packages
!pip install pyzipcode
import html

import folium
import pandas as pd
import requests
from folium.plugins import MarkerCluster
from pyzipcode import ZipCodeDatabase


Collecting pyzipcode
  Downloading pyzipcode-3.0.1.tar.gz (1.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m60.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyzipcode
  Building wheel for pyzipcode (setup.py) ... [?25l[?25hdone
  Created wheel for pyzipcode: filename=pyzipcode-3.0.1-py3-none-any.whl size=1932161 sha256=e19858a06d6b61e2400bd17ba795406e958f12b9e9aea2763d9f13d02e455515
  Stored in directory: /root/.cache/pip/wheels/33/11/bb/75313afc9ef5609e3997d374a6f6c4d48932fdb8ed8fe2ea77
Successfully built pyzipcode
Installing collected packages: pyzipcode
Successfully installed pyzipcode-3.0.1


In [2]:
# Colab-only step: attach Google Drive so the project data folder can be read
import shutil
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that holds the project's input files.
# Kept as a plain string because later cells build paths by concatenation.
ct_attrition = "/content/drive/MyDrive/Georgetown University/HIDS/HIDS-6001/Project 3/Working Files/Data"

# Attrition table; later cells read its nct_id and dropout_percentage_all columns
attrition_csv = f"{ct_attrition}/ct_attrition_dataset.csv"
attrition_df = pd.read_csv(attrition_csv)

#### Get latitude, longitude and ZIP codes from the JSON returned by the ClinicalTrials.gov API

In [4]:
# Build one API URL per trial from the 'nct_id' column.
# Missing ids are dropped (string concatenation with NaN raises TypeError) and
# repeated ids are collapsed so each trial is requested only once.
nct_list = attrition_df['nct_id'].dropna().drop_duplicates()
url_series = ["https://clinicaltrials.gov/api/v2/studies/" + str(nct_id) for nct_id in nct_list]

In [5]:
# Seconds to wait on a single API call before giving up; without a timeout
# `requests` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Column layout of the facility table. Fixing it up front keeps the schema
# stable (and a later merge on 'nct_id' valid) even when no location is found.
LOCATION_COLUMNS = ['nct_id', 'facility_name', 'zip_code', 'latitude', 'longitude']


def extract_location_rows(nct_id, locations):
    """Build one record per geocoded facility of a single trial.

    Args:
        nct_id: Trial identifier copied onto every record.
        locations: List of location dicts taken from the study JSON
            (protocolSection -> contactsLocationsModule -> locations);
            may be None or empty.

    Returns:
        A list of dicts keyed by LOCATION_COLUMNS. Locations without both a
        latitude and a longitude are skipped.
    """
    rows = []
    for location in locations or []:
        # `or {}` also covers an explicit null geoPoint, not just a missing key
        geo_point = location.get('geoPoint') or {}
        latitude = geo_point.get('lat')
        longitude = geo_point.get('lon')
        if latitude is None or longitude is None:
            continue
        rows.append({
            'nct_id': nct_id,
            'facility_name': location.get('facility'),
            'zip_code': location.get('zip'),
            'latitude': latitude,
            'longitude': longitude,
        })
    return rows


# List to store location data
location_data = []

# Loop through the list of URLs and make requests.
# A single Session reuses the underlying connection across all calls.
with requests.Session() as session:
    for url in url_series:
        try:
            # Get the data from the API
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()  # Ensure we catch any HTTP errors
            jsonContent = response.json()
            protocol = jsonContent.get('protocolSection') or {}

            # Extract the nct_id and skip the study if it is absent
            nct_id = (protocol.get('identificationModule') or {}).get('nctId')
            if nct_id is None:
                print(f"nct_id not found for URL: {url}")
                continue

            # Extract the locations and keep those that carry coordinates
            locations = (protocol.get('contactsLocationsModule') or {}).get('locations')
            location_data.extend(extract_location_rows(nct_id, locations))
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred for {url}: {http_err}")
        except requests.exceptions.RequestException as req_err:
            print(f"Request exception occurred for {url}: {req_err}")
        except Exception as e:
            # Best-effort crawl: report the failing study and keep going
            print(f"An error occurred for {url}: {e}")


# Convert the collected location data into a DataFrame
location_df = pd.DataFrame(location_data, columns=LOCATION_COLUMNS)

# Output the resulting DataFrame
print(location_df)

            nct_id                                      facility_name  \
0      NCT00001596  National Institutes of Health Clinical Center,...   
1      NCT00001723  National Institutes of Health Clinical Center,...   
2      NCT00003869               North Central Cancer Treatment Group   
3      NCT00003901                           Providence Cancer Center   
4      NCT00003901                    Mobile Infirmary Medical Center   
...            ...                                                ...   
66390  NCT03099096                           GSK Investigational Site   
66391  NCT03099096                           GSK Investigational Site   
66392  NCT03099096                           GSK Investigational Site   
66393  NCT03099096                           GSK Investigational Site   
66394  NCT03099096                           GSK Investigational Site   

         zip_code  latitude  longitude  
0           20892  38.98067  -77.10026  
1           20892  38.98067  -77.10026  


In [6]:
# Attach facility locations to every trial; the left join keeps trials
# for which no location row was collected.
attritionFinal_df = attrition_df.merge(location_df, how='left', on='nct_id')

In [7]:
# Output the merged DataFrame: the result of left-joining location_df onto
# attrition_df by nct_id, so trials without a collected location keep NaN
# in the facility, zip and coordinate columns.
attritionFinal_df

Unnamed: 0,nct_id,dropout_percentage_all,facility_name,zip_code,latitude,longitude
0,NCT00001596,17.391304,"National Institutes of Health Clinical Center,...",20892,38.98067,-77.10026
1,NCT00001723,8.000000,"National Institutes of Health Clinical Center,...",20892,38.98067,-77.10026
2,NCT00003869,21.877891,North Central Cancer Treatment Group,55905,44.02163,-92.46990
3,NCT00003901,0.859599,Providence Cancer Center,36608,30.69436,-88.04305
4,NCT00003901,0.859599,Mobile Infirmary Medical Center,36640-0460,30.69436,-88.04305
...,...,...,...,...,...,...
66480,NCT03099096,0.628931,GSK Investigational Site,LE3 9QP,52.63860,-1.13169
66481,NCT03099096,0.628931,GSK Investigational Site,BD96RJ,53.79391,-1.75206
66482,NCT03099096,0.628931,GSK Investigational Site,OX37LE,51.75222,-1.25596
66483,NCT03099096,0.628931,GSK Investigational Site,PL6 8DH,50.37153,-4.14305


In [8]:
# Count missing values in each location column after the left merge
attritionFinal_df[['latitude', 'longitude', 'zip_code']].isna().sum()

Unnamed: 0,0
latitude,90
latitude,90
zip_code,19663


#### Get Location Type (Urban and Rural) from the RUCA dataset

In [9]:
# Read the RUCA workbook from the project data folder
ruca_path = f"{ct_attrition}/2006_Complete_Excel_RUCA_file.xls"
ruca_df = pd.read_excel(ruca_path)

# Preview the first rows to check the columns that were read
ruca_preview = ruca_df.head()
print(ruca_preview)

   ZIPA  ZIPN  RUCA2.0  COMMFLAG  STNAME
0     1     1     10.0         0  Alaska
1     2     2     10.0         0  Alaska
2     3     3     10.0         0  Alaska
3     4     4     10.0         0  Alaska
4     5     5     10.0         0  Alaska


In [10]:
def classify_location(ruca_code):
    """Map a RUCA code to its location-type label.

    Args:
        ruca_code: RUCA code to classify, e.g. 1.0 or 10.2.

    Returns:
        'Urban', 'Large Rural City/Town (micropolitan)', 'Small Rural Town'
        or 'Isolated Small Rural Town' when the code belongs to that group;
        'Unknown' for any other value (missing or unrecognised code).
    """
    # (label, codes) pairs, checked in order
    groups = (
        ('Urban',
         (1.0, 1.1, 2.0, 2.1, 3.0, 4.1, 5.1, 7.1, 8.1, 10.1)),
        ('Large Rural City/Town (micropolitan)',
         (4.0, 4.2, 5.0, 5.2, 6.0, 6.1)),
        ('Small Rural Town',
         (7.0, 7.2, 7.3, 7.4, 8.0, 8.2, 8.3, 8.4, 9.0, 9.1, 9.2)),
        ('Isolated Small Rural Town',
         (10.0, 10.2, 10.3, 10.4, 10.5, 10.6)),
    )

    for label, codes in groups:
        if ruca_code in codes:
            return label

    # No group contains the code
    return 'Unknown'

# Label every RUCA row with its location type
ruca_codes = ruca_df['RUCA2.0']
ruca_df['location_type'] = ruca_codes.map(classify_location)

# Preview the dataset with the new column
ruca_preview = ruca_df.head()
print(ruca_preview)

   ZIPA  ZIPN  RUCA2.0  COMMFLAG  STNAME              location_type
0     1     1     10.0         0  Alaska  Isolated Small Rural Town
1     2     2     10.0         0  Alaska  Isolated Small Rural Town
2     3     3     10.0         0  Alaska  Isolated Small Rural Town
3     4     4     10.0         0  Alaska  Isolated Small Rural Town
4     5     5     10.0         0  Alaska  Isolated Small Rural Town


#### Merge Data retrieved from API with data from RUCA to create Final Dataset

In [11]:
# Build a 5-digit ZIP join key from the facility postal codes.
# - Missing codes stay missing (a blanket astype(str) would turn NaN into the
#   literal 'nan', which zfill pads to '00nan').
# - ZIP+4 values such as '36640-0460' are reduced to their first five digits so
#   they can match the RUCA key.
# - Anything that is not a 1-5 digit code (optionally followed by -NNNN) gets
#   no key and therefore no location type.
# NOTE(review): the facility country is not collected from the API, so a
# numeric non-US postal code can still match a US ZIP - TODO filter by country.
raw_zip = attritionFinal_df['zip_code'].astype(object)
zip_text = raw_zip.where(raw_zip.isna(), raw_zip.astype(str).str.strip())
zip5 = zip_text.str.extract(r'^(\d{1,5})(?:-\d{4})?$', expand=False).str.zfill(5)

ruca_df['ZIPN'] = ruca_df['ZIPN'].astype(str).str.zfill(5)  # Ensure leading zeros

# One row per ZIP on the lookup side so the left merge cannot multiply rows
ruca_lookup = ruca_df[['ZIPN', 'location_type']].drop_duplicates(subset='ZIPN')

# Merge the clinical trials data with the RUCA dataset on the ZIP key; the
# original 'zip_code' values are left untouched for display.
merged_df = (
    attritionFinal_df
    .assign(_zip5=zip5)
    .merge(ruca_lookup, left_on='_zip5', right_on='ZIPN', how='left')
    .drop(columns='_zip5')
)

### Step 2. Create the Facility Locations Map

In [12]:
# Keep only the rows that have both coordinates
coord_cols = ['latitude', 'longitude']
valid_df = merged_df.dropna(subset=coord_cols)

# Centre a base map on the mean latitude/longitude of those rows.
# Note: `m` is assigned again by a later cell with a fixed centre.
map_center = [valid_df[col].mean() for col in coord_cols]
m = folium.Map(location=map_center, zoom_start=5)

In [13]:
# Create color categories based on dropout percentage
def get_attrition_color(dropout_percentage):
    if dropout_percentage < 20:
        return 'green'
    elif 20 <= dropout_percentage < 50:
        return 'orange'
    elif 50 <= dropout_percentage < 80:
        return 'red'
    else:
        return 'darkred'

In [14]:
# Build the map with a fixed centre and zoom level
MAP_CENTER = [37.0902, -95.7129]
m = folium.Map(location=MAP_CENTER, zoom_start=4)

# Attach a MarkerCluster layer to the map; markers are added to it later
marker_cluster = MarkerCluster()
marker_cluster.add_to(m)

In [15]:
def _popup_text(value, missing='Unknown'):
    """Return `value` as HTML-escaped text, or `missing` when it is null."""
    if pd.isna(value):
        return missing
    return html.escape(str(value))


# Add one marker per facility row, coloured by the trial's dropout percentage
for _, row in valid_df.iterrows():
    dropout_percentage = round(row['dropout_percentage_all'], 2)  # Round dropout percentage to 2 decimal places
    attrition_color = get_attrition_color(dropout_percentage)  # Get color based on dropout percentage

    # The popup string is rendered as HTML, so text that originates from the
    # API is escaped; missing values are shown as 'Unknown' rather than 'nan'.
    popup_info = (
        f"Trial ID: {_popup_text(row['nct_id'])}<br>"
        f"Facility Name: {_popup_text(row['facility_name'])}<br>"
        f"Dropout Percentage: {dropout_percentage}%<br>"
        f"Zip Code: {_popup_text(row['zip_code'])}<br>"
        f"Location Type: {_popup_text(row['location_type'])}"
    )

    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=folium.Popup(popup_info, max_width=300),
        icon=folium.Icon(color=attrition_color)
    ).add_to(marker_cluster)

### Step 3. Export the Map

In [16]:
# Write the map out as an HTML file; the relative name resolves against the
# current working directory.
map_file = "clinical_trials_map.html"
m.save(outfile=map_file)