<a href="https://colab.research.google.com/github/gcosma/ACO-FS/blob/master/DECODEPaperFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tool for analysing condition progression in patients with Multiple Long Term Conditions and Intellectual Disabilities

In [None]:
#@title Step 1: You need to make some installations. This is very simple. On the next line you will see a little triangle that's in a little black circle on the left of this box. Click on it. When the triangle turns to a green tick you will see a message that says "Done! Now click on Step 2".
# Install necessary packages
!pip -q install pandas numpy networkx matplotlib pyvis ipywidgets ipython

# Display a message after installation
print("Done! Now click on Step 2.")

Done! Now click on Step 2.


In [None]:
#@title Step 2: The arrow on the left just before the start of this sentence hides and shows the code. If you click it accidentally, just click it again and the code will be hidden. Below this sentence there is another arrow in a circle. To execute the code, click on the arrow that's in the circle.
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import colorsys
from pyvis.network import Network
from itertools import combinations
import os
from google.colab import files
from IPython.display import HTML, display
import base64
import io
import colorsys

# Module-level state shared by the functions below.
# initialize_data() fills in the dataset, group size, gender and age group;
# analyze_condition_combinations() stores its latest result in results_df.
global_data = None
total_patients_in_group = 0
gender = ''
age_group = ''
results_df = None

# Lookup table mapping each condition name to its category.
# The category decides the node color in the network graph and the entries
# shown in the legend; conditions missing from this table are labelled
# "Other" when the graph is drawn.
condition_categories = {
    "Anaemia": "Blood",
    "Cardiac Arrhythmias": "Circulatory",
    "Coronary Heart Disease": "Circulatory",
    "Heart Failure": "Circulatory",
    "Hypertension": "Circulatory",
    "Peripheral Vascular Disease": "Circulatory",
    "Barretts Oesophagus": "Digestive",
    "Chronic Constipation": "Digestive",
    "Chronic Diarrhoea": "Digestive",
    "Cirrhosis": "Digestive",
    "Dysphagia": "Digestive",
    "Inflammatory Bowel Disease": "Digestive",
    "Reflux Disorders": "Digestive",
    "Hearing Loss": "Ear",
    "Addisons Disease": "Endocrine",
    "Diabetes": "Endocrine",
    "Polycystic Ovary Syndrome": "Endocrine",
    "Thyroid Disorders": "Endocrine",
    "Visual Impairment": "Eye",
    "Chronic Kidney Disease": "Genitourinary",
    "Menopausal And Perimenopausal": "Genitourinary",
    "Dementia": "Mental",
    "Mental Illness": "Mental",
    "Tourette": "Mental",
    "Chronic Arthritis": "Musculoskeletal",
    "Chronic Pain Conditions": "Musculoskeletal",
    "Osteoporosis": "Musculoskeletal",
    "Cancer": "Neoplasms",
    "Cerebral Palsy": "Nervous",
    "Epilepsy": "Nervous",
    "Insomnia": "Nervous",
    "Multiple Sclerosis": "Nervous",
    "Neuropathic Pain": "Nervous",
    "Parkinsons": "Nervous",
    "Stroke": "Nervous",
    "Bronchiectasis": "Respiratory",
    "Chronic Airway Diseases": "Respiratory",
    "Chronic Pneumonia": "Respiratory",
    "Interstitial Lung Disease": "Respiratory",
    "Psoriasis": "Skin"
}

# Category -> hex color mapping; declared here as a placeholder and assigned
# further down once the color helpers have been defined.
category_colors = None

def load_file():
    """Prompt for a CSV upload in Colab and read it into a DataFrame.

    The gender and age group are inferred from markers in the uploaded
    file's name, and the group size is taken from the first row of the
    'TotalPatientsInGroup' column.

    Returns:
        A tuple (data, total_patients, gender, age_group). All four items are
        None when nothing was uploaded or the file could not be processed.
    """
    nothing_loaded = (None, None, None, None)

    print("Please upload your CSV file.")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded. Exiting.")
        return nothing_loaded

    # Only the first uploaded file is used.
    file_name = next(iter(uploaded))
    raw_bytes = uploaded[file_name]

    # Filename markers, checked in order. 'females' must be tested before
    # 'males' because the former contains the latter as a substring.
    gender_markers = (('females', 'Female'), ('males', 'Male'))
    age_markers = (('below45', '<45'), ('45to64', '45-64'), ('65plus', '65+'))

    try:
        data = pd.read_csv(io.BytesIO(raw_bytes))
        total_patients = data['TotalPatientsInGroup'].iloc[0]

        print(f"Uploaded file: {file_name}")

        # Match markers case-insensitively.
        lowered_name = file_name.lower()

        gender = next(
            (label for marker, label in gender_markers if marker in lowered_name),
            'Unknown Gender',
        )
        age_group = next(
            (label for marker, label in age_markers if marker in lowered_name),
            'Unknown Age Group',
        )

        print(f"File loaded successfully. Total patients: {total_patients}")
        print(f"Gender: {gender}, Age Group: {age_group}")
    except Exception as e:
        print(f"Error loading file: {str(e)}")
        return nothing_loaded

    return data, total_patients, gender, age_group

def initialize_data(data, total_patients, gender_info, age_group_info):
    """Store the loaded dataset and its metadata in the module-level globals.

    Args:
        data: The dataset to analyse; stored as global_data.
        total_patients: Size of the patient group; stored as
            total_patients_in_group.
        gender_info: Gender label; stored as gender.
        age_group_info: Age-group label; stored as age_group.

    Also rebuilds category_colors from condition_categories.
    """
    global global_data, total_patients_in_group, gender, age_group, category_colors
    global_data, total_patients_in_group = data, total_patients
    gender, age_group = gender_info, age_group_info
    category_colors = assign_colors_to_categories(condition_categories)

def create_graph(data, min_percentage, min_frequency):
    """Build a directed graph from the condition pairs that pass both thresholds.

    Args:
        data: DataFrame with the columns 'ConditionA', 'ConditionB',
            'Percentage', 'PairFrequency', 'MedianDurationYears' and
            'OddsRatio'.
        min_percentage: Rows with a lower 'Percentage' are skipped.
        min_frequency: Rows with a lower 'PairFrequency' are skipped.

    Returns:
        A networkx DiGraph with one ConditionA -> ConditionB edge per kept
        row, carrying weight, odds_ratio, pair_frequency and percentage.
    """
    graph = nx.DiGraph()
    for _, record in data.iterrows():
        passes_thresholds = (
            record['Percentage'] >= min_percentage
            and record['PairFrequency'] >= min_frequency
        )
        if not passes_thresholds:
            continue

        edge_attributes = {
            'weight': record['MedianDurationYears'],
            'odds_ratio': record['OddsRatio'],
            'pair_frequency': record['PairFrequency'],
            'percentage': record['Percentage'],
        }
        graph.add_edge(record['ConditionA'], record['ConditionB'], **edge_attributes)
    return graph

def generate_colorblind_friendly_colors(n):
    """
    Generate a colorblind-friendly color palette for the network graph.

    The first ten colors are a fixed base palette. Further colors are derived
    generation by generation: each generation lightens and desaturates the
    previous one. Deriving from the previous generation (instead of always
    from the base palette) is what lets the loop make progress beyond twenty
    colors.

    Args:
        n: Number of colors requested.

    Returns:
        A list of exactly ``n`` hex color strings ("#rrggbb"); an empty list
        when ``n`` is zero or negative. If the derivation runs out of new
        colors (everything has faded to white), the palette is padded by
        repeating earlier colors so that the length is still ``n``.
    """
    base_colors = [
        "#1f77b4",  # Muted blue
        "#ff7f0e",  # Safety orange
        "#2ca02c",  # Cooked asparagus green
        "#d62728",  # Brick red
        "#9467bd",  # Muted purple
        "#8c564b",  # Chestnut brown
        "#e377c2",  # Raspberry yogurt pink
        "#7f7f7f",  # Middle gray
        "#bcbd22",  # Curry yellow-green
        "#17becf"   # Blue-teal
    ]

    if n <= len(base_colors):
        # max() stops a negative n from slicing relative to the end of the list.
        return base_colors[:max(n, 0)]

    colors = list(base_colors)
    seen = set(colors)
    previous_generation = base_colors

    while len(colors) < n:
        next_generation = []
        added_new_color = False

        for source_color in previous_generation:
            if len(colors) >= n:
                break
            red, green, blue = (channel / 255 for channel in bytes.fromhex(source_color[1:]))
            hue, lightness, saturation = colorsys.rgb_to_hls(red, green, blue)
            lightness = min(1, lightness * 1.2)    # Make it slightly lighter
            saturation = max(0, saturation * 0.8)  # Reduce saturation slightly
            rgb = colorsys.hls_to_rgb(hue, lightness, saturation)
            new_color = "#{:02x}{:02x}{:02x}".format(*(int(channel * 255) for channel in rgb))
            next_generation.append(new_color)
            if new_color not in seen:
                seen.add(new_color)
                colors.append(new_color)
                added_new_color = True

        if not added_new_color:
            # No further distinct colors can be derived; repeat existing ones
            # rather than spinning forever.
            palette = list(colors)
            shortfall = n - len(colors)
            colors.extend(palette[i % len(palette)] for i in range(shortfall))
            break

        previous_generation = next_generation

    return colors[:n]

def assign_colors_to_categories(categories):
    """Assign one palette color to each distinct category.

    Categories are sorted before colors are handed out so that a given
    category receives the same color on every run; iterating a bare set of
    strings can yield a different order from one interpreter session to the
    next.

    Args:
        categories: Mapping whose values are category names (for example
            condition name -> category).

    Returns:
        A dict mapping each distinct category name to a hex color string.
    """
    unique_categories = sorted(set(categories.values()))
    colors = generate_colorblind_friendly_colors(len(unique_categories))
    return dict(zip(unique_categories, colors))

# Build the default category -> color mapping as soon as the helpers exist, so
# the graph and legend functions can look colors up without further setup.
# initialize_data() recomputes this mapping when a dataset is loaded.
category_colors = assign_colors_to_categories(condition_categories)


def fixed_edge_color():
    """Return the single hex color (dark green) used for every graph edge."""
    dark_green = '#006400'
    return dark_green

def create_pyvis_network(G, height="800px"):
    """Convert a condition graph into a styled, interactive pyvis network.

    Args:
        G: Graph whose edges carry 'odds_ratio', 'weight', 'pair_frequency'
            and 'percentage' attributes.
        height: CSS height passed to the pyvis Network.

    Returns:
        The configured pyvis Network. Nodes are colored by category and edge
        widths are scaled linearly between 1 and 5 by pair frequency.
    """
    net = Network(
        notebook=True,
        bgcolor='#f0f0f0',
        font_color='black',
        cdn_resources='in_line',
        height=height,
        width="90%",
    )

    # Nodes: fixed size, colored by category (light gray when unknown).
    for condition in G.nodes:
        system = condition_categories.get(condition, "Other")
        net.add_node(
            condition,
            label=condition,
            title=f"{condition}\nCategory: {system}",
            size=30,
            color=category_colors.get(system, "#CCCCCC"),
        )

    # Range of pair frequencies, used to scale the edge widths.
    frequencies = [attrs['pair_frequency'] for _, _, attrs in G.edges(data=True)]
    if frequencies:
        lowest, highest = min(frequencies), max(frequencies)
    else:
        lowest, highest = 0, 1
    spread = highest - lowest

    # Edges: labelled with odds ratio and median duration.
    for source, target, attrs in G.edges(data=True):
        short_label = f"OR {attrs['odds_ratio']:.1f}\n({attrs['weight']:.1f}y)"
        tooltip = (
            f"Total Odds Ratio: {attrs['odds_ratio']:.2f}\n"
            f"Median duration: {attrs['weight']:.2f} years\n"
            f"Frequency: {attrs['pair_frequency']}\n"
            f"Prevalence: {attrs['percentage']:.2f}%"
        )
        if spread == 0:
            # All frequencies are equal: fall back to the default width.
            width = 1
        else:
            width = 1 + 4 * (attrs['pair_frequency'] - lowest) / spread

        net.add_edge(
            source,
            target,
            label=short_label,
            title=tooltip,
            value=width,
            color=fixed_edge_color(),
            arrows='to',
            font={'size': 12},
        )

    # Smooth edges and expose the physics controls in the rendered page.
    net.set_edge_smooth('dynamic')
    net.show_buttons(filter_=['physics'])

    # Strong repulsion and long springs give a more spread-out layout.
    net.barnes_hut(gravity=-3000, central_gravity=0.3, spring_length=200)

    return net

def generate_caption(data=None):
    """Build the HTML summary shown above the network graph.

    Args:
        data: DataFrame of the condition pairs actually drawn, with
            'OddsRatio' and 'Percentage' columns. Defaults to the full
            global_data when omitted, which is the previous behaviour.

    Returns:
        An HTML fragment listing gender, age group, group size, odds-ratio
        range, minimum prevalence, number of pairs and sum of odds ratios.
    """
    if data is None:
        data = global_data

    min_edge_weight = data['OddsRatio'].min()
    max_edge_weight = data['OddsRatio'].max()
    min_percentage = data['Percentage'].min()
    # Convert the lowest prevalence back into a whole number of patients.
    min_patients = int(min_percentage * total_patients_in_group / 100)
    total_weight = data['OddsRatio'].sum()

    caption = f"""
    <h3>Condition Progression Network</h3>
    <ul>
        <li>Gender: {gender}</li>
        <li>Age Group: {age_group}</li>
        <li>Total Patients with Diagnoses in this Group: {total_patients_in_group}</li>
        <li>Odds Ratio Range: [{min_edge_weight:.2f}, {max_edge_weight:.2f}]</li>
        <li>Minimum Prevalence Threshold: {min_percentage:.2f}% ({min_patients} patients)</li>
        <li>Number of Condition Pairs Shown: {len(data)}</li>
        <li>Total Sum of Odds Ratios: {total_weight:.2f}</li>
    </ul>
    """
    return caption

def update_graph(min_percentage, min_frequency):
    """Filter the loaded data, then render, display and download the network graph.

    Args:
        min_percentage: Minimum 'Percentage' a pair needs to be drawn.
        min_frequency: Minimum 'PairFrequency' a pair needs to be drawn.

    Prints a message and returns early when no data is loaded or when no pair
    passes the filters. Errors raised while rendering are reported on stdout
    rather than propagated.
    """
    if global_data is None or global_data.empty:
        print("No data available. Please load a valid CSV file.")
        return

    print("Updating graph...")

    print(f"Min Percentage: {min_percentage}, Min Frequency: {min_frequency}")

    # Filter the data based on the input values
    filtered_data = global_data[(global_data['Percentage'] >= min_percentage) &
                                (global_data['PairFrequency'] >= min_frequency)]

    # Create graph with filtered data
    G = create_graph(filtered_data, min_percentage, min_frequency)

    if G.number_of_nodes() == 0 or G.number_of_edges() == 0:
        print("No data matches the filter criteria. Please adjust the filters.")
        return

    try:
        # Create the graph visualization with pyvis
        net = create_pyvis_network(G, height="800px")

        # Random suffix so that repeated runs do not overwrite each other
        filename = f"network_graph_{np.random.randint(0, 1000000)}.html"

        # Save the graph to the file
        net.save_graph(filename)

        # Summarise the pairs that are drawn, not the whole unfiltered dataset
        caption = generate_caption(filtered_data)

        # Combine caption and graph
        with open(filename, 'r', encoding='utf-8') as f:
            html_content = f.read()
        full_html = f"{caption}\n{html_content}"

        # Encode the HTML content so it can be embedded as a data URI
        b64 = base64.b64encode(full_html.encode()).decode()

        # Display the graph using HTML
        display(HTML(f"""
        <iframe src="data:text/html;base64,{b64}" width="100%" height="800px"></iframe>
        """))

        # Rewrite the file with the caption included, then offer it for download
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(full_html)
        files.download(filename)

        print(f"Graph saved as {filename}. You can download it using the link above.")

    except Exception as e:
        print(f"An error occurred while creating the graph: {str(e)}")
    else:
        # Only report success when rendering did not fail.
        print(f"Graph updated: {len(G.nodes())} nodes, {len(G.edges())} edges.")

def analyze_condition_combinations(min_percentage, min_frequency):
    """Score every combination of 3 to 7 conditions found in the filtered pairs.

    For each combination the frequency is the smallest pair frequency among
    its member pairs (0 when a pair is absent), the prevalence is that
    frequency as a percentage of total_patients_in_group, and the
    "total odds ratio" is the frequency divided by an expected count built
    from the per-condition frequency totals.

    Args:
        min_percentage: Minimum 'Percentage' for a pair to be considered.
        min_frequency: Minimum 'PairFrequency' for a pair to be considered.

    Returns:
        A DataFrame of combinations with prevalence above zero, sorted by
        prevalence in descending order; it is also stored in the global
        results_df. The frame is empty (but has its columns) when fewer than
        three conditions remain. Returns None when no data is loaded.
    """
    global results_df

    if global_data is None or global_data.empty:
        print("No data available. Please load a valid CSV file.")
        return

    data = global_data[(global_data['Percentage'] >= min_percentage) &
                       (global_data['PairFrequency'] >= min_frequency)].copy()

    # Normalise condition names: drop any parenthesised suffix, then turn
    # underscores into spaces.
    for column in ('ConditionA', 'ConditionB'):
        cleaned = data[column].str.replace(r'\s*\([^)]*\)', '', regex=True)
        data[column] = cleaned.str.replace('_', ' ')

    unique_conditions = pd.unique(data[['ConditionA', 'ConditionB']].values.ravel('K'))

    pair_frequency_map = {}
    condition_frequency_map = {}

    for _, row in data.iterrows():
        # Register the pair under both orderings so lookups are direction-free.
        key1 = f"{row['ConditionA']}_{row['ConditionB']}"
        key2 = f"{row['ConditionB']}_{row['ConditionA']}"
        pair_frequency_map[key1] = row['PairFrequency']
        pair_frequency_map[key2] = row['PairFrequency']

        condition_frequency_map[row['ConditionA']] = condition_frequency_map.get(row['ConditionA'], 0) + row['PairFrequency']
        condition_frequency_map[row['ConditionB']] = condition_frequency_map.get(row['ConditionB'], 0) + row['PairFrequency']

    result_columns = [
        'Combination',
        'NumConditions',
        'Minimum Pair Frequency',
        'Prevalence of the combination (%)',
        'Total odds ratio',
    ]
    result_data = []

    for k in range(3, min(8, len(unique_conditions) + 1)):
        for comb in combinations(unique_conditions, k):
            pair_frequencies = [pair_frequency_map.get(f"{a}_{b}", 0) for a, b in combinations(comb, 2)]
            frequency = min(pair_frequencies)
            prevalence = (frequency / total_patients_in_group) * 100

            observed = frequency
            expected = total_patients_in_group
            for condition in comb:
                expected *= (condition_frequency_map[condition] / total_patients_in_group)
            odds_ratio = observed / expected if expected != 0 else float('inf')

            result_data.append({
                'Combination': ' + '.join(comb),
                'NumConditions': len(comb),
                'Minimum Pair Frequency': frequency,
                'Prevalence of the combination (%)': prevalence,
                'Total odds ratio': odds_ratio
            })

    # Naming the columns keeps the frame well-formed even when result_data is
    # empty; without them the sort below raises KeyError.
    results_df = pd.DataFrame(result_data, columns=result_columns)
    results_df = results_df.sort_values('Prevalence of the combination (%)', ascending=False)
    results_df = results_df[results_df['Prevalence of the combination (%)'] > 0]

    print(f'Analysis complete. {len(results_df)} combinations found.')
    return results_df

def save_results_to_csv(filename="condition_combinations.csv"):
    """Write the latest analysis results to a CSV file and offer it for download.

    Args:
        filename: Name of the CSV file to create.

    Prints a notice and does nothing else when results_df is unset or empty.
    """
    if results_df is None or results_df.empty:
        print("No results available to save. Please run the analysis first.")
        return

    results_df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")

    # In Colab the file has to be pushed to the browser explicitly.
    files.download(filename)

def display_legend():
    """Print a legend with one colored swatch and label per category.

    Categories are listed alphabetically and padded to a common width. Colors
    are emitted as 24-bit ANSI escape sequences.
    """
    print("\nSystem Legend:")
    legend_entries = sorted(set(condition_categories.values()))
    label_width = max(map(len, legend_entries))

    for entry in legend_entries:
        # Fall back to white when a category has no assigned color.
        hex_color = category_colors.get(entry, "#FFFFFF")
        r, g, b = (int(hex_color[start:start + 2], 16) for start in (1, 3, 5))

        # Background-colored box, then the name in the same foreground color.
        swatch = f"\033[48;2;{r};{g};{b}m  \033[0m"
        label = f"\033[38;2;{r};{g};{b}m{entry.ljust(label_width)}\033[0m"
        print(f"{swatch} {label}")

    #print("\nConditions by Category:")
    #for category in unique_categories:
     #   conditions = [condition for condition, cat in condition_categories.items() if cat == category]
      #  color = category_colors.get(category, "#FFFFFF")
       # r, g, b = int(color[1:3], 16), int(color[3:5], 16), int(color[5:7], 16)
        #print(f"\033[38;2;{r};{g};{b}m{category}:\033[0m")
        #for condition in sorted(conditions):
         #   print(f"  - {condition}")
    #print("\n")

# Main execution block: upload a file, then repeatedly ask for thresholds,
# draw the graph, list the condition combinations and offer a CSV download.
if __name__ == "__main__":
    try:
        # Load the file
        data, total_patients, gender, age_group = load_file()

        if data is not None:
            # Initialize the data
            initialize_data(data, total_patients, gender, age_group)

            # Display the legend once at the beginning
            display_legend()

            while True:
                # Valid input ranges are taken from the loaded data
                min_freq_range = (global_data['PairFrequency'].min(), global_data['PairFrequency'].max())
                min_percentage_range = (global_data['Percentage'].min(), global_data['Percentage'].max())

                # Get user input for minimum pair frequency and percentage with validation
                while True:
                    try:
                        min_frequency = int(input(f"Enter the minimum pair frequency [{min_freq_range[0]}-{min_freq_range[1]}]: "))
                        if min_freq_range[0] <= min_frequency <= min_freq_range[1]:
                            break
                        else:
                            print(f"Error: Value must be between {min_freq_range[0]} and {min_freq_range[1]}. Please try again.")
                    except ValueError:
                        print("Error: Please enter a valid integer.")

                while True:
                    try:
                        min_percentage = float(input(f"Enter the minimum percentage (%) [{min_percentage_range[0]:.2f}-{min_percentage_range[1]:.2f}]: "))
                        if min_percentage_range[0] <= min_percentage <= min_percentage_range[1]:
                            break
                        else:
                            print(f"Error: Value must be between {min_percentage_range[0]:.2f} and {min_percentage_range[1]:.2f}. Please try again.")
                    except ValueError:
                        print("Error: Please enter a valid number.")

                # Update and display the graph
                update_graph(min_percentage, min_frequency)

                # Analyze condition combinations
                results = analyze_condition_combinations(min_percentage, min_frequency)

                # The analysis returns None when it has no data to work on;
                # it has already printed the reason, so skip the listing.
                if results is not None:
                    # Display total number of condition combinations and all combinations
                    total_combinations = len(results)
                    print(f"\nTotal number of condition combinations: {total_combinations}")
                    print("All condition combinations:")

                    # Show all results without truncation. option_context
                    # restores the previous display settings afterwards, even
                    # if display() raises.
                    with pd.option_context('display.max_rows', None,
                                           'display.max_columns', None,
                                           'display.width', None,
                                           'display.max_colwidth', None):
                        display(results)

                # Ask if user wants to save results
                save_choice = input("Do you want to save the results to a CSV file? (yes/no): ").strip().lower()
                if save_choice == 'yes':
                    save_results_to_csv()

                # Ask if the user wants to run again with different parameters
                run_again = input("Do you want to run the analysis again with different parameters? (yes/no): ").strip().lower()
                if run_again != 'yes':
                    break

            print("Analysis completed. Thank you for using the script!")
        else:
            print("Failed to load data. Please run the script again and upload a valid CSV file.")

    except KeyboardInterrupt:
        print("\nScript execution interrupted by user. Exiting gracefully...")
    except Exception as e:
        # Top-level boundary: report the problem instead of showing a traceback.
        print(f"An unexpected error occurred: {str(e)}")
    finally:
        print("Script execution completed.")

print("You can run the cell again to start a new analysis with a different file.")

Please upload your CSV file.


Saving Males_fdr_significant_high_freq_odds_ratio_analysis_45to64.csv to Males_fdr_significant_high_freq_odds_ratio_analysis_45to64.csv
Uploaded file: Males_fdr_significant_high_freq_odds_ratio_analysis_45to64.csv
File loaded successfully. Total patients: 3969
Gender: Male, Age Group: 45-64

System Legend:
[48;2;214;39;40m  [0m [38;2;214;39;40mBlood          [0m
[48;2;236;156;86m  [0m [38;2;236;156;86mCirculatory    [0m
[48;2;255;127;14m  [0m [38;2;255;127;14mDigestive      [0m
[48;2;66;178;66m  [0m [38;2;66;178;66mEar            [0m
[48;2;44;160;44m  [0m [38;2;44;160;44mEndocrine      [0m
[48;2;227;119;194m  [0m [38;2;227;119;194mEye            [0m
[48;2;127;127;127m  [0m [38;2;127;127;127mGenitourinary  [0m
[48;2;148;103;189m  [0m [38;2;148;103;189mMental         [0m
[48;2;23;190;207m  [0m [38;2;23;190;207mMusculoskeletal[0m
[48;2;31;119;180m  [0m [38;2;31;119;180mNeoplasms      [0m
[48;2;140;86;75m  [0m [38;2;140;86;75mNervous        [0m


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Graph saved as network_graph_736388.html. You can download it using the link above.
Graph updated: 14 nodes, 21 edges.
Analysis complete. 19 combinations found.

Total number of condition combinations: 19
All condition combinations:


Unnamed: 0,Combination,NumConditions,Minimum Pair Frequency,Prevalence of the combination (%),Total odds ratio
97,Hypertension + Anaemia + Reflux Disorders,3,177,4.459562,2.403988
78,Hypertension + Chronic Kidney Disease + Anaemia,3,175,4.409171,2.524552
121,Hypertension + Mental Illness + Reflux Disorders,3,170,4.283195,2.359502
82,Hypertension + Chronic Kidney Disease + Cardiac Arrhythmias,3,144,3.628118,4.462193
136,Hypertension + Chronic Airway Diseases + Reflux Disorders,3,128,3.224994,4.054378
0,Diabetes + Hypertension + Chronic Kidney Disease,3,127,3.199798,3.148325
81,Hypertension + Chronic Kidney Disease + Mental Illness,3,126,3.174603,1.857501
146,Chronic Kidney Disease + Anaemia + Mental Illness,3,123,3.099017,5.153811
91,Hypertension + Anaemia + Mental Illness,3,123,3.099017,2.501931
727,Hypertension + Anaemia + Mental Illness + Reflux Disorders,4,123,3.099017,10.376346


Do you want to save the results to a CSV file? (yes/no): no
Do you want to run the analysis again with different parameters? (yes/no): no
Analysis completed. Thank you for using the script!
Script execution completed.
You can run the cell again to start a new analysis with a different file.
