In [1]:
import csv

def unique_subcellular_locations(csv_path):
    """Collect the distinct subcellular location terms found in a CSV file.

    Each row's 'Subcellular location' field is split on ';' and on ','.
    Every fragment is stripped of surrounding whitespace and lower-cased;
    empty fragments are discarded.

    Args:
        csv_path: Path of a UTF-8 CSV file whose header contains a
            'Subcellular location' column.

    Returns:
        A list of the unique, lower-cased location strings, sorted so the
        result (and anything written from it) is reproducible between runs.

    Raises:
        KeyError: If a data row is read and the header has no
            'Subcellular location' column.
    """
    unique_locations = set()

    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader fills fields missing from a short row with None;
            # treat that as "no locations" instead of crashing on .split().
            raw_value = row['Subcellular location'] or ''
            # ';' and ',' are both separators, so normalise to one and split once.
            for fragment in raw_value.replace(';', ',').split(','):
                clean_location = fragment.strip().lower()
                if clean_location:
                    unique_locations.add(clean_location)

    return sorted(unique_locations)

def save_locations_to_csv(locations, output_csv_path):
    """Write locations to a single-column CSV file.

    The file is opened for writing as UTF-8, starts with the header row
    'Subcellular Location', and then holds one location per row in the
    order the iterable yields them.

    Args:
        locations: Iterable of values to write, one per row.
        output_csv_path: Path of the CSV file to write.
    """
    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as out_handle:
        csv_writer = csv.writer(out_handle)
        csv_writer.writerow(['Subcellular Location'])
        # Wrap each value in a list so it lands in its own one-cell row.
        csv_writer.writerows([entry] for entry in locations)

# Directory holding both the input table and the generated output; defined
# once so relocating the data only requires editing a single line.
SEARCH_FILES_DIR = "/Volumes/dax-hd/project-data/search-files"

csv_path = f"{SEARCH_FILES_DIR}/uniprot-data.csv"
output_csv_path = f"{SEARCH_FILES_DIR}/unique_subcellular_locations.csv"

# Extract the distinct location terms from the input and persist them.
unique_locations = unique_subcellular_locations(csv_path)
save_locations_to_csv(unique_locations, output_csv_path)

print("Unique Subcellular Locations saved to CSV.")

Unique Subcellular Locations saved to CSV.


In [5]:
import csv
from collections import Counter
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import matplotlib.cm as cm

# Input CSV passed to count_categories(csv_path) later in this cell.
csv_path = "/Volumes/dax-hd/project-data/search-files/uniprot-data.csv"

# Ordered (category, keywords) table used by categorize_location().
# Order is significant: categories are tried top to bottom and the first one
# with a keyword occurring as a substring of the location wins.
# Built once at module level so it is not re-created for every location.
_CATEGORY_KEYWORDS = (
    ('Nuclear', (
        # nuclear locations
        "nucleus", "nucleoplasm", "nucleus lamina", "nucleolus", "cajal body",
        "nuclear pore complex",
        # chromosome related
        "chromosome", "centromere", "telomere", "kinetochore",
    )),
    ('Cytoplasmic', (
        # miscellaneous
        "peroxisome", "glyoxysome", "plastid", "amyloplast", "hydrogenosome", "vacuole",
        "sarcoplasmic reticulum", "sarcoplasmic reticulum lumen", "plastoglobule", "synapse",
        "synaptic cleft", "postsynapse", "presynapse", "myelin membrane", "bud", "bud neck",
        "bud tip", "spore", "spore wall", "capsule", "pilus", "fimbrium", "vesicle",
        "coated pit", "clathrin-coated vesicle",
        # cytoplasmic
        "cytoplasm", "cytosol", "cytoplasmic granule", "cytoplasmic vesicle",
        "cytoplasmic vesicle membrane", "cytoplasmic ribonucleoprotein granule", "glycosome",
        "peroxisome", "lysosome", "zymogen granule", "chromaffin granule", "secretory vesicle",
        "multivesicular body", "autophagosome", "synaptic vesicle", "clathrin-coated vesicle",
        "copi-coated vesicle", "copii-coated vesicle", "endosome", "phagosome",
        "extracellular vesicle", "microvillus", "actin patch", "stress fiber", "stress granule",
        "p-body", "cytoskeleton", "microtubule organizing center", "spindle", "centrosome",
        "centriole", "cilium", "flagellum",
    )),
    ('Membranal', (
        # cellular membranes
        "cell membrane", "host cell membrane", "cell outer membrane", "cell inner membrane",
        # organelle membranes
        "mitochondrion membrane", "mitochondrion inner membrane", "mitochondrion outer membrane",
        "chloroplast membrane", "chloroplast inner membrane", "chloroplast outer membrane",
        "peroxisome membrane", "lysosome membrane", "vacuole membrane", "nucleus membrane",
        "nucleus inner membrane", "nucleus outer membrane", "endoplasmic reticulum membrane",
        "endoplasmic reticulum-golgi intermediate compartment membrane", "golgi apparatus membrane",
        "secretory vesicle membrane", "cytoplasmic vesicle membrane", "melanosome membrane",
        "cytolytic granule membrane", "acrosome membrane", "autophagosome membrane",
        "extracellular vesicle", "bacterial extracellular vesicle",
        # specialized membranes
        "synaptic cell membrane", "postsynaptic cell membrane", "presynaptic cell membrane",
        "tight junction", "gap junction", "adherens junction", "desmosome", "hemidesmosome",
    )),
    ('Mitochondrial', (
        "mitochondrion", "mitochondrion nucleoid", "mitochondrion matrix",
    )),
    ('Extracellular', (
        # extracellular spaces
        "extracellular space", "host extracellular space", "extracellular matrix",
        "extracellular exosome", "interphotoreceptor matrix",
        # secreted
        "secreted", "extracellular vesicle",
    )),
    # Golgi and ER each get their own keyword set. If both categories shared
    # one combined list, the earlier 'Golgi' check would claim every ER
    # location and 'Endoplasmic Reticulum' could never be returned.
    ('Golgi', (
        "golgi apparatus", "golgi stack", "golgi stack membrane",
    )),
    ('Endoplasmic Reticulum', (
        "endoplasmic reticulum", "smooth endoplasmic reticulum membrane",
        "rough endoplasmic reticulum", "rough endoplasmic reticulum lumen",
    )),
    ('Structural', (
        # cytoskeleton components
        "cytoskeleton", "microtubule organizing center", "spindle", "spindle pole",
        "centrosome", "centriole", "kinetochore", "cilium", "cilium basal body",
        "cilium axoneme", "flagellum", "flagellum axoneme", "bacterial flagellum",
        "archaeal flagellum", "actin patch", "microvillus", "microvillus membrane",
        # mobility and attachment
        "pseudopodium", "pseudopodium membrane", "lamellipodium", "lamellipodium membrane",
        "filopodium", "filopodium tip", "growth cone", "growth cone membrane", "focal adhesion",
        "invadopodium", "invadopodium membrane",
        # cell junctions
        "cell junction", "septate junction", "paranodal septate junction",
    )),
    ('Viral', (
        # host specific
        "host cell", "host nucleus", "host mitochondrion", "host chloroplast", "host cytoplasm",
        "host membrane", "host cytoskeleton", "host endoplasmic reticulum", "host golgi apparatus",
        "host endosome", "host lysosome", "host perinuclear region",
        # pathogen specific
        "parasitophorous vacuole", "parasitophorous vacuole membrane",
    )),
    ('Chloroplastic', (
        # photosynthesis related
        "cellular thylakoid membrane", "chlorosome", "chlorosome envelope",
        # chloroplasts related
        "chloroplast", "chloroplast stroma", "chloroplast nucleoid",
        "chloroplast thylakoid membrane", "chloroplast thylakoid lumen",
    )),
    ('Vesicular', (
        # endosomes
        "endosome", "early endosome", "late endosome", "recycling endosome",
        "host endosome", "multivesicular body", "prevacuolar compartment", "endosome lumen",
        # lysosomes and vesicles
        "lysosome", "secretory vesicle", "cytoplasmic vesicle",
        "multivesicular body", "autophagosome", "zymogen granule",
        "chromaffin granule", "synaptic vesicle",
        # phagosomes and autophagosomes
        "phagosome", "autophagosome", "autophagosome lumen", "autolysosome membrane",
    )),
)


def categorize_location(location):
    """Map a subcellular location string to a broad category label.

    The location is lower-cased and tested against each category's keywords
    in a fixed order; a keyword matches when it occurs anywhere inside the
    location (substring match). The first matching category is returned, so
    earlier categories shadow later ones (for example "lysosome" is claimed
    by 'Cytoplasmic' before 'Vesicular' is tried).

    Args:
        location: Location text to classify (any letter case).

    Returns:
        One of 'Nuclear', 'Cytoplasmic', 'Membranal', 'Mitochondrial',
        'Extracellular', 'Golgi', 'Endoplasmic Reticulum', 'Structural',
        'Viral', 'Chloroplastic', 'Vesicular', or 'Other' when no keyword
        matches.
    """
    location = location.lower()

    for category, keywords in _CATEGORY_KEYWORDS:
        if any(keyword in location for keyword in keywords):
            return category

    return 'Other'


def count_categories(csv_path):
    """Count how many location terms in a CSV file fall into each category.

    Each row's 'Subcellular location' field is split on ';' and on ','.
    Every non-empty fragment (stripped and lower-cased) is classified with
    categorize_location() and tallied; a term repeated across rows is
    counted every time it appears.

    Args:
        csv_path: Path of a UTF-8 CSV file whose header contains a
            'Subcellular location' column.

    Returns:
        A collections.Counter mapping category label to number of terms.

    Raises:
        KeyError: If a data row is read and the header has no
            'Subcellular location' column.
    """
    category_counts = Counter()

    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader fills fields missing from a short row with None;
            # treat that as "no locations" instead of crashing on .split().
            raw_value = row['Subcellular location'] or ''
            # ';' and ',' are both separators, so normalise to one and split once.
            for fragment in raw_value.replace(';', ',').split(','):
                clean_location = fragment.strip().lower()
                if clean_location:
                    category_counts[categorize_location(clean_location)] += 1

    return category_counts

# Tally the category of every location term in the input CSV.
category_counts = count_categories(csv_path)

# Slice colours, applied in the order the categories appear in category_counts.
colors = ['#ed5054', '#3f93b4', '#e0ac24', '#fff785', '#dbcfc2','#c14448', '#50b378', '#5faed4', '#b58c20', '#8c489f', '#49a9a2']

# Plain lists of the labels and values for the pie trace.
pie_labels = list(category_counts.keys())
pie_values = list(category_counts.values())

pie_trace = go.Pie(
    labels=pie_labels,
    values=pie_values,
    marker={'colors': colors, 'line': {'color': 'black', 'width': 1}},
    textinfo='percent+label',
    # Same pull for every slice so they are all slightly separated.
    pull=[0.1] * len(category_counts),
)
fig = go.Figure(data=[pie_trace])

fig.update_layout(
    title_text='Distribution of Subcellular Locations',
    title_font={'size': 20},
    font={'family': 'Andale Mono', 'size': 18, 'color': 'black'},
)

# Export the figure as an HTML file in the working directory.
fig.write_html("pichart.html")
