In [None]:
import numpy as np
import pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt

def extract_isoform_of(description: str) -> str:
    """
    Pull the parent protein out of an "Isoform of ..." description.

    The value is the text that follows the first "Isoform of" marker, cut at
    the next occurrence of the marker (if any) and at the first comma, with
    surrounding whitespace removed.

    :param description: The description part of the fasta entry.
    :type description: str
    :return: Native protein, or None when the marker is absent.
    :rtype: str
    """
    marker = "Isoform of"
    if marker not in description:
        return None
    # Index 1 is the chunk between the first and (optional) second marker.
    after_marker = description.split(marker)[1]
    before_comma = after_marker.strip().split(",")[0]
    return before_comma.strip()

def extract_functional_traits(description: str) -> str:
    """
    Return the text between the first comma and the "OS=" tag.

    :param description: The description part of the fasta entry.
    :type description: str
    :return: Functional traits, or None when either the comma or the
        "OS=" tag is missing.
    :rtype: str
    """
    first_comma = description.find(",")
    organism_tag = description.find("OS=")
    if first_comma == -1 or organism_tag == -1:
        return None
    # If the comma comes after "OS=" the slice is empty and "" is returned.
    return description[first_comma + 1:organism_tag].strip()


def extract_organism_name(description: str) -> str:
    """
    Extracts organism name from the description.

    The name is taken to be the first two space-separated words that follow
    the "OS=" tag (e.g. "Homo sapiens"). When only a single word follows the
    tag and the description ends there, that word is returned instead of
    being discarded.

    :param description: The description part of the fasta entry.
    :type description: str
    :return: Organism name, or None when there is no "OS=" tag or nothing
        follows it.
    :rtype: str
    """
    tag = "OS="
    tag_at = description.find(tag)
    if tag_at == -1:
        return None

    name_start = tag_at + len(tag)
    first_space = description.find(" ", name_start)
    if first_space == -1:
        # Single-word organism at the very end of the description: keep it
        # rather than reporting "no organism".
        single_word = description[name_start:].strip()
        return single_word or None

    # The name ends at the second space after the tag, or at end of string.
    second_space = description.find(" ", first_space + 1)
    if second_space == -1:
        second_space = len(description)
    return description[name_start:second_space].strip()


def extract_gene_name(description: str) -> str:
    """
    Return the value of the "GN=" tag in a description.

    The value runs from just after the first "GN=" up to the next space, or
    to the end of the string when no space follows.

    :param description: The description part of the fasta entry.
    :type description: str
    :return: Gene name, or None when the tag is absent.
    :rtype: str
    """
    tag = "GN="
    tag_at = description.find(tag)
    if tag_at == -1:
        return None
    value, _, _ = description[tag_at + len(tag):].partition(" ")
    return value.strip()

def fasta_to_df(file_path: str) -> pd.DataFrame:
    """
    Read a FASTA file into a pandas DataFrame, one row per record.

    The record id is split on "|" and its first three fields become the
    "db", "unique_identifier" and "entry_name" columns; the remaining
    columns are derived from the record description and sequence.

    :param file_path: Path to the FASTA file.
    :type file_path: str
    :return: DataFrame containing FASTA information.
    :rtype: pd.DataFrame
    """
    rows = []
    for entry in SeqIO.parse(file_path, "fasta"):
        id_fields = entry.id.split("|")
        text = entry.description
        row = {
            "db": id_fields[0],
            "unique_identifier": id_fields[1],
            "entry_name": id_fields[2],
            "isoform_of": extract_isoform_of(text),
            "functional_traits": extract_functional_traits(text),
            "organism_name": extract_organism_name(text),
            "gene_name": extract_gene_name(text),
            "protein_sequence": str(entry.seq),
        }
        rows.append(row)

    return pd.DataFrame(rows)


class IsoRetriever:
    """
    Collect the unique (isoform, gene_name) pairs listed in an interaction CSV.

    Two layouts are recognised: one with plain "UniprotA"/"UniprotB" columns
    and one with per-cell-line columns suffixed "-293T" and "-HCT116".
    The result is stored in ``unique_isoforms_df``.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self._read_csv()

    def _read_csv(self):
        """Load the CSV and dispatch on which column layout it uses."""
        data_df = pd.read_csv(self.file_path)

        has_plain_columns = all(
            name in data_df.columns for name in ("UniprotA", "UniprotB")
        )
        if has_plain_columns:
            self._read_csv_as_v2(data_df)
        else:
            self._read_csv_as_v1(data_df)

    @staticmethod
    def _stack_pairs(data_df, column_pairs):
        """Stack (id, symbol) column pairs into one de-duplicated frame."""
        pieces = []
        for id_column, symbol_column in column_pairs:
            complete_rows = data_df[[id_column, symbol_column]].dropna()
            pieces.append(
                complete_rows.rename(
                    columns={id_column: "isoform", symbol_column: "gene_name"}
                )
            )
        stacked = pd.concat(pieces, ignore_index=True)
        return stacked.drop_duplicates(subset=["isoform"]).dropna()

    def _read_csv_as_v1(self, data_df):
        """Handle the layout with separate 293T and HCT116 columns."""
        self.unique_isoforms_df = self._stack_pairs(
            data_df,
            (
                ("UniprotA-293T", "SymbolA-293T"),
                ("UniprotB-293T", "SymbolB-293T"),
                ("UniprotA-HCT116", "SymbolA-HCT116"),
                ("UniprotB-HCT116", "SymbolB-HCT116"),
            ),
        )

    def _read_csv_as_v2(self, data_df):
        """Handle the layout with plain UniprotA/UniprotB columns."""
        self.unique_isoforms_df = self._stack_pairs(
            data_df,
            (
                ("UniprotA", "SymbolA"),
                ("UniprotB", "SymbolB"),
            ),
        )

    def check_duplicates_iso_retriever(self):
        """Print and return whether any 'isoform' value occurs more than once."""
        duplicates_exist = self.unique_isoforms_df.duplicated(subset=["isoform"]).any()
        message = (
            "DETECTED duplicates in 'isoform'."
            if duplicates_exist
            else "There're NO duplicated values in 'isoform'."
        )
        print(message)
        return duplicates_exist

    def get_unique_isoforms_df(self):
        """Return the frame of unique isoform / gene_name pairs."""
        return self.unique_isoforms_df


class iso_lib_matcher:
    """
    Select library entries whose "unique_identifier" appears among the
    dataset's "isoform" values.

    The result (``filtered_df``) has the columns "canonical_protein" (the
    library's "isoform_of"), "isoform" (the library's "unique_identifier")
    and "gene_name".
    """

    def __init__(self, lib_df, dataset_df):
        self.lib_df = lib_df
        self.dataset_df = dataset_df
        self.filter()

    def filter(self):
        """Build ``filtered_df`` from the library rows present in the dataset."""
        library_ids = self.lib_df["unique_identifier"]
        dataset_ids = self.dataset_df["isoform"].dropna()
        present = library_ids.isin(dataset_ids)
        selected = self.lib_df[present][["isoform_of", "unique_identifier", "gene_name"]]
        self.filtered_df = selected.rename(
            columns={"isoform_of": "canonical_protein", "unique_identifier": "isoform"}
        )

    def check_duplicates_iso_lib_match(self):
        """Print and return whether any 'isoform' value occurs more than once."""
        duplicates_exist = self.filtered_df.duplicated(subset=["isoform"]).any()
        print(
            "DETECTED duplicates in 'isoform'."
            if duplicates_exist
            else "There're NO duplicates in 'isoform'."
        )
        return duplicates_exist

    def get_filtered_df(self):
        """Return the matched rows."""
        return self.filtered_df

class can_lib_matcher:
    """
    Find dataset entries whose "isoform" value equals a library "isoform_of"
    value.

    Both inputs carry a "gene_name" column, so the merge suffixes them; the
    dataset's copy ("gene_name_x") is the one kept. ``filtered_df`` has the
    columns "canonical_protein", "isoform" and "gene_name", one row per
    dataset isoform.
    """

    def __init__(self, dataset_df, lib_df):
        self.dataset_df = dataset_df
        self.lib_df = lib_df
        self.filter()

    def filter(self):
        """Inner-join dataset and library and keep the first hit per isoform."""
        joined = self.dataset_df.merge(
            self.lib_df, how="inner", left_on="isoform", right_on="isoform_of"
        )
        first_hits = joined[["isoform_of", "isoform", "gene_name_x"]].drop_duplicates(
            subset=["isoform"]
        )
        self.filtered_df = first_hits.rename(
            columns={"isoform_of": "canonical_protein", "gene_name_x": "gene_name"}
        )

    def check_duplicates_can_lib_match(self):
        """Print and return whether any 'isoform' value occurs more than once."""
        duplicates_exist = self.filtered_df.duplicated(subset=["isoform"]).any()
        print(
            "DETECTED duplicates in 'isoform'."
            if duplicates_exist
            else "There're NO duplicates in 'isoform'."
        )
        return duplicates_exist

    def get_filtered_df(self):
        """Return the matched rows."""
        return self.filtered_df

class gene_lib_matcher:
    """
    Find dataset entries whose "gene_name" equals a library "gene_name".

    ``filtered_df`` has the columns "canonical_protein" (the library's
    "isoform_of" of the first matching library row), "isoform" and
    "gene_name", one row per dataset isoform.
    """

    def __init__(self, dataset_df, lib_df):
        self.dataset_df = dataset_df
        self.lib_df = lib_df
        self.filter()

    def filter(self):
        """Inner-join on gene name and keep the first hit per isoform."""
        joined = self.dataset_df.merge(
            self.lib_df, how="inner", left_on="gene_name", right_on="gene_name"
        )
        first_hits = joined[["isoform_of", "isoform", "gene_name"]].drop_duplicates(
            subset=["isoform"]
        )
        self.filtered_df = first_hits.rename(columns={"isoform_of": "canonical_protein"})

    def check_duplicates_gene_lib_match(self):
        """Print and return whether any 'isoform' value occurs more than once."""
        duplicates_exist = self.filtered_df.duplicated(subset=["isoform"]).any()
        print(
            "DETECTED duplicates in 'isoform'."
            if duplicates_exist
            else "There're NO duplicates in 'isoform'."
        )
        return duplicates_exist

    def get_filtered_df(self):
        """Return the matched rows."""
        return self.filtered_df
    
class addition:
    """
    Stack up to three frames and keep the first row for each value of one
    column.

    The notebook passes None for an unused slot.
    NOTE(review): this relies on pd.concat skipping None entries - confirm
    against the installed pandas version.
    """

    def __init__(self, dataset_1, dataset_2, dataset_3, column_drop_duplicates):
        self.dataset_1 = dataset_1
        self.dataset_2 = dataset_2
        self.dataset_3 = dataset_3
        self.column_drop_duplicates = column_drop_duplicates
        self.merge()

    def merge(self):
        """Concatenate the inputs and de-duplicate on the configured column."""
        frames = [self.dataset_1, self.dataset_2, self.dataset_3]
        stacked = pd.concat(frames, ignore_index=True)
        self.merged_df = stacked.drop_duplicates(subset=[self.column_drop_duplicates])

    def check_duplicates_lib_matches(self):
        """Print and return whether any 'isoform' value occurs more than once."""
        duplicates_exist = self.merged_df.duplicated(subset=["isoform"]).any()
        print(
            "DETECTED duplicates in 'isoform'."
            if duplicates_exist
            else "There're NO duplicates in 'isoform'."
        )
        return duplicates_exist

    def get_merged_df(self):
        """Return the stacked, de-duplicated frame."""
        return self.merged_df
    

class subtract:
    """
    Remove from one frame every row whose key also occurs in another frame.

    ``add_column`` names the key column in ``add_df`` and ``subtract_column``
    the key column in ``subtract_df``.
    """

    def __init__(self, add_df, subtract_df, add_column, subtract_column):
        self.add_df = add_df
        self.subtract_df = subtract_df
        self.add_column = add_column
        self.subtract_column = subtract_column

    def subtraction(self):
        """Return the rows of ``add_df`` whose key is absent from ``subtract_df``."""
        keys_to_remove = self.subtract_df[self.subtract_column]
        is_removed = self.add_df[self.add_column].isin(keys_to_remove)
        return self.add_df[~is_removed]
    
class SortUndefined:
    """
    Give entries without a library match a "canonical_protein" column that
    simply repeats their "gene_name".
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def sort(self):
        """
        Return a new frame with the columns "canonical_protein", "isoform"
        and "gene_name"; the input frame is left unchanged.
        """
        result = self.dataset[["isoform", "gene_name"]].copy()
        # The canonical protein is unknown here, so the gene name stands in.
        result.insert(0, "canonical_protein", result["gene_name"])
        return result
    
class IsoformCount:
    """
    Reshape a long (one row per isoform) table into one row per
    "canonical_protein", with its isoforms spread over "isoform_1",
    "isoform_2", ... columns and an "isoform_count" column.

    The frame passed in is never modified: all work happens on a private
    copy. After construction ``dataset`` holds that copy (de-duplicated on
    "canonical_protein", with the "isoform_N" columns added) and
    ``dataset_count`` holds the summary table.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.iso_count()

    def iso_count(self):
        """Build ``dataset_count`` from the "canonical_protein"/"isoform" columns."""
        # Copy first: the steps below add columns and drop rows, which must
        # not leak into the frame owned by the caller.
        working = self.dataset.copy()

        # canonical protein -> isoforms, in order of appearance
        isoforms_by_protein = {}
        for canonical, isoform in zip(working['canonical_protein'], working['isoform']):
            isoforms_by_protein.setdefault(canonical, []).append(isoform)

        for canonical, isoforms in isoforms_by_protein.items():
            # The row mask depends only on the protein, so compute it once.
            # NOTE(review): rows with a missing canonical_protein never
            # satisfy this equality, so they receive no isoform values.
            rows = working['canonical_protein'] == canonical
            for position, isoform in enumerate(isoforms, start=1):
                working.loc[rows, f'isoform_{position}'] = isoform

        working = working.drop_duplicates(subset=['canonical_protein'])
        # "isoform" itself has no trailing underscore and is excluded here.
        isoform_columns = [col for col in working.columns if col.startswith('isoform_')]
        dataset_count = working[['canonical_protein'] + isoform_columns].reset_index(drop=True)
        dataset_count['isoform_count'] = dataset_count[isoform_columns].notna().sum(axis=1)

        self.dataset = working
        self.dataset_count = dataset_count

    def print_number_of_isoforms(self):
        """Print the sum of "isoform_count" over all proteins."""
        total_isoform_count = self.dataset_count['isoform_count'].sum()
        print("Total Isoform Count:", total_isoform_count)

    def get_dataset_count(self):
        """Return the one-row-per-protein summary table."""
        return self.dataset_count
    
class IsoformDistribution:
    """
    Summarise and plot how many proteins have 1, 2, 3, ... isoforms.

    ``dataframe`` must contain an "isoform_count" column.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def iso_dis(self):
        """
        Tabulate the distribution of "isoform_count".

        :return: A frame with the columns "isoform_count", "# of entries"
            and "frequency (%)", sorted by "isoform_count", together with
            the total number of entries.
        :rtype: tuple
        """
        count_df = self.dataframe['isoform_count'].value_counts().reset_index()
        # Rename positionally: the labels produced by reset_index() are not
        # relied upon here.
        count_df.columns = ['isoform_count', '# of entries']
        count_df = count_df.sort_values(by='isoform_count')
        total_entries = count_df['# of entries'].sum()
        count_df['frequency (%)'] = count_df['# of entries'] / total_entries * 100

        return count_df, total_entries

    def plot_distribution(self):
        """
        Draw a bar chart of the distribution, plus an inset that zooms in on
        the bins below 1 % when there are any.

        :raises ValueError: If there is nothing to plot.
        """
        count_df, _ = self.iso_dis()
        if count_df.empty:
            # Without this guard the tick computation below fails with an
            # opaque "cannot convert float NaN to integer".
            raise ValueError("Cannot plot an isoform distribution without any entries.")

        fig, ax1 = plt.subplots(figsize=(10, 6))

        ax1.bar(count_df['isoform_count'], count_df['frequency (%)'], color='skyblue')
        ax1.set_title('Isoform Distribution')
        ax1.set_xlabel('Number of Isoforms')
        ax1.set_ylabel('Frequency (%)')
        ax1.grid(True)
        ax1.set_xticks(range(int(count_df['isoform_count'].max()) + 1))
        ax1.tick_params(axis='both', which='major')

        zoomed_data = count_df[count_df['frequency (%)'] < 1]
        # With no rare bins the maximum is NaN and is not a valid axis limit,
        # so the inset is only drawn when it has something to show.
        if not zoomed_data.empty:
            ax2 = ax1.inset_axes([0.6, 0.6, 0.3, 0.3])
            ax2.bar(zoomed_data['isoform_count'], zoomed_data['frequency (%)'], color='skyblue')
            ax2.set_xlabel('Number of Isoforms')
            ax2.set_ylabel('Frequency (%)')
            ax2.set_xticks(zoomed_data['isoform_count'])
            ax2.tick_params(axis='both', which='major')
            ax2.set_ylim(0, 1.2 * zoomed_data['frequency (%)'].max())

        plt.tight_layout()
        plt.show()


class protein_matcher:
    """
    Keep the rows of ``dataset`` whose ``dataset_column`` value also occurs
    in ``protein_list[protein_list_column]``.
    """

    def __init__(self, dataset, protein_list, dataset_column, protein_list_column):
        self.dataset = dataset
        self.protein_list = protein_list
        self.dataset_column = dataset_column
        self.protein_list_column = protein_list_column
        self.filter()

    def filter(self):
        """Store the matching rows, with all columns, in ``filtered_df``."""
        keys = self.dataset[self.dataset_column]
        allowed = self.protein_list[self.protein_list_column].dropna()
        self.filtered_df = self.dataset[keys.isin(allowed)]

    def check_duplicates_protein_match(self):
        """Print and return whether any 'canonical_protein' value repeats."""
        duplicates_exist = self.filtered_df.duplicated(subset=["canonical_protein"]).any()
        print(
            "DETECTED duplicates in 'canonical_protein'."
            if duplicates_exist
            else "There're NO duplicates in 'canonical_protein'."
        )
        return duplicates_exist

    def get_filtered_df(self):
        """Return the matching rows."""
        return self.filtered_df

class IsoformExtractor:
    """
    Flatten the "isoform_*" columns of a per-protein table back into a
    single "isoform" column of unique values.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.extract_isoforms()

    def extract_isoforms(self):
        """Collect the distinct non-missing values of every "isoform_*" column."""
        table = self.dataframe
        collected = [
            value
            for column in table.columns
            if column.startswith("isoform_")
            for value in table[column].dropna().unique()
        ]
        candidates = pd.DataFrame({"isoform": collected})
        # "isoform_count" also matches the prefix; its purely numeric values
        # are discarded here.
        looks_numeric = candidates['isoform'].astype(str).str.isdigit()
        candidates = candidates[~looks_numeric]
        self.unique_isoforms_df = (
            candidates.drop_duplicates(subset=["isoform"]).reset_index(drop=True)
        )

    def get_unique_isoforms_df(self):
        """Return the frame of unique isoform identifiers."""
        return self.unique_isoforms_df
    
class UniqueInteractionsHandler:
    """
    Filter an interaction table down to pairs whose two partners both belong
    to a given set of isoforms.

    Rows lacking either "UniprotA" or "UniprotB" are dropped on construction.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe.dropna(subset=['UniprotA', 'UniprotB'])

    def process_ppi(self, protein_match_df):
        """
        Keep only the interactions where both "UniprotA" and "UniprotB"
        occur in ``protein_match_df['isoform']``.

        The filtered table replaces ``self.dataframe`` and keeps its columns
        even when nothing matches.

        :param protein_match_df: Frame with an "isoform" column.
        :type protein_match_df: pd.DataFrame
        :return: Frame with one "isoform" column listing every identifier
            that takes part in a retained interaction.
        :rtype: pd.DataFrame
        """
        allowed = protein_match_df['isoform'].dropna()
        both_allowed = (
            self.dataframe['UniprotA'].isin(allowed)
            & self.dataframe['UniprotB'].isin(allowed)
        )
        # Boolean indexing preserves columns and dtypes, so an empty result
        # is still a well-formed interaction table.
        self.dataframe = self.dataframe[both_allowed]
        return self.extract_unique_uniprot()

    def extract_unique_uniprot(self):
        """Return the distinct identifiers found in "UniprotA" and "UniprotB"."""
        partners = pd.concat([self.dataframe['UniprotA'], self.dataframe['UniprotB']])
        return pd.DataFrame({'isoform': partners.dropna().unique()})

    def get_filtered_df(self):
        """Return the current (possibly filtered) interaction table."""
        return self.dataframe

class fill_list:
    """
    Keep the rows of ``dataset`` whose ``dataset_column`` value occurs in
    ``protein_list[protein_list_column]``, reduced to the columns
    "canonical_protein", "isoform" and "gene_name".
    """

    def __init__(self, dataset, protein_list, dataset_column, protein_list_column):
        self.dataset = dataset
        self.protein_list = protein_list
        self.dataset_column = dataset_column
        self.protein_list_column = protein_list_column
        self.filter()

    def filter(self):
        """Store the matching rows (three columns only) in ``filtered_df``."""
        keys = self.dataset[self.dataset_column]
        allowed = self.protein_list[self.protein_list_column].dropna()
        matching_rows = self.dataset[keys.isin(allowed)]
        self.filtered_df = matching_rows[["canonical_protein", "isoform", "gene_name"]]

    def check_duplicates_protein_match(self):
        """Print and return whether any 'canonical_protein' value repeats."""
        duplicates_exist = self.filtered_df.duplicated(subset=["canonical_protein"]).any()
        print(
            "DETECTED duplicates in 'canonical_protein'."
            if duplicates_exist
            else "There're NO duplicates in 'canonical_protein'."
        )
        return duplicates_exist

    def get_filtered_df(self):
        """Return the matching rows."""
        return self.filtered_df



### LOAD FILES

In [None]:
# Parse the isoform library FASTA into a DataFrame right away; the three
# interaction CSVs are only named here and are read in the cells below.
iso_lib = fasta_to_df("UP000005640_9606_additional.fasta")
data_hct116 = "Unique_interactions_HCT116.csv"
data_293t = "Unique_interactions_293T.csv"
data_shared = "Shared_interactions.csv"

### MAP TO ISOFORM LIBRARY

In [None]:
# Each of the three datasets goes through the same sequence of steps:
#   1. IsoRetriever     - unique (isoform, gene_name) pairs found in the CSV
#   2. iso_lib_matcher  - library entries whose "unique_identifier" is in the dataset
#   3. can_lib_matcher  - dataset entries equal to a library "isoform_of" value
#   4. gene_lib_matcher - dataset entries sharing a "gene_name" with the library
#   5. addition         - stack the three match tables, one row per "isoform"
#   6. subtract         - dataset entries that received no library match
#   7. SortUndefined    - unmatched entries get their gene_name as canonical_protein
#   8. addition         - matched + unmatched entries (third slot unused, hence None)
#   9. IsoformCount     - one row per canonical_protein with its isoforms and a count

#Unique interactions HCT-116
iso_uppi_hct116 = IsoRetriever(data_hct116).get_unique_isoforms_df() 
iso_lib_match_hct116 = iso_lib_matcher(iso_lib, iso_uppi_hct116).get_filtered_df()
can_lib_match_hct116 = can_lib_matcher(iso_uppi_hct116, iso_lib).get_filtered_df()
gene_lib_match_hct116 = gene_lib_matcher(iso_uppi_hct116, iso_lib).get_filtered_df()
lib_match_hct116 = addition(iso_lib_match_hct116, can_lib_match_hct116, gene_lib_match_hct116, "isoform").get_merged_df()
undefined_hct116 = subtract(iso_uppi_hct116, lib_match_hct116, "isoform", "isoform").subtraction()
id_hct116 = SortUndefined(undefined_hct116).sort()
index_hct116 = addition(lib_match_hct116, id_hct116, None, "isoform").get_merged_df()
# A copy is handed to IsoformCount so that index_hct116 keeps its
# one-row-per-isoform form for the fill_list step in a later cell.
original_index_hct116 = index_hct116.copy()
proteins_hct116 = IsoformCount(original_index_hct116).get_dataset_count()

#Unique interactions 293T
iso_uppi_293t = IsoRetriever(data_293t).get_unique_isoforms_df()
iso_lib_match_293t = iso_lib_matcher(iso_lib, iso_uppi_293t).get_filtered_df()
can_lib_match_293t = can_lib_matcher(iso_uppi_293t, iso_lib).get_filtered_df()
gene_lib_match_293t = gene_lib_matcher(iso_uppi_293t, iso_lib).get_filtered_df()
lib_match_293t = addition(iso_lib_match_293t, can_lib_match_293t, gene_lib_match_293t, "isoform").get_merged_df()
undefined_293t = subtract(iso_uppi_293t, lib_match_293t, "isoform", "isoform").subtraction()
id_293t = SortUndefined(undefined_293t).sort()
index_293t = addition(lib_match_293t, id_293t, None, "isoform").get_merged_df()
# Same reason as above: index_293t is reused later by fill_list.
original_index_293t = index_293t.copy()
proteins_293t = IsoformCount(original_index_293t).get_dataset_count()

#Shared interactions
iso_sppi = IsoRetriever(data_shared).get_unique_isoforms_df() 
iso_lib_match_shared = iso_lib_matcher(iso_lib, iso_sppi).get_filtered_df()
can_lib_match_shared = can_lib_matcher(iso_sppi, iso_lib).get_filtered_df()
gene_lib_match_shared = gene_lib_matcher(iso_sppi, iso_lib).get_filtered_df()
lib_match_shared = addition(iso_lib_match_shared, can_lib_match_shared, gene_lib_match_shared, "isoform").get_merged_df()
undefined_shared = subtract(iso_sppi, lib_match_shared, "isoform", "isoform").subtraction()
id_shared = SortUndefined(undefined_shared).sort()
index_shared = addition(lib_match_shared, id_shared, None, "isoform").get_merged_df()
# index_shared is passed directly (no copy); no later cell in this notebook reads it.
proteins_shared = IsoformCount(index_shared).get_dataset_count()

### ISOFORM DISTRIBUTION

In [None]:
# One bar chart per dataset; the printed line serves as a caption because
# every figure carries the same fixed title.

#Unique interactions HCT-116
print("HCT-116: Unique Interactions")
IsoformDistribution(proteins_hct116).plot_distribution()

#Unique interactions 293T
print("293T: Unique Interactions")
IsoformDistribution(proteins_293t).plot_distribution()

#Shared interactions
print("Shared Interactions")
IsoformDistribution(proteins_shared).plot_distribution()

### UNIQUE INTERACTIONS COMBINED

In [None]:
# For each cell line:
#   1. protein_matcher           - proteins whose canonical_protein also occurs in the other cell line
#   2. IsoformExtractor          - the isoform IDs listed for those proteins
#   3. UniqueInteractionsHandler - interactions whose two partners are both among those IDs
#   4. fill_list                 - the index rows (canonical_protein, isoform, gene_name) for the IDs involved

#Unique interactions HCT-116
protein_match_hct116 = protein_matcher(proteins_hct116, proteins_293t, "canonical_protein", "canonical_protein").get_filtered_df()
iso_ext_hct116 = IsoformExtractor(protein_match_hct116).get_unique_isoforms_df()
handler_hct116 = UniqueInteractionsHandler(pd.read_csv(data_hct116))
iso_hand_hct116 = handler_hct116.process_ppi(iso_ext_hct116)
fill_red_ppi_hct116 = fill_list(index_hct116, iso_hand_hct116, "isoform", "isoform").get_filtered_df()


#Unique interactions 293T
protein_match_293t = protein_matcher(proteins_293t, proteins_hct116, "canonical_protein", "canonical_protein").get_filtered_df()
iso_ext_293t = IsoformExtractor(protein_match_293t).get_unique_isoforms_df()
handler_293t = UniqueInteractionsHandler(pd.read_csv(data_293t))
iso_hand_293t = handler_293t.process_ppi(iso_ext_293t)
fill_red_ppi_293t = fill_list(index_293t, iso_hand_293t, "isoform", "isoform").get_filtered_df()

#Combined
# Stack both cell lines (one row per isoform), then count isoforms per protein.
index_uppi = addition(fill_red_ppi_hct116, fill_red_ppi_293t, None, "isoform").get_merged_df()
proteins_uppi = IsoformCount(index_uppi).get_dataset_count()

### ISOFORM DISTRIBUTION UNIQUE INTERACTIONS

In [None]:
# Distribution for the combined unique-interaction table built in the previous cell.
print("Combined Unique Interactions")
IsoformDistribution(proteins_uppi).plot_distribution()