In [None]:
import pandas as pd

# Read the source table from disk.
data = pd.read_csv("data/breast_cancer/METABRIC_RNA_Mutation.csv")

data

# Numeric columns: replace missing entries with that column's mean.
numeric_columns = data.select_dtypes(include='number').columns
for name in numeric_columns:
    column_mean = data[name].mean()
    data[name] = data[name].fillna(column_mean)

# Object (string) columns: replace missing entries with the most frequent
# value. A column with no non-missing values has an empty mode and is skipped.
object_columns = data.select_dtypes(include='object').columns
for name in object_columns:
    modes = data[name].mode()
    if modes.empty:
        continue
    data[name] = data[name].fillna(modes.iloc[0])

In [None]:
# Every column of `data` whose name contains the substring "mut".
mutations = {column for column in data.columns if "mut" in column}
# "mutation_count" matches the substring too but is excluded from the set;
# remove() raises KeyError if that column is not present.
mutations.remove("mutation_count")

In [None]:
# One indicator column per distinct value of cancer_type_detailed, minus the
# "Breast" indicator.
labels = pd.get_dummies(data["cancer_type_detailed"]).drop(columns=["Breast"])
labels
# Append the indicator columns to the table; both sides get a fresh 0..n-1
# index first so rows are paired by position.
data = pd.concat(
    [frame.reset_index(drop=True) for frame in (data, labels)],
    axis=1,
)
data


In [None]:
# Names of the columns to keep. Names that are absent from `data` simply never
# match anything when this list is used as a filter.
features = [
    'Breast Invasive Ductal Carcinoma',
    'Breast Invasive Lobular Carcinoma',
    'Breast Invasive Mixed Mucinous Carcinoma',
    'Breast Mixed Ductal and Lobular Carcinoma',
    'Metaplastic Breast Cancer',
    'age_at_diagnosis',
    'er_status_measured_by_ihc_subtype_Negative',
    'er_status_measured_by_ihc_subtype_Positve',
    'er_status_subtype_Negative',
    'er_status_subtype_Positive',
    'her2_status_measured_by_snp6_subtype_GAIN',
    'her2_status_measured_by_snp6_subtype_LOSS',
    'her2_status_measured_by_snp6_subtype_NEUTRAL',
    'her2_status_measured_by_snp6_subtype_UNDEF',
    'lymph_nodes_examined_positive',
    'mutation_count',
    'neoplasm_histologic_grade',
    'nottingham_prognostic_index',
    'pr_status_subtype_Negative',
    'pr_status_subtype_Positive',
    'pr_statussubtype_Negative',
    'pr_statussubtype_Positive',
    'primary_tumor_laterality_subtype_Left',
    'primary_tumor_laterality_subtype_Right',
    'tumor_size',
    'tumor_stage',
]

# Add every collected "*mut*" column name after the fixed names.
features.extend(mutations)

In [None]:
# Keep only the columns named in `features`; everything else is dropped in a
# single call.
unwanted = [column for column in data if column not in features]
data = data.drop(columns=unwanted)

data

In [None]:
# Short description for each gene of interest, keyed by upper-case gene symbol.
gene_functions = {
    "PIK3CA": "Encodes a catalytic subunit of PI3K, driving cell growth and survival signaling; frequently mutated in cancers.",
    "TP53": "Tumor suppressor gene that regulates DNA repair, apoptosis, and cell cycle; commonly inactivated in cancer.",
    "CDH1": "Encodes E-cadherin, a protein crucial for cell-cell adhesion; loss leads to increased cancer invasiveness.",
    "MAP2K4": "Part of the MAPK pathway, involved in stress and apoptosis signaling; mutations may disrupt cell death regulation.",
    "USP9X": "Deubiquitinase that stabilizes proteins by preventing degradation; mutations affect cell survival and signaling.",
    "CACNA2D3": "Modulates calcium channels affecting cell signaling; often silenced in cancer and may regulate apoptosis.",
    "MUC16": "Encodes a large mucin protein (also known as CA125); overexpressed in tumors and may help in immune evasion."
}


# Prune `features`: a name containing "mut" is kept only if it also contains
# the lower-cased symbol of one of the genes above; all other names are kept.
#
# The list is rebuilt rather than calling features.remove() inside a loop over
# `features`: removing during iteration shifts the remaining items left, so the
# element right after each removed one is never examined and survives.
# Slice assignment keeps the same list object.
# NOTE(review): matching is by substring, so a gene symbol also matches any
# longer column name that contains it; confirm this is acceptable.
features[:] = [
    name
    for name in features
    if "mut" not in name
    or any(gene.lower() in name for gene in gene_functions)
]
features

    


In [None]:
# Print True once for every (gene, feature) pair where the lower-cased gene
# symbol occurs inside the feature name.
for gene in gene_functions:
    symbol = gene.lower()
    for feature in features:
        if symbol in feature:
            print(True)

In [None]:
# One-hot encode each per-gene mutation column of `data`, keeping at most
# `maxAdditionalColumns` indicator columns per gene, and store the result in `df`.
df = data.copy()
maxUsage = dict()
maxAdditionalColumns = 2

mutation_columns = []   # source columns that get replaced by indicators
encoded_parts = []      # indicator frames, one per encoded source column
for i in df:
    if "mut" in i:
        maxUsage[i] = 0
    # Names containing "count" (e.g. mutation_count) stay as they are.
    if "mut" in i and "count" not in i:
        # The prefix already ends in "_" and get_dummies adds its own "_"
        # separator, so the generated names contain a double underscore.
        # Kept unchanged so existing column names stay stable.
        dummies = pd.get_dummies(data[i], prefix=i+"_")
        # The indicator rows must stay one-per-input-row. De-duplicating them
        # here would shrink the frame to a few rows, and the positional concat
        # would then attach them to the first rows of `df` and leave NaN
        # everywhere else.
        encoded_parts.append(dummies.iloc[:, :maxAdditionalColumns])
        mutation_columns.append(i)
        print(i)

# Single concat: remaining original columns first, then the indicator columns
# in the order their source columns were visited. All pieces derive from `data`
# and share its index, so rows line up.
df = pd.concat(
    [df.drop(columns=mutation_columns)] + encoded_parts,
    axis=1,
).reset_index(drop=True)

# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()
df
       

In [None]:
# Write the encoded table to disk. The row index is written as well, because
# to_csv includes the index by default.
df.to_csv(path_or_buf="data/patientMutations3.csv")

In [None]:
# Column labels of `df`, in order, as a plain Python list.
newF = list(df.columns)
newF

In [None]:
# First 25 column names. This is not the cell's last expression, so under
# default notebook settings its value is not rendered.
newF[0:25]

# Independent working copy of the encoded table.
dff = df.copy(deep=True)

In [None]:
import numpy as np

def is_nan_or_empty(df):
    """Report whether a pandas object contains no usable values.

    Args:
        df: A pandas object exposing ``.empty`` and ``.isna()``; the two
            chained ``.all()`` calls reduce a DataFrame first per column and
            then across columns.

    Returns:
        True if ``df`` has no elements or every element is missing,
        otherwise False.
    """
    # The emptiness check comes first, so isna() is never evaluated on an
    # empty object.
    if not df.empty and not df.isna().all().all():
        return False
    return True


In [None]:
# (disabled) dff.to_csv("patientMutationsData.csv")
# For every column of `dff`, print whether it is empty or entirely missing,
# followed by the column name. The loop variable stays named `i` because a
# later cell reads `i` after this loop has finished.
for i in dff.columns:
    all_missing = is_nan_or_empty(dff[i])
    print(all_missing, i)

In [None]:
# Reload the encoded table and fill in whatever is still missing.
newData = pd.read_csv("data/patientMutations3.csv")

# Numeric columns: fill missing entries with the column mean, all in one call.
newData = newData.fillna(newData.mean(numeric_only=True))

# Remaining (non-numeric) columns: fill missing entries with the most frequent
# value. Numeric columns are already handled above, so they are not visited
# again. A column with no non-missing values has an empty mode; it is skipped
# instead of raising IndexError, matching the guard used when the raw table
# is imputed.
for col in newData.select_dtypes(exclude='number').columns:
    mode = newData[col].mode()
    if not mode.empty:
        newData[col] = newData[col].fillna(mode.iloc[0])

newData
            

In [None]:
# Write the imputed table to the working directory (index included by default).
newData.to_csv(path_or_buf="redo.csv")


In [None]:
# Fresh read of the source table, independent of the frames built earlier.
originalDF = pd.read_csv("data/breast_cancer/METABRIC_RNA_Mutation.csv")
dummies = pd.get_dummies(originalDF['cancer_type_detailed'])

# Append the indicator columns only when both frames have the same row count.
if len(originalDF) != len(dummies):
    print("Mismatch in number of rows. Cannot concat.")
else:
    originalDF = pd.concat(
        [frame.reset_index(drop=True) for frame in (originalDF, dummies)],
        axis=1,
    )


In [None]:
# One indicator column per distinct value of type_of_breast_surgery.
# No prefix is passed, so the columns are named after the raw category values.
# The `labels` list in the next cell refers to 'BREAST CONSERVING' and
# 'MASTECTOMY', presumably these columns. A prefix built from a loop variable
# left over from another cell would make the names depend on which cells ran
# before this one.
dummies = pd.get_dummies(originalDF['type_of_breast_surgery'])
originalDF = originalDF.drop(columns="type_of_breast_surgery")
# Append the indicator columns only when both frames have the same row count.
if len(originalDF) == len(dummies):
    originalDF = pd.concat(
        [originalDF.reset_index(drop=True), dummies.reset_index(drop=True)],
        axis=1
    )
else:
    print("Mismatch in number of rows. Cannot concat.")
# The cancer-type source column is dropped here; its indicator columns were
# appended when `originalDF` was loaded.
originalDF = originalDF.drop(columns="cancer_type_detailed")
originalDF

In [None]:
# Columns treated as inputs. This rebinds the name `features`.
features = [
    'age_at_diagnosis',
    'cellularity',
    'er_status_measured_by_ihc',
    'her2_status_measured_by_snp6',
    'hormone_therapy',
    'tumor_size',
    'tumor_stage',
    'Breast Invasive Ductal Carcinoma',
    'Breast Invasive Lobular Carcinoma',
    'Breast Invasive Mixed Mucinous Carcinoma',
    'Breast Mixed Ductal and Lobular Carcinoma',
    'Metaplastic Breast Cancer',
    'radio_therapy',
    'death_from_cancer',
]

# Columns treated as targets. This rebinds the name `labels` to a list.
labels = [
    'BREAST CONSERVING',
    'overall_survival_months',
    'MASTECTOMY',
]

# Replace every object-typed input column with unprefixed indicator columns,
# named after the raw category values.
for i in features:
    if originalDF[i].dtype != "object":
        continue
    dummies = pd.get_dummies(originalDF[i])
    # Append only when both frames have the same row count.
    if len(originalDF) != len(dummies):
        print("Mismatch in number of rows. Cannot concat.")
    else:
        originalDF = pd.concat(
            [frame.reset_index(drop=True) for frame in (originalDF, dummies)],
            axis=1,
        )
    # The source column is removed whether or not the concat happened.
    originalDF = originalDF.drop(columns=i)

originalDF


In [None]:
# Write the encoded table next to the source CSV (index included by default).
originalDF.to_csv(path_or_buf="data/breast_cancer/originalDF.csv")