In [1]:
import pandas as pd
import numpy as np

In [9]:
# Load the dataset and turn the default row index into an explicit Patient_ID
# column, which the remaining steps use as the key.
df = (
    pd.read_csv('breast_cancer_survival.csv')
    .reset_index()
    .rename(columns={'index': 'Patient_ID'})
)
# Fixed seed so the five-row preview is the same on every Restart & Run All.
df.sample(5, random_state=0)

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
209,209,62,FEMALE,0.27503,0.8812,-0.51371,-0.023585,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,24-Jun-19,15-Jan-20,Alive
150,150,45,FEMALE,-0.27807,1.4291,-0.57895,0.22264,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,06-Mar-19,10-Sep-20,Dead
29,29,62,FEMALE,0.4569,0.73944,-0.63177,-0.060096,II,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,20-Feb-19,18-Apr-19,Alive
202,202,71,FEMALE,0.39409,1.7054,0.035642,1.441,II,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,26-Nov-18,,
41,41,75,FEMALE,0.17164,0.029656,-0.1589,0.67471,I,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Simple Mastectomy,24-Feb-17,05-Apr-17,Alive


In [23]:
#1. Melt
# Reshape the three receptor-status columns from wide to long: one row per
# (patient, marker) pair, with column names that describe the result.
df_melt = df.melt(
    id_vars=['Patient_ID'],
    value_vars=['ER status', 'PR status', 'HER2 status'],
    var_name='Marker',
    value_name='Marker Status',
)
# Not very useful on its own, but the long layout may make a graph easier to build.
df_melt

Unnamed: 0,Patient_ID,Marker,Marker Status
0,0,ER status,Positive
1,1,ER status,Positive
2,2,ER status,Positive
3,3,ER status,Positive
4,4,ER status,Positive
...,...,...,...
997,329,HER2 status,Positive
998,330,HER2 status,Positive
999,331,HER2 status,Negative
1000,332,HER2 status,Negative


In [29]:
#2. Pivot
# pivot_table is used instead of pivot: this dataframe has no longitudinal
# data, so there is no strong reason to use a plain pivot here.
df_pivot = df.pivot_table(
    index='Histology',
    columns='Tumour_Stage',
    values='Patient_ID',
    aggfunc='count',
    fill_value=0,
)
# Count of patients for each histology (rows) and tumour stage (columns).
df_pivot

Tumour_Stage,I,II,III
Histology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Infiltrating Ductal Carcinoma,50,124,59
Infiltrating Lobular Carcinoma,11,56,22
Mucinous Carcinoma,3,9,0


In [32]:
#3. Aggregation
# Restrict to the four protein-expression columns, then compute the minimum,
# maximum and mean of each one across the whole dataset.
df_agg = df[['Protein1', 'Protein2', 'Protein3', 'Protein4']].agg(['min', 'max', 'mean'])
df_agg


Unnamed: 0,Protein1,Protein2,Protein3,Protein4
min,-2.3409,-0.97873,-1.6274,-2.0255
max,1.5936,3.4022,2.1934,1.6299
mean,-0.029991,0.946896,-0.090204,0.009819


In [40]:
#4. Iterate
# A plain `for` loop over a DataFrame goes through DataFrame.__iter__, which
# yields the column labels, so the dunder method need not be called by hand.
for column_name in df:
    print(column_name)  # one column name per line

df

Patient_ID
Age
Gender
Protein1
Protein2
Protein3
Protein4
Tumour_Stage
Histology
ER status
PR status
HER2 status
Surgery_type
Date_of_Surgery
Date_of_Last_Visit
Patient_Status


Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,0,42,FEMALE,0.952560,2.15000,0.007972,-0.048340,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,20-May-18,26-Aug-18,Alive
1,1,54,FEMALE,0.000000,1.38020,-0.498030,-0.507320,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,26-Apr-18,25-Jan-19,Dead
2,2,63,FEMALE,-0.523030,1.76400,-0.370190,0.010815,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,24-Aug-18,08-Apr-20,Alive
3,3,78,FEMALE,-0.876180,0.12943,-0.370380,0.132190,I,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,16-Nov-18,28-Jul-20,Alive
4,4,42,FEMALE,0.226110,1.74910,-0.543970,-0.390210,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,12-Dec-18,05-Jan-19,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,329,59,FEMALE,0.024598,1.40050,0.024751,0.280320,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,15-Jan-19,27-Mar-20,Alive
330,330,41,FEMALE,0.100120,-0.46547,0.472370,-0.523870,I,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Modified Radical Mastectomy,25-Jul-18,23-Apr-19,Alive
331,331,54,FEMALE,0.753820,1.64250,-0.332850,0.857860,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Simple Mastectomy,26-Mar-19,11-Oct-19,Dead
332,332,74,FEMALE,0.972510,1.42680,-0.366570,-0.107820,II,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Nov-18,05-Dec-18,Alive


In [42]:
#4. Iterrows
# Second take on step 4, in case row-wise iteration was meant rather than df.__iter__().
# iterrows() yields (index label, row Series) pairs, one pair per row of the dataframe.
for row_label, row in df.iterrows():
    # One line per case: the surgery the patient had and the date it took place.
    surgery_line = f"Patient {row['Patient_ID']} had a {row['Surgery_type']} on {row['Date_of_Surgery']}"
    print(surgery_line)
df

Patient 0 had a Other on 20-May-18
Patient 1 had a Other on 26-Apr-18
Patient 2 had a Lumpectomy on 24-Aug-18
Patient 3 had a Other on 16-Nov-18
Patient 4 had a Lumpectomy on 12-Dec-18
Patient 5 had a Modified Radical Mastectomy on 25-Jun-18
Patient 6 had a Lumpectomy on 27-Oct-18
Patient 7 had a Modified Radical Mastectomy on 19-Jul-18
Patient 8 had a Other on 15-Jun-18
Patient 9 had a Other on 20-Jun-19
Patient 10 had a Modified Radical Mastectomy on 26-Nov-18
Patient 11 had a Other on 14-Feb-17
Patient 12 had a Other on 27-Mar-18
Patient 13 had a Modified Radical Mastectomy on 08-Aug-18
Patient 14 had a Modified Radical Mastectomy on 09-Jun-18
Patient 15 had a Lumpectomy on 16-May-18
Patient 16 had a Other on 10-May-18
Patient 17 had a Modified Radical Mastectomy on 15-May-19
Patient 18 had a Other on 10-Jun-19
Patient 19 had a Other on 15-Feb-18
Patient 20 had a Simple Mastectomy on 15-Jul-18
Patient 21 had a Modified Radical Mastectomy on 14-Feb-19
Patient 22 had a Simple Mastecto

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,0,42,FEMALE,0.952560,2.15000,0.007972,-0.048340,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,20-May-18,26-Aug-18,Alive
1,1,54,FEMALE,0.000000,1.38020,-0.498030,-0.507320,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,26-Apr-18,25-Jan-19,Dead
2,2,63,FEMALE,-0.523030,1.76400,-0.370190,0.010815,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,24-Aug-18,08-Apr-20,Alive
3,3,78,FEMALE,-0.876180,0.12943,-0.370380,0.132190,I,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,16-Nov-18,28-Jul-20,Alive
4,4,42,FEMALE,0.226110,1.74910,-0.543970,-0.390210,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,12-Dec-18,05-Jan-19,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,329,59,FEMALE,0.024598,1.40050,0.024751,0.280320,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,15-Jan-19,27-Mar-20,Alive
330,330,41,FEMALE,0.100120,-0.46547,0.472370,-0.523870,I,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Modified Radical Mastectomy,25-Jul-18,23-Apr-19,Alive
331,331,54,FEMALE,0.753820,1.64250,-0.332850,0.857860,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Simple Mastectomy,26-Mar-19,11-Oct-19,Dead
332,332,74,FEMALE,0.972510,1.42680,-0.366570,-0.107820,II,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Nov-18,05-Dec-18,Alive


In [20]:
#5. Groupby
# Number of patients for every (tumour stage, patient status) pair. The count
# ends up in a column that keeps the name 'Patient_ID'.
survival_by_stage = (
    df.groupby(['Tumour_Stage', 'Patient_Status'])['Patient_ID']
    .count()
    .reset_index()
)
# Raw counts are hard to compare across stages, so add each count's share of
# its stage total; transform('sum') repeats that total on every row of the stage.
stage_totals = survival_by_stage.groupby('Tumour_Stage')['Patient_ID'].transform('sum')
share_of_stage = survival_by_stage['Patient_ID'] / stage_totals * 100
survival_by_stage['Percentage'] = share_of_stage.round(1)
# The percentage of survivors is slightly lower as the stage increases.
survival_by_stage


Unnamed: 0,Tumour_Stage,Patient_Status,Patient_ID,Percentage
0,I,Alive,51,83.6
1,I,Dead,10,16.4
2,II,Alive,144,79.1
3,II,Dead,38,20.9
4,III,Alive,60,76.9
5,III,Dead,18,23.1
