In [1]:
# Import the required libraries & modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [2]:
gbc_patients = pd.read_excel("GBC.xlsx")

### Names of the features in the dataset

In [3]:
gbc_patients.columns

Index(['MR_NO', 'Comorbidities_final', 'Thyroidism', 'Diabetes', 'HTN',
       'Family_cancer_history', 'ECOG_final', 'Gall_stone_final', 'Pathology',
       'Tumour_grade_final', 'Cancer_presentation_final', 'Tumour_stage_final',
       'TNM_classification', 'Metastasis_final', 'Site_metastasis',
       'Mets_site_final', 'Cholecystectomy_TMC', 'Outside_cholecystectomy',
       'Surgery_final', 'Treatment_final', 'Treatment_clean', 'Treatment',
       'Status', 'Date_event', 'Date_diagnosis', 'AGE', 'GENDER', 'COUNTRY',
       'STATE_NAME', 'CITY_NAME', 'PINCODE', 'First_Visit_Date', 'HEIGHT',
       'WEIGHT', 'BSA_DETAILS', 'BMI_DETAILS', 'BILIRUBIN_CONJUGATED',
       'BILIRUBIN_UNCONJUGATED', 'BILIRUBIN_TOTAL', 'AST_SGOT', 'ALT_SGPT',
       'ALK_PHOS_ALP', 'GAMMA_GT', 'ALBUMIN', 'CREATININE_from_UnC_test',
       'UREA_from_UnC_test', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG', 'HbA1c',
       'CREATININE_from_CREATININE_test', 'Diabetes_final'],
      dtype='object')

### Patient count by gender

In [4]:
gbc_patients['GENDER'].value_counts()

FEMA    462
MALE    236
Name: GENDER, dtype: int64

In [5]:
# The sheet stores the female label truncated to 'FEMA'; spell it out in full.
gbc_patients['GENDER'] = gbc_patients['GENDER'].replace({'FEMA': 'FEMALE'})


In [6]:
gbc_patients['GENDER'].value_counts()

FEMALE    462
MALE      236
Name: GENDER, dtype: int64

#### Overview of Dataset

In [7]:
gbc_patients.head()

Unnamed: 0,MR_NO,Comorbidities_final,Thyroidism,Diabetes,HTN,Family_cancer_history,ECOG_final,Gall_stone_final,Pathology,Tumour_grade_final,...,ALT_SGPT,ALK_PHOS_ALP,GAMMA_GT,ALBUMIN,CREATININE_from_UnC_test,UREA_from_UnC_test,ESTIMATED =_AVERAGE_GLUCOSE_eAG,HbA1c,CREATININE_from_CREATININE_test,Diabetes_final
0,MR/17/000051,,No,No,Yes,No,1,Yes,Adenocarcinoma,Grade 3,...,28.0,147.0,21.0,4.7,,,137.0,6.4,0.64,Yes
1,MR/17/000165,,No,No,No,No,Not known,No,Adenocarcinoma,Grade X,...,27.0,83.0,76.0,3.4,,,,,,No
2,MR/17/000504,,Hypothyroidism,Yes,Yes,Not known,Not known,Not known,Adenocarcinoma,Grade 3,...,,,,,,,,,,Yes
3,MR/17/000600,,No,No,Yes,No,1,No,Adenocarcinoma,Grade X,...,98.0,481.0,83.0,4.1,,,117.0,5.7,1.08,Yes
4,MR/17/000665,,No,No,No,No,1,Not known,Adenocarcinoma,Grade X,...,78.0,206.0,481.0,3.2,,,,,0.6,No


#### Datatypes of features and their non missing values count

In [8]:
gbc_patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 50 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   MR_NO                            698 non-null    object        
 1   Comorbidities_final              566 non-null    object        
 2   Thyroidism                       657 non-null    object        
 3   Diabetes                         659 non-null    object        
 4   HTN                              659 non-null    object        
 5   Family_cancer_history            657 non-null    object        
 6   ECOG_final                       660 non-null    object        
 7   Gall_stone_final                 657 non-null    object        
 8   Pathology                        698 non-null    object        
 9   Tumour_grade_final               646 non-null    object        
 10  Cancer_presentation_final        642 non-null    object       

### Missing Values in the original data ###

In [9]:
# Report, column by column, how many entries are missing.
print(
    "Total NULL Values in each columns",
    "**************************************",
    gbc_patients.isna().sum(),
    sep="\n",
)

Total NULL Values in each columns
**************************************
MR_NO                                0
Comorbidities_final                132
Thyroidism                          41
Diabetes                            39
HTN                                 39
Family_cancer_history               41
ECOG_final                          38
Gall_stone_final                    41
Pathology                            0
Tumour_grade_final                  52
Cancer_presentation_final           56
Tumour_stage_final                  78
TNM_classification                  80
Metastasis_final                    66
Site_metastasis                    165
Mets_site_final                     68
Cholecystectomy_TMC                209
Outside_cholecystectomy             41
Surgery_final                       38
Treatment_final                     56
Treatment_clean                     68
Treatment                           68
Status                             114
Date_event                    

### Supplementary Table S1 - Description of the gallbladder cancer patient dataset features

In [10]:
# Supplementary Table S1: one row per feature with its category, description,
# dtype label and number of missing values.

# Re-read the workbook so the table is built from the headers exactly as they
# appear in the file.
df = pd.read_excel('GBC.xlsx')

# Missing-value count of every column, computed once and looked up per feature
# (previously the first three rows carried a hard-coded 0).
missing_counts = df.isna().sum()

# (category, column name in the workbook, description, dtype label).
# An empty category means "same category as the row above".
feature_specs = [
    ("Patient Information", "MR_NO", "ID of the specific patient", "Object"),
    ("", "AGE", "Age of the patient", "int64"),
    ("", "GENDER", "Gender of the patient", "Object"),
    ("", "WEIGHT", "Weight of the patient", "float64"),
    ("", "HEIGHT", "Height of the patient", "float64"),
    ("", "BSA_DETAILS", "Body Surface Area (BSA) details", "float64"),
    ("", "BMI_DETAILS", "Body Mass Index (BMI) details", "float64"),

    ("Comorbidities and History", "Comorbidities_final", "Final comorbidities", "Object"),
    ("", "Thyroidism", "Presence of thyroidism", "Object"),
    ("", "Diabetes", "Presence of diabetes", "Object"),
    ("", "HTN", "Hypertension", "Object"),
    ("", "Family_cancer_history", "Family history of cancer", "Object"),
    ("", "Diabetes_final", "Final status of diabetes", "Object"),

    ("Cancer Specific Information", "ECOG_final", "Eastern Cooperative Oncology Group performance status", "Object"),
    ("", "Gall_stone_final", "Final status of gall stones", "Object"),
    ("", "Pathology", "Pathology of the cancer", "Object"),
    ("", "Tumour_grade_final", "Final grade of the tumor", "Object"),
    ("", "Cancer_presentation_final", "Final presentation of the cancer", "Object"),
    ("", "Tumour_stage_final", "Final stage of the tumor", "Object"),
    ("", "TNM_classification", "TNM classification of the cancer", "Object"),
    ("", "Metastasis_final", "Final status of metastasis", "Object"),
    ("", "Site_metastasis", "Site of metastasis", "Object"),
    ("", "Mets_site_final", "Final site of metastasis", "Object"),

    ("Treatment Information", "Cholecystectomy_TMC", "Cholecystectomy at TMC", "Object"),
    ("", "Outside_cholecystectomy", "Cholecystectomy outside TMC", "Object"),
    ("", "Surgery_final", "Final status of surgery", "Object"),
    ("", "Treatment_final", "Final treatment", "Object"),
    ("", "Treatment_clean", "Cleaned treatment data", "Object"),
    ("", "Treatment", "Treatment", "Object"),

    ("Target Variable", "Status", "Status of the treatment (Alive, Dead, Not Known, etc.)", "Object"),

    ("Event Dates", "Date_event", "Date of a relevant event (Death or Survival)", "datetime64[ns]"),
    ("", "Date_diagnosis", "Date of diagnosis", "datetime64[ns]"),

    ("Laboratory Tests", "BILIRUBIN_CONJUGATED", "Conjugated bilirubin", "float64"),
    ("", "BILIRUBIN_UNCONJUGATED", "Unconjugated bilirubin", "float64"),
    ("", "BILIRUBIN_TOTAL", "Total bilirubin", "float64"),
    ("", "AST_SGOT", "Aspartate aminotransferase", "float64"),
    ("", "ALT_SGPT", "Alanine aminotransferase", "float64"),
    ("", "ALK_PHOS_ALP", "Alkaline phosphatase", "float64"),
    ("", "GAMMA_GT", "Gamma-glutamyltransferase", "float64"),
    ("", "ALBUMIN", "Albumin", "float64"),
    ("", "CREATININE_from_UnC_test", "Creatinine from unclassified test", "float64"),
    ("", "UREA_from_UnC_test", "Urea from unclassified test", "float64"),
    ("", "ESTIMATED =_AVERAGE_GLUCOSE_eAG", "Estimated average glucose", "float64"),
    ("", "HbA1c", "Hemoglobin A1c", "float64"),
    ("", "CREATININE_from_CREATININE_test", "Creatinine from creatinine test", "float64"),
]

# Names shown in the table where they differ from the workbook header.
display_names = {
    'MR_NO': 'Sample ID',
    'BILIRUBIN_CONJUGATED': 'BILIRUBIN CONJUGATED',
    'BILIRUBIN_UNCONJUGATED': 'BILIRUBIN UNCONJUGATED',
    'BILIRUBIN_TOTAL': 'BILIRUBIN TOTAL',
    'AST_SGOT': 'AST - SGOT',
    'ALT_SGPT': 'ALT - SGPT',
    'ALK_PHOS_ALP': 'ALK PHOS [ALP]',
    'GAMMA_GT': 'GAMMA GT',
    'ESTIMATED =_AVERAGE_GLUCOSE_eAG': 'ESTIMATED AVERAGE GLUCOSE (eAG)',
}

# Category, Feature Name, Description, Data Type, Missing Values (Count)
metadata = [
    [category, display_names.get(column, column), description, dtype_label, missing_counts[column]]
    for category, column, description, dtype_label in feature_specs
]

metadata_df = pd.DataFrame(
    metadata,
    columns=["Category", "Feature Name", "Description", "Data Type", "Missing Values (Count)"],
)

# Display the metadata table
metadata_df



Unnamed: 0,Category,Feature Name,Description,Data Type,Missing Values (Count)
0,Patient Information,Sample ID,ID of the specific patient,Object,0
1,,AGE,Age of the patient,int64,0
2,,GENDER,Gender of the patient,Object,0
3,,WEIGHT,Weight of the patient,float64,132
4,,HEIGHT,Height of the patient,float64,132
5,,BSA_DETAILS,Body Surface Area (BSA) details,float64,132
6,,BMI_DETAILS,Body Mass Index (BMI) details,float64,132
7,Comorbidities and History,Comorbidities_final,Final comorbidities,Object,132
8,,Thyroidism,Presence of thyroidism,Object,41
9,,Diabetes,Presence of diabetes,Object,39


### Supplementary Table 2 - Status of the patients ###

In [11]:
gbc_patients["Status"].value_counts()

Dead         460
Not Known     63
Alive         34
Live          26
No             1
Name: Status, dtype: int64

In [12]:
# Tally of each recorded Status label; rows with a missing Status are not tallied.
status_counts = gbc_patients['Status'].value_counts()

# The denominator is every row of the frame, including rows whose Status is
# missing, so the percentages below do not add up to 100.
total_occurrences = len(gbc_patients)

# Share of all patients carrying each Status label, in percent.
percentage_occurrences = status_counts.div(total_occurrences).mul(100)

print(percentage_occurrences)

Dead         65.902579
Not Known     9.025788
Alive         4.871060
Live          3.724928
No            0.143266
Name: Status, dtype: float64


**Above is the percentage distribution of patients w.r.t their survival outcome**

**The dataset is highly imbalanced, which makes it difficult to build an accurate prediction model.**

### Renaming the features to make them clearer

In [13]:

# Raw workbook header -> clearer display header.
#
# Only headers that actually exist in the sheet are listed. Headers not listed
# here (e.g. 'ALK_PHOS_ALP', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG', 'HbA1c',
# 'ALBUMIN') keep their raw names, which is what the cells below rely on.
#
# 'Diabetes_final' is deliberately NOT renamed to 'Diabetes': the sheet already
# has a 'Diabetes' column, so that rename produced two columns with the same
# name, and gbc_patients['Diabetes'] then returned a two-column DataFrame
# instead of a Series.
new_column_names = {
    'Thyroidism': 'Thyroid',
    'HTN': 'Hypertension',
    'BSA_DETAILS': 'BSA_BODY_SURFACE_AREA',
    'BMI_DETAILS': 'BMI-BODY MASS INDEX',
    'ECOG_final': 'ECOG',
    'Gall_stone_final': 'Gallstones',
    'Tumour_stage_final': 'Tumour stage',
    'Tumour_grade_final': 'Tumour grade',
    'Mets_site_final': 'Metastasis site',
    'Surgery_final': 'Surgery',
    'Treatment_final': 'Treatment administered',
    'Cancer_presentation_final': 'Cancer presentation',
    'GAMMA_GT': 'Gamma-glutamyl-transferase',
    'BILIRUBIN_CONJUGATED': 'BILIRUBIN CONJUGATED',
    'BILIRUBIN_UNCONJUGATED': 'BILIRUBIN UNCONJUGATED',
    'BILIRUBIN_TOTAL': 'BILIRUBIN TOTAL',
    'AST_SGOT': 'AST-SGOT',
    'ALT_SGPT': 'ALT-SGPT',
    'CREATININE_from_UnC_test': 'Creatinine (UnC test)',
    'UREA_from_UnC_test': 'Urea (UnC test)',
    'CREATININE_from_CREATININE_test': 'Creatinine (creatinine test)',
}

gbc_patients = gbc_patients.rename(columns=new_column_names)

# Every header must stay unique so that column lookups return a single Series.
assert gbc_patients.columns.is_unique, "renaming produced duplicate column names"

### Supplementary Table S3 - Summary statistics for kidney and glycemic markers (compiled in an Excel sheet from the analyses below, restricted to patients with Status = "Dead" or "Alive")

In [14]:
from tabulate import tabulate

# Numeric columns to summarise, using the headers in force after the rename cell.
numerical_features = [
    'AGE', 'HEIGHT', 'WEIGHT', 'BSA_BODY_SURFACE_AREA', 'BMI-BODY MASS INDEX',
    'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED', 'BILIRUBIN TOTAL',
    'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP', 'Gamma-glutamyl-transferase', 'ALBUMIN',
    'Creatinine (UnC test)', 'Urea (UnC test)', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG',
    'HbA1c', 'Creatinine (creatinine test)',
]

# Presentation labels for the describe() columns that get a new header.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}

for feature in numerical_features:
    # describe() per Status group; Status stays as the index of the result.
    summary_stats = gbc_patients.groupby('Status')[feature].describe()
    summary_table = summary_stats.rename(columns=describe_labels)

    print(f"Summary Statistics for {feature}:")
    print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))
    print("\n")  # blank lines between consecutive tables


Summary Statistics for AGE:
╒═══════════╤═════════╤═════════╤══════════════════════╤═══════╤═══════╤══════════╤═══════╤═══════╕
│ Status    │   Count │    Mean │   Standard Deviation │   Min │   25% │   Median │   75% │   Max │
╞═══════════╪═════════╪═════════╪══════════════════════╪═══════╪═══════╪══════════╪═══════╪═══════╡
│ Alive     │      34 │ 60.5588 │              9.24479 │    39 │    55 │     61.5 │ 66.75 │    82 │
├───────────┼─────────┼─────────┼──────────────────────┼───────┼───────┼──────────┼───────┼───────┤
│ Dead      │     460 │ 58.1609 │             10.4758  │    20 │    51 │     59   │ 65    │    87 │
├───────────┼─────────┼─────────┼──────────────────────┼───────┼───────┼──────────┼───────┼───────┤
│ Live      │      26 │ 55.8846 │             13.3456  │    23 │    48 │     58   │ 67.5  │    74 │
├───────────┼─────────┼─────────┼──────────────────────┼───────┼───────┼──────────┼───────┼───────┤
│ No        │       1 │ 37      │            nan       │    37 │    37 │

### Extra EDA for understanding the dataset - NOT USEFUL

### 1. Continuous Variables ###
#### 1.1 AGE ###

In [None]:
# Density histogram with a KDE overlay for AGE.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(gbc_patients["AGE"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
gbc_patients["AGE"].median()

In [None]:
gbc_patients["AGE"].mean()

In [None]:
# Calculate summary statistics
summary_stats = gbc_patients.groupby('Status')['AGE'].describe()
print(summary_stats)

In [None]:
# Boxplot
sns.boxplot(x='Status', y='AGE', data=gbc_patients)

**1- The ages of the live and dead patients do not differ much in the given dataset**

**2- The difference between the median and mean ages within the separate "Status" categories is also very small**

In [None]:
# Histograms or Density Plots
sns.displot(data=gbc_patients, x='AGE', hue='Status', kind='kde', fill=True)


#### 1.2. Weight

In [None]:
# Density histogram with a KDE overlay for WEIGHT.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(gbc_patients["WEIGHT"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
gbc_patients["WEIGHT"].median()

In [None]:
gbc_patients["WEIGHT"].mean()

In [None]:
from tabulate import tabulate

# describe() of WEIGHT within each Status group.
summary_stats = gbc_patients.groupby('Status')['WEIGHT'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


#### 1.3. Height

In [None]:
# Density histogram with a KDE overlay for HEIGHT.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(gbc_patients["HEIGHT"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of HEIGHT within each Status group.
summary_stats = gbc_patients.groupby('Status')['HEIGHT'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


#### 1.4. BSA_DETAILS

In [None]:
# Density histogram with a KDE overlay for body surface area.
# The rename cell (In [13]) turned 'BSA_DETAILS' into 'BSA_BODY_SURFACE_AREA',
# so the raw header no longer exists on gbc_patients at this point.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(gbc_patients["BSA_BODY_SURFACE_AREA"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of body surface area within each Status group.
# The rename cell (In [13]) turned 'BSA_DETAILS' into 'BSA_BODY_SURFACE_AREA';
# grouping on the raw header raised KeyError on a top-to-bottom run.
summary_stats = gbc_patients.groupby('Status')['BSA_BODY_SURFACE_AREA'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


#### 1.5. BMI_DETAILS

In [None]:
# Density histogram with a KDE overlay for body mass index.
# The rename cell (In [13]) turned 'BMI_DETAILS' into 'BMI-BODY MASS INDEX',
# so the raw header no longer exists on gbc_patients at this point.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(gbc_patients["BMI-BODY MASS INDEX"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of body mass index within each Status group.
# The rename cell (In [13]) turned 'BMI_DETAILS' into 'BMI-BODY MASS INDEX';
# grouping on the raw header raised KeyError on a top-to-bottom run.
summary_stats = gbc_patients.groupby('Status')['BMI-BODY MASS INDEX'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


#### 1.6. HbA1c

In [None]:
# Density histogram with a KDE overlay for HbA1c.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(gbc_patients["HbA1c"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of HbA1c within each Status group.
summary_stats = gbc_patients.groupby('Status')['HbA1c'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
gbc_patients.columns

### 2. LFT (Liver Function Test)  vs Status ###

In [None]:
# Liver-function-test (LFT) markers plus the outcome column.
# The rename cell (In [13]) changes several of these headers on gbc_patients, so
# selecting the raw names directly raised KeyError on a top-to-bottom run. Map
# the renamed headers back to the raw ones first; rename() skips keys that are
# absent, so this also works on a frame that still has the raw headers. The LFT
# cells below keep addressing `lft` by raw header.
lft_renamed_to_raw = {
    'AST-SGOT': 'AST_SGOT',
    'ALT-SGPT': 'ALT_SGPT',
    'BILIRUBIN TOTAL': 'BILIRUBIN_TOTAL',
    'Gamma-glutamyl-transferase': 'GAMMA_GT',
}
lft = gbc_patients.rename(columns=lft_renamed_to_raw)[
    ["AST_SGOT", "ALT_SGPT", "ALK_PHOS_ALP", "ALBUMIN", "BILIRUBIN_TOTAL", "GAMMA_GT", "Status"]
]

In [None]:
lft.shape

#### 2.1 LFT - AST vs Status ####

In [None]:
# Density histogram with a KDE overlay for AST (SGOT).
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(lft["AST_SGOT"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of AST_SGOT within each Status group.
summary_stats = lft.groupby('Status')['AST_SGOT'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


**The values of AST - SGOT range from 12 to 2203**

**The mean and median values of AST - SGOT for dead patients are mostly higher**


#### 2.2 LFT - ALT - SGPT vs Status

In [None]:
# Density histogram with a KDE overlay for ALT (SGPT).
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(lft["ALT_SGPT"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of ALT_SGPT within each Status group.
summary_stats = lft.groupby('Status')['ALT_SGPT'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


**The values of ALT - SGPT range from 9 to 779**

**The mean and median values of ALT - SGPT for dead patients are mostly higher**

#### 2.3 LFT - ALK PHOS [ALP] vs Status

In [None]:
# Density histogram with a KDE overlay for alkaline phosphatase.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(lft["ALK_PHOS_ALP"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of ALK_PHOS_ALP within each Status group.
summary_stats = lft.groupby('Status')['ALK_PHOS_ALP'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


**The values of ALK PHOS [ALP] range from 44 to 2740**

**The mean and median values of ALK PHOS [ALP] for dead patients are mostly higher than the alive patients**

In [None]:
# Histograms or Density Plots
sns.displot(data=lft, x='ALK_PHOS_ALP', hue='Status', kind='kde', fill=True)


#### 2.4. LFT- ALBUMIN vs Status

In [None]:
# Density histogram with a KDE overlay for albumin.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(lft["ALBUMIN"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of ALBUMIN within each Status group.
summary_stats = lft.groupby('Status')['ALBUMIN'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
# Histograms or Density Plots
sns.displot(data=lft, x='ALBUMIN', hue='Status', kind='kde', fill=True)


**The values of ALBUMIN range from 1.8 to 5**

**The mean and median values of ALBUMIN for dead patients are mostly lower than the alive patients**

#### 2.5. LFT- BILIRUBIN TOTAL vs Status

In [None]:
# Density histogram with a KDE overlay for total bilirubin.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(lft["BILIRUBIN_TOTAL"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of BILIRUBIN_TOTAL within each Status group.
summary_stats = lft.groupby('Status')['BILIRUBIN_TOTAL'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
# Histograms or Density Plots
sns.displot(data=lft, x='BILIRUBIN_TOTAL', hue='Status', kind='kde', fill=True)


**The values of BILIRUBIN TOTAL range from 0.1 to 33.4**

**The mean values of BILIRUBIN TOTAL for dead patients are mostly higher than the alive patients**

#### 2.6. LFT - GAMMA GT vs Status

In [None]:
# Density histogram with a KDE overlay for gamma GT.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(lft["GAMMA_GT"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of GAMMA_GT within each Status group.
summary_stats = lft.groupby('Status')['GAMMA_GT'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
# Histograms or Density Plots
sns.displot(data=lft, x='GAMMA_GT', hue='Status', kind='kde', fill=True)


**The values of GAMMA GT range from 11 to 3776**

**The mean and median values of GAMMA GT for dead patients are mostly higher than the alive patients**

### SUMMARY Statistics of LFT Test

In [None]:
# Keep only the numeric LFT markers. `lft` is overwritten here, so on a second
# run of this cell "Status" is already gone; errors="ignore" keeps the cell
# re-runnable instead of raising KeyError.
lft = lft.drop(columns="Status", errors="ignore")

# Median of each marker
median_values = lft.median()

# Quartiles and inter-quartile range of each marker
Q1 = lft.quantile(0.25)
Q3 = lft.quantile(0.75)
IQR = Q3 - Q1

# Number of non-null values for each marker
non_null_count = lft.count()

# Quartiles are shown as strings fixed to two decimals
Q1_str = Q1.apply(lambda x: f'{x:.2f}')
Q3_str = Q3.apply(lambda x: f'{x:.2f}')

# One row per marker
result_table = pd.DataFrame({
    "Patient's Count": non_null_count.values,
    'Feature': median_values.index,
    'Median': median_values.values,
    'Q1 (0.25)': Q1_str.values,
    'Q3 (0.75)': Q3_str.values,
    'IQR': IQR.values
})

# Render the table as a box-drawn grid.
print(tabulate(result_table, headers='keys', tablefmt='fancy_grid'))


### 3. KFT (Kidney Function Test) vs Status

In [None]:
gbc_patients.info()

In [None]:
# Kidney-function-test (KFT) markers plus the outcome column.
# The rename cell (In [13]) changes all three marker headers on gbc_patients, so
# selecting the raw names directly raised KeyError on a top-to-bottom run. Map
# the renamed headers back to the raw ones first; rename() skips keys that are
# absent, so this also works on a frame that still has the raw headers. The KFT
# cells below keep addressing `kft` by raw header.
kft_renamed_to_raw = {
    'Creatinine (UnC test)': 'CREATININE_from_UnC_test',
    'Creatinine (creatinine test)': 'CREATININE_from_CREATININE_test',
    'Urea (UnC test)': 'UREA_from_UnC_test',
}
kft = gbc_patients.rename(columns=kft_renamed_to_raw)[
    ["CREATININE_from_UnC_test", "CREATININE_from_CREATININE_test", "UREA_from_UnC_test", "Status"]
]

In [None]:
kft.shape

In [None]:
kft.isnull().sum()

#### 3.1. KFT- CREATININE_from_UnC_test

In [None]:
# Density histogram with a KDE overlay for CREATININE_from_UnC_test.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(kft["CREATININE_from_UnC_test"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of CREATININE_from_UnC_test within each Status group.
summary_stats = kft.groupby('Status')['CREATININE_from_UnC_test'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
# Histograms or Density Plots
sns.displot(data=kft, x='CREATININE_from_UnC_test', hue='Status', kind='kde', fill=True)


#### 3.2. KFT- CREATININE_from_CREATININE_test

In [None]:
# Density histogram with a KDE overlay for CREATININE_from_CREATININE_test.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(kft["CREATININE_from_CREATININE_test"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of CREATININE_from_CREATININE_test within each Status group.
summary_stats = kft.groupby('Status')['CREATININE_from_CREATININE_test'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
# Histograms or Density Plots
sns.displot(data=kft, x='CREATININE_from_CREATININE_test', hue='Status', kind='kde', fill=True)


#### 3.3. KFT- UREA_from_UnC_test

In [None]:
# Density histogram with a KDE overlay for UREA_from_UnC_test.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(kft["UREA_from_UnC_test"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from tabulate import tabulate

# describe() of UREA_from_UnC_test within each Status group.
summary_stats = kft.groupby('Status')['UREA_from_UnC_test'].describe()

# Relabel the describe() columns for presentation; Status remains the index.
describe_labels = {'count': 'Count', 'mean': 'Mean', 'std': 'Standard Deviation',
                   'min': 'Min', '50%': 'Median', 'max': 'Max'}
summary_table = summary_stats.rename(columns=describe_labels)

# Render the table as a box-drawn grid.
print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))


In [None]:
# Histograms or Density Plots
sns.displot(data=kft, x='UREA_from_UnC_test', hue='Status', kind='kde', fill=True)


In [None]:
# Headers of every column stored with the object dtype.
categorical_columns = gbc_patients.select_dtypes(include=['object']).columns

# Column header -> frequency of each value in that column.
category_counts = {
    column: gbc_patients[column].value_counts() for column in categorical_columns
}

# One labelled frequency listing per column, each followed by a blank line.
for column, counts in category_counts.items():
    print(f'Counts for {column}:', counts, '', sep='\n')

In [None]:
###### FILTERED DATA ######

In [None]:
# Keep a patient when the Status is 'Alive'/'Live', or when both the event date
# and the diagnosis date are recorded.
is_alive = gbc_patients['Status'].isin(['Alive', 'Live'])
has_both_dates = gbc_patients['Date_event'].notna() & gbc_patients['Date_diagnosis'].notna()
df_filter = gbc_patients[is_alive | has_both_dates]

print('Shape of the filtered data:', df_filter.shape)
df_filter.head(30)

In [None]:
df_filter.isnull().sum()

In [None]:
# Columns left out of the working dataset. Four of them are renamed by the
# rename cell (In [13]), so both the raw and the renamed header are listed;
# with the raw names alone the drop raised KeyError on a top-to-bottom run.
columns_to_drop = [
    'MR_NO', 'Comorbidities_final', 'TNM_classification', 'Metastasis_final',
    'Site_metastasis', 'Cholecystectomy_TMC', 'Outside_cholecystectomy',
    'Treatment_clean', 'Treatment', 'Date_event', 'Date_diagnosis',
    'COUNTRY', 'STATE_NAME', 'CITY_NAME', 'PINCODE', 'HEIGHT', 'WEIGHT',
    'First_Visit_Date', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG', 'HbA1c',
    'BSA_DETAILS', 'BSA_BODY_SURFACE_AREA',
    'CREATININE_from_UnC_test', 'Creatinine (UnC test)',
    'UREA_from_UnC_test', 'Urea (UnC test)',
    'CREATININE_from_CREATININE_test', 'Creatinine (creatinine test)',
]

# errors="ignore" skips whichever spelling is absent and keeps the cell
# re-runnable (df_filter is overwritten, so a second run finds nothing to drop).
df_filter = df_filter.drop(columns=columns_to_drop, errors="ignore")


In [None]:
df_filter.shape

In [None]:
# Export DataFrame to an Excel file
df_filter.to_excel('working_gbc_1.xlsx', index=False)

In [None]:

# Raw workbook header -> clearer display header (same mapping as the rename
# cell near the top of the notebook). rename() skips keys that are not present,
# so running this on an already-renamed frame changes nothing.
#
# Only headers that exist in the sheet are listed. 'Diabetes_final' is
# deliberately NOT renamed to 'Diabetes': the sheet already has a 'Diabetes'
# column, so that rename would give two columns the same name.
new_column_names = {
    'Thyroidism': 'Thyroid',
    'HTN': 'Hypertension',
    'BSA_DETAILS': 'BSA_BODY_SURFACE_AREA',
    'BMI_DETAILS': 'BMI-BODY MASS INDEX',
    'ECOG_final': 'ECOG',
    'Gall_stone_final': 'Gallstones',
    'Tumour_stage_final': 'Tumour stage',
    'Tumour_grade_final': 'Tumour grade',
    'Mets_site_final': 'Metastasis site',
    'Surgery_final': 'Surgery',
    'Treatment_final': 'Treatment administered',
    'Cancer_presentation_final': 'Cancer presentation',
    'GAMMA_GT': 'Gamma-glutamyl-transferase',
    'BILIRUBIN_CONJUGATED': 'BILIRUBIN CONJUGATED',
    'BILIRUBIN_UNCONJUGATED': 'BILIRUBIN UNCONJUGATED',
    'BILIRUBIN_TOTAL': 'BILIRUBIN TOTAL',
    'AST_SGOT': 'AST-SGOT',
    'ALT_SGPT': 'ALT-SGPT',
    'CREATININE_from_UnC_test': 'Creatinine (UnC test)',
    'UREA_from_UnC_test': 'Urea (UnC test)',
    'CREATININE_from_CREATININE_test': 'Creatinine (creatinine test)',
}

gbc_patients = gbc_patients.rename(columns=new_column_names)

In [None]:
# Relabel the 'Live' outcome as 'Alive'; all other Status values are unchanged.
# (pandas is already imported as pd in the first cell.)
gbc_patients['Status'] = gbc_patients['Status'].replace('Live', 'Alive')


In [None]:
# Preview the first rows of gbc_patients.
gbc_patients.head()

In [None]:
# List the current column labels of gbc_patients.
gbc_patients.columns

In [None]:
# Figure 1: histogram with KDE overlay for nine numeric columns, laid out on a
# 3x3 grid and written to 'Figure 1.png'.
# (seaborn and matplotlib are already imported in the first cell.)
numerical_features = [
    'AGE',
    'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED', 'BILIRUBIN TOTAL',
    'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP',
    'Gamma-glutamyl-transferase', 'ALBUMIN',
]

fig = plt.figure(figsize=(12, 10))

for position, column in enumerate(numerical_features, start=1):
    panel = fig.add_subplot(3, 3, position)
    sns.histplot(data=gbc_patients, x=column, kde=True, ax=panel)
    # The line break keeps long column names from overlapping neighbouring panels.
    panel.set_title(f'Distribution of\n{column}', fontsize=11)
    panel.tick_params(axis='x', labelrotation=0)  # keep x tick labels horizontal

fig.tight_layout(pad=3.0)
fig.savefig('Figure 1.png')
plt.show()


In [None]:
# List the current column labels of gbc_patients.
gbc_patients.columns

In [None]:

# Display names for the creatinine / urea columns. Entries that mapped a name
# to itself had no effect on rename() and are omitted. Keys missing from the
# frame are ignored by rename(), so this is a no-op once the names are applied.
new_column_names = dict([
    ('CREATININE_from_UnC_test', 'Creatinine (UnC test)'),
    ('UREA_from_UnC_test', 'Urea (UnC test)'),
    ('CREATININE_from_CREATININE_test', 'Creatinine (creatinine test)'),
])

gbc_patients.rename(columns=new_column_names, inplace=True)

In [None]:
# Supplementary figure: histogram with KDE overlay for eight numeric columns on
# a 3x3 grid (the ninth slot is never created), written to 'Figure S1.png'.
# (seaborn and matplotlib are already imported in the first cell.)
numerical_features = [
    'WEIGHT', 'HEIGHT', 'BMI-BODY MASS INDEX',
    'Creatinine (UnC test)', 'Urea (UnC test)',
    'ESTIMATED =_AVERAGE_GLUCOSE_eAG', 'HbA1c',
    'Creatinine (creatinine test)',
]

fig = plt.figure(figsize=(12, 10))

for slot, column in enumerate(numerical_features, start=1):
    panel = fig.add_subplot(3, 3, slot)
    sns.histplot(data=gbc_patients, x=column, kde=True, ax=panel)
    panel.set_title(f'\n{column}', fontsize=11)  # leading newline adds space above the title
    panel.tick_params(axis='x', labelrotation=0)  # keep x tick labels horizontal

fig.tight_layout(pad=3.0)
fig.savefig('Figure S1.png')
plt.show()


In [None]:
# Supplementary figure, second version: same eight histograms without panel
# titles, and with the BMI x-axis restricted to 0-100. It is saved under
# 'Figure S1.png', the same file name the previous cell writes, so it replaces
# that image.
# (seaborn and matplotlib are already imported in the first cell.)
numerical_features = [
    'WEIGHT', 'HEIGHT', 'BMI-BODY MASS INDEX',
    'Creatinine (UnC test)', 'Urea (UnC test)',
    'ESTIMATED =_AVERAGE_GLUCOSE_eAG', 'HbA1c',
    'Creatinine (creatinine test)',
]

fig = plt.figure(figsize=(12, 10))

for slot, column in enumerate(numerical_features, start=1):
    panel = fig.add_subplot(3, 3, slot)
    sns.histplot(data=gbc_patients, x=column, kde=True, ax=panel)
    if column == 'BMI-BODY MASS INDEX':
        # Clip the BMI axis so the bulk of the distribution stays readable.
        panel.set_xlim(0, 100)
    panel.tick_params(axis='x', labelrotation=0)  # keep x tick labels horizontal

fig.tight_layout(pad=3.0)
fig.savefig('Figure S1.png')
plt.show()


In [None]:
# Number of rows per Status value (missing values are not counted by default).
gbc_patients['Status'].value_counts()

In [None]:
# Box plot of every numeric column split by Status, three panels per row.
# (seaborn and matplotlib are already imported in the first cell.)
numerical_features = [
    'AGE', 'HEIGHT', 'WEIGHT', 'BSA_BODY_SURFACE_AREA', 'BMI-BODY MASS INDEX',
    'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED', 'BILIRUBIN TOTAL',
    'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP', 'Gamma-glutamyl-transferase', 'ALBUMIN',
    'Creatinine (UnC test)', 'Urea (UnC test)', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG',
    'HbA1c', 'Creatinine (creatinine test)',
]

# Ceiling of len / 3 so a partially filled last row still gets space.
full_rows, remainder = divmod(len(numerical_features), 3)
num_rows = full_rows + (1 if remainder else 0)

fig = plt.figure(figsize=(20, 5 * num_rows))

for slot, column in enumerate(numerical_features, start=1):
    panel = fig.add_subplot(num_rows, 3, slot)
    sns.boxplot(data=gbc_patients, x='Status', y=column, ax=panel)
    panel.set_title(f'Boxplot of {column} by Status', fontsize=10)
    panel.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()


In [None]:
# List the current column labels of gbc_patients.
gbc_patients.columns

In [None]:
# Histograms (with KDE) of nine numeric columns, coloured by Status, on a 3x3
# grid; the exact data behind the figure is then exported to Excel.
# (seaborn, matplotlib and pandas are already imported in the first cell.)
numerical_features = ['AGE', 'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED',
                      'BILIRUBIN TOTAL', 'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP',
                      'Gamma-glutamyl-transferase', 'ALBUMIN']

target_variable = 'Status'

# Subset used for BOTH the figure and the export, so the two cannot drift apart.
data_for_plots = gbc_patients[numerical_features + [target_variable]]

num_rows = 3
num_cols = 3
# The figure is laid out with tight_layout below. It is deliberately NOT created
# with constrained_layout=True: the two layout engines are mutually exclusive,
# and calling tight_layout on a constrained figure only switches the engine and
# emits a warning.
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 15))

for ax, feature in zip(axes.flat, numerical_features):
    # Passing the palette by name lets seaborn take exactly as many 'Set2'
    # colours as there are Status levels (no "more values than needed" warning).
    sns.histplot(data=data_for_plots, x=feature, hue=target_variable,
                 kde=True, ax=ax, palette='Set2')
    ax.set_title(f'{feature} by {target_variable}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

fig.tight_layout(pad=3)
plt.show()

# Save the plotted data to an Excel file (row index omitted).
excel_filename = 'numerical_data_for_plots_numerical_stacked_520.xlsx'
data_for_plots.to_excel(excel_filename, index=False)
print(f"Data saved to '{excel_filename}'")


In [None]:
# Histogram with KDE of every numeric column, coloured by Status, laid out two
# panels per row.
# (seaborn and matplotlib are already imported in the first cell.)
numerical_features = [
    'AGE', 'HEIGHT', 'WEIGHT', 'BSA_BODY_SURFACE_AREA', 'BMI-BODY MASS INDEX',
    'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED', 'BILIRUBIN TOTAL',
    'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP', 'Gamma-glutamyl-transferase', 'ALBUMIN',
    'Creatinine (UnC test)', 'Urea (UnC test)', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG',
    'HbA1c', 'Creatinine (creatinine test)',
]

# Grid dimensions: at most two columns, with enough rows to hold every feature.
num_features = len(numerical_features)
num_cols = min(2, num_features)
num_rows = (num_features + num_cols - 1) // num_cols

fig = plt.figure(figsize=(15, 5 * num_rows))

for slot, column in enumerate(numerical_features, start=1):
    panel = fig.add_subplot(num_rows, num_cols, slot)
    sns.histplot(data=gbc_patients, x=column, hue='Status', kde=True, fill=True, ax=panel)
    panel.set_title(f'Histogram of {column} by Status', fontsize=10)
    panel.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()


In [None]:
from tabulate import tabulate

# Print describe() statistics of each numeric column, grouped by Status, as a
# 'fancy_grid' text table.
numerical_features = [
    'AGE', 'HEIGHT', 'WEIGHT', 'BSA_BODY_SURFACE_AREA', 'BMI-BODY MASS INDEX',
    'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED', 'BILIRUBIN TOTAL',
    'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP', 'Gamma-glutamyl-transferase', 'ALBUMIN',
    'Creatinine (UnC test)', 'Urea (UnC test)', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG',
    'HbA1c', 'Creatinine (creatinine test)',
]

# Readable headers for the describe() columns; '25%' and '75%' keep their names.
stat_headers = {
    'count': 'Count',
    'mean': 'Mean',
    'std': 'Standard Deviation',
    'min': 'Min',
    '50%': 'Median',
    'max': 'Max',
}

for feature in numerical_features:
    summary_stats = gbc_patients.groupby('Status')[feature].describe()
    # describe() on a groupby is already indexed by 'Status', so only the
    # column headers need renaming.
    summary_table = summary_stats.rename(columns=stat_headers)

    print(f"Summary Statistics for {feature}:")
    print(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))
    print("\n")  # blank lines between consecutive tables


In [None]:
#!pip install XlsxWriter

In [None]:
import os

from tabulate import tabulate

# For each numeric column, write the per-Status describe() table to
# output_files/<feature>_summary_statistics.txt (fancy_grid text) and
# output_files/<feature>_summary_statistics.xlsx.
numerical_features = [
    'AGE', 'HEIGHT', 'WEIGHT', 'BSA_BODY_SURFACE_AREA', 'BMI-BODY MASS INDEX',
    'BILIRUBIN CONJUGATED', 'BILIRUBIN UNCONJUGATED', 'BILIRUBIN TOTAL',
    'AST-SGOT', 'ALT-SGPT', 'ALK_PHOS_ALP', 'Gamma-glutamyl-transferase', 'ALBUMIN',
    'Creatinine (UnC test)', 'Urea (UnC test)', 'ESTIMATED =_AVERAGE_GLUCOSE_eAG',
    'HbA1c', 'Creatinine (creatinine test)',
]

# Output directory is created on demand; an existing directory is reused.
output_dir = 'output_files'
os.makedirs(output_dir, exist_ok=True)

# Readable headers for the describe() columns; '25%' and '75%' keep their names.
stat_headers = {
    'count': 'Count',
    'mean': 'Mean',
    'std': 'Standard Deviation',
    'min': 'Min',
    '50%': 'Median',
    'max': 'Max',
}

for feature in numerical_features:
    summary_table = (
        gbc_patients.groupby('Status')[feature]
        .describe()
        .rename(columns=stat_headers)
    )

    # Both outputs share the same stem and differ only in extension.
    output_stem = os.path.join(output_dir, f'{feature}_summary_statistics')

    with open(output_stem + '.txt', 'w', encoding='utf-8') as text_file:
        text_file.write(f"Summary Statistics for {feature}:\n")
        text_file.write(tabulate(summary_table, headers='keys', tablefmt='fancy_grid'))

    summary_table.to_excel(output_stem + '.xlsx')


In [None]:
# Display names for the categorical columns. The targets match the names used
# by the earlier rename cell and by every plotting cell below ('Cancer
# presentation', 'Tumour stage', 'Metastasis site', 'Treatment administered').
# The previous underscore variants ('Tumour_stage', 'Metastatic_site', ...)
# would have produced columns that no later cell looks up.
# Keys that are no longer present (because the earlier rename already ran) are
# ignored by rename(), so this cell is safe to run in either situation.
# Entries that mapped a name to itself had no effect and are omitted.
new_column_names = {
    'Thyroidism': 'Thyroid',
    'HTN': 'Hypertension',
    'ECOG_final': 'ECOG',
    'Gall_stone_final': 'Gallstones',
    'Tumour_grade_final': 'Tumour grade',
    'Cancer_presentation_final': 'Cancer presentation',
    'Tumour_stage_final': 'Tumour stage',
    'Mets_site_final': 'Metastasis site',
    'Surgery_final': 'Surgery',
    'Treatment_final': 'Treatment administered',
    'Diabetes_final': 'DIABETES',
}

gbc_patients.rename(columns=new_column_names, inplace=True)


In [None]:
# Preview the first rows of gbc_patients.
gbc_patients.head()

In [None]:
# List the current column labels of gbc_patients.
gbc_patients.columns

In [None]:
# One figure per categorical column: overall counts on top, counts split by
# Status underneath.
# (seaborn and matplotlib are already imported in the first cell.)
selected_categorical_features = [
    'Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
    'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
    'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER',
]

for feature in selected_categorical_features:
    fig, (ax_overall, ax_by_status) = plt.subplots(2, 1, figsize=(10, 6))

    # Top panel: overall category counts.
    sns.countplot(data=gbc_patients, x=feature, ax=ax_overall)
    ax_overall.set_title(f'Distribution of {feature}')
    ax_overall.tick_params(axis='x', labelrotation=45)

    # Bottom panel: the same counts broken down by Status.
    sns.countplot(data=gbc_patients, x=feature, hue='Status', palette='Set2', ax=ax_by_status)
    ax_by_status.set_title(f'{feature} Distribution w.r.t. Status')
    ax_by_status.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    plt.show()


In [None]:
# Single tall figure with one count plot per categorical column, coloured by
# Status. dodge=False draws the Status bars at the same x position (overlaid),
# not side by side.
# (seaborn and matplotlib are already imported in the first cell.)
selected_categorical_features = [
    'Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
    'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
    'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER',
]

# One subplot row per feature.
num_rows = len(selected_categorical_features)
fig, axes = plt.subplots(num_rows, 1, figsize=(12, 5*num_rows))

for panel, feature in zip(axes, selected_categorical_features):
    sns.countplot(data=gbc_patients, x=feature, hue='Status',
                  palette='Set2', dodge=False, ax=panel)
    panel.set_title(f'{feature} Distribution with respect to Status')
    panel.set_xlabel('')  # category names on the ticks are self-explanatory
    panel.set_ylabel('Count')
    panel.legend(title='Status', loc='upper right')

fig.tight_layout()
plt.show()


In [None]:
# Keep the three named pathology categories and relabel everything else as
# 'Others'. Missing values are not in the keep-list, so they are relabelled
# as 'Others' too.
categories_to_keep = ['Adenocarcinoma', 'Adenosquamous carcinoma', 'Neuroendocrine carcinoma']

pathology_is_kept = gbc_patients['Pathology'].isin(categories_to_keep)
gbc_patients['Pathology'] = gbc_patients['Pathology'].mask(~pathology_is_kept, 'Others')



In [None]:
# Step 1: merge the liver-involving sites into a single category.
liver_sites = ('Liver', 'Liver & Lungs', 'Liver & Bone')
gbc_patients['Metastasis site'] = gbc_patients['Metastasis site'].replace(
    {site: 'Liver or Liver+others(lungs,bones)' for site in liver_sites}
)

# Step 2: keep the listed categories and relabel every other value as 'Others'.
# Missing values are not in the keep-list, so they are relabelled as 'Others' too.
categories_to_keep = ['Not applicable','Liver or Liver+others(lungs,bones)', 'Peritoneum & Omentum', 'Lungs']

site_is_kept = gbc_patients['Metastasis site'].isin(categories_to_keep)
gbc_patients['Metastasis site'] = gbc_patients['Metastasis site'].mask(~site_is_kept, 'Others')

# Show the resulting category counts.
gbc_patients['Metastasis site'].value_counts()


In [None]:
# Two category labels contain characters ('+', '(', ',', '&') that were
# problematic, so they are replaced with plain-text labels in the
# 'Metastasis site' column.
site_label_fixes = {
    'Liver or Liver+others(lungs,bones)': 'Liver and Near organs',
    'Peritoneum & Omentum': 'Peritoneum and Omentum'
}
gbc_patients['Metastasis site'] = gbc_patients['Metastasis site'].replace(site_label_fixes)

# Print the category counts after relabelling.
print(gbc_patients['Metastasis site'].value_counts())


In [None]:
# Grid of pie charts (three per row), one per categorical column. Missing
# values get their own wedge (dropna=False) and their count is repeated in the
# panel title.
# (matplotlib is already imported in the first cell.)
selected_categorical_features = ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']

# Grid dimensions: up to three columns, enough rows for every feature.
num_rows = len(selected_categorical_features) // 3 + (1 if len(selected_categorical_features) % 3 != 0 else 0)
num_cols = min(3, len(selected_categorical_features))

# Slightly enlarged figure so the legends next to each pie fit.
figsize_factor = 1.3
figsize = (15 * figsize_factor, 5 * num_rows * figsize_factor)

# squeeze=False always returns a 2-D array of axes, so flattening works for any
# grid shape (a single-row grid previously ended up as a list holding an array).
fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize, facecolor='white', squeeze=False)
axs = axs.flatten()

for ax, feature in zip(axs, selected_categorical_features):
    value_counts = gbc_patients[feature].value_counts(dropna=False)
    missing_values_count = gbc_patients[feature].isnull().sum()

    if value_counts.empty:
        continue  # nothing to draw; the panel stays blank

    wedges, texts, autotexts = ax.pie(value_counts, autopct='%1.1f%%', startangle=90)

    # Legend entries combine the category label with its count.
    labels = [f'{l} ({c})' for l, c in zip(value_counts.index, value_counts.values)]
    ax.legend(wedges, labels, title=feature, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'{feature} \nMissing Values: {missing_values_count}')

# Remove the grid slots that have no feature assigned.
for unused_ax in axs[len(selected_categorical_features):]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()


In [None]:
# One pie chart per categorical column, with percentages on the wedges and
# "label (count)" entries in the legend. Missing values get their own wedge.
# (matplotlib is already imported in the first cell; the unused grid-size
# variables of the earlier version were removed.)
selected_categorical_features = ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']

for feature in selected_categorical_features:
    value_counts = gbc_patients[feature].value_counts(dropna=False)
    missing_values_count = gbc_patients[feature].isnull().sum()

    if value_counts.empty:
        continue  # nothing to draw for this column

    fig, ax = plt.subplots(figsize=(11,9), facecolor='white')
    wedges, texts, autotexts = ax.pie(value_counts, autopct='%1.1f%%', startangle=90)

    # Legend entries combine the category label with its count.
    labels = [f'{l} ({c})' for l, c in zip(value_counts.index, value_counts.values)]
    ax.legend(wedges, labels, title=feature, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'{feature} \nMissing Values: {missing_values_count}')

    fig.tight_layout()
    plt.show()


In [None]:
# One pie chart per categorical column. Legend entries read
# "label - count (share%)", where the share is relative to all rows of
# gbc_patients. Missing values get their own wedge.
# (matplotlib is already imported in the first cell; unused variables of the
# earlier version were removed.)
selected_categorical_features = ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']

for feature in selected_categorical_features:
    value_counts = gbc_patients[feature].value_counts(dropna=False)
    missing_values_count = gbc_patients[feature].isnull().sum()

    if value_counts.empty:
        continue  # nothing to draw for this column

    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    wedges, texts, autotexts = ax.pie(value_counts, autopct='%1.1f%%', startangle=90)

    # Share of each category among all rows, in percent.
    shares = (value_counts / len(gbc_patients)) * 100
    legend_labels = [f'{l} - {c} ({v:.1f}%)'
                     for l, c, v in zip(value_counts.index, value_counts.values, shares)]
    ax.legend(wedges, legend_labels, title=feature, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'{feature} \nMissing Values: {missing_values_count}')

    fig.tight_layout()
    plt.show()


In [None]:
# One pie chart per categorical column without wedge annotations; the legend
# lists "label (count)". Missing values get their own wedge.
# (matplotlib is already imported in the first cell.)
selected_categorical_features = ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']

for feature in selected_categorical_features:
    value_counts = gbc_patients[feature].value_counts(dropna=False)
    missing_values_count = gbc_patients[feature].isnull().sum()

    if value_counts.empty:
        continue  # nothing to draw for this column

    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    # Without autopct, pie() returns only (wedges, texts).
    wedges, _ = ax.pie(value_counts, startangle=90)

    legend_labels = [f'{l} ({c})' for l, c in zip(value_counts.index, value_counts.values)]
    # Pass the wedges explicitly so every label is tied to its own wedge instead
    # of relying on the implicit artist order.
    ax.legend(wedges, legend_labels, title=feature, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'{feature} \nMissing Values: {missing_values_count}')

    fig.tight_layout()
    plt.show()


In [None]:
# One pie chart per categorical column; the legend lists
# "label (count, share%)" with the share computed over the counted values
# (missing values included as their own category).
# (matplotlib is already imported in the first cell.)
selected_categorical_features = ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']

for feature in selected_categorical_features:
    value_counts = gbc_patients[feature].value_counts(dropna=False)
    missing_values_count = gbc_patients[feature].isnull().sum()

    if value_counts.empty:
        continue  # nothing to draw for this column

    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    # Without autopct, pie() returns only (wedges, texts).
    wedges, _ = ax.pie(value_counts, startangle=90)

    # Percentage of observations per category.
    total_obs = value_counts.sum()
    percentages = [(c / total_obs) * 100 for c in value_counts.values]

    legend_labels = [f'{l} ({c}, {p:.2f}%)'
                     for l, c, p in zip(value_counts.index, value_counts.values, percentages)]
    # Pass the wedges explicitly so every label is tied to its own wedge instead
    # of relying on the implicit artist order.
    ax.legend(wedges, legend_labels, title=feature, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'{feature} \nMissing Values: {missing_values_count}')

    fig.tight_layout()
    plt.show()


In [None]:
# One pie chart per categorical column, shown and also saved as
# '<feature>_distribution.png' in the current working directory. The legend
# lists "label (count, share%)" with the share computed over the counted values
# (missing values included as their own category).
# (matplotlib is already imported in the first cell.)
selected_categorical_features = ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']

for feature in selected_categorical_features:
    value_counts = gbc_patients[feature].value_counts(dropna=False)
    missing_values_count = gbc_patients[feature].isnull().sum()

    if value_counts.empty:
        continue  # nothing to draw or save for this column

    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    # Without autopct, pie() returns only (wedges, texts).
    wedges, _ = ax.pie(value_counts, startangle=90)

    # Percentage of observations per category.
    total_obs = value_counts.sum()
    percentages = [(c / total_obs) * 100 for c in value_counts.values]

    legend_labels = [f'{l} ({c}, {p:.2f}%)'
                     for l, c, p in zip(value_counts.index, value_counts.values, percentages)]
    # Pass the wedges explicitly so every label is tied to its own wedge instead
    # of relying on the implicit artist order.
    ax.legend(wedges, legend_labels, title=feature, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'{feature} \nMissing Values: {missing_values_count}')

    fig.tight_layout()

    # Save before show(): the figure may be released once it has been displayed.
    fig.savefig(f'{feature}_distribution.png')

    plt.show()


In [None]:
# List the current column labels of gbc_patients.
gbc_patients.columns

In [None]:
# Display the 'Treatment' column of gbc_patients.
gbc_patients['Treatment']

In [None]:
# Grid of horizontal grouped bar charts: for each categorical column, the
# number of rows per category, with one bar per Status value.
# (matplotlib and pandas are already imported in the first cell.)
selected_categorical_features =  ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']
# Status values present in the data; each becomes one bar series.
status_categories = gbc_patients['Status'].unique()

# Size the grid from the number of features. A fixed 4x3 grid has only 12 slots
# and silently skipped the last of the 13 features.
num_cols = 3
num_rows = -(-len(selected_categorical_features) // num_cols)  # ceiling division

figsize = (15, 5 * num_rows)

# squeeze=False always returns a 2-D array of axes, so flattening works for any
# grid shape, including a single row.
fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize, facecolor='white', squeeze=False)
axs = axs.flatten()

for ax, feature in zip(axs, selected_categorical_features):
    # Category counts of this feature within each Status value.
    value_counts_by_status = {
        status: gbc_patients.loc[gbc_patients['Status'] == status, feature].value_counts()
        for status in status_categories
    }

    # Rows = categories, columns = Status values; absent combinations count as 0.
    data_to_plot = pd.DataFrame(value_counts_by_status).fillna(0)

    data_to_plot.plot(kind='barh', stacked=False, ax=ax, colormap='Set2')

    ax.set_xlabel('Patient Count')
    # The y axis lists the feature's categories; Status is shown in the legend.
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} Distribution w.r.t Status')

    # Keep category labels horizontal.
    ax.tick_params(axis='y', labelrotation=0)

# Remove the grid slots that have no feature assigned.
for unused_ax in axs[len(selected_categorical_features):]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()


In [None]:
# Grid of horizontal grouped bar charts: for each categorical column, the
# number of rows per category with one bar per Status value, annotated with the
# column's missing-value count.
# (matplotlib and pandas are already imported in the first cell.)
selected_categorical_features =  ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']
# Status values present in the data; each becomes one bar series.
status_categories = gbc_patients['Status'].unique()

# Size the grid from the number of features. A fixed 4x3 grid has only 12 slots
# and silently skipped the last of the 13 features.
num_cols = 3
num_rows = -(-len(selected_categorical_features) // num_cols)  # ceiling division

figsize = (15, 5 * num_rows)

# squeeze=False always returns a 2-D array of axes, so flattening works for any
# grid shape, including a single row.
fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize, facecolor='white', squeeze=False)
axs = axs.flatten()

for ax, feature in zip(axs, selected_categorical_features):
    # Category counts of this feature within each Status value.
    value_counts_by_status = {
        status: gbc_patients.loc[gbc_patients['Status'] == status, feature].value_counts()
        for status in status_categories
    }

    # Rows = categories, columns = Status values; absent combinations count as 0.
    data_to_plot = pd.DataFrame(value_counts_by_status).fillna(0)

    # Drop a missing-Status column if one exists.
    if pd.NA in data_to_plot.columns:
        data_to_plot.drop(pd.NA, axis=1, inplace=True)

    # Drop a literal 'nan' category row if one exists.
    if 'nan' in data_to_plot.index:
        data_to_plot.drop('nan', axis=0, inplace=True)

    missing_values_count = gbc_patients[feature].isnull().sum()

    data_to_plot.plot(kind='barh', stacked=False, ax=ax, colormap='Set2')

    ax.set_xlabel('Patient Count')
    # The y axis lists the feature's categories; Status is shown in the legend.
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} Distribution w.r.t Status')

    # Keep category labels horizontal.
    ax.tick_params(axis='y', labelrotation=0)

    # Missing-value count, placed near the top centre in axes coordinates.
    ax.text(0.5, 0.95, f'Missing Values: {missing_values_count}', horizontalalignment='center',
            verticalalignment='center', transform=ax.transAxes, fontsize=10, color='red')

# Remove the grid slots that have no feature assigned.
for unused_ax in axs[len(selected_categorical_features):]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()


In [None]:
# Grid of horizontal grouped bar charts: for each categorical column, the
# number of rows per category with one bar per Status value, annotated with the
# column's missing-value count, plus one shared Status legend under the grid.
# (matplotlib and pandas are already imported in the first cell.)
selected_categorical_features =  ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones',
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage',
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']
# Status values present in the data; each becomes one bar series.
status_categories = gbc_patients['Status'].unique()

# Size the grid from the number of features. A fixed 4x3 grid has only 12 slots
# and silently skipped the last of the 13 features.
num_cols = 3
num_rows = -(-len(selected_categorical_features) // num_cols)  # ceiling division

figsize = (15, 5 * num_rows)

# squeeze=False always returns a 2-D array of axes, so flattening works for any
# grid shape, including a single row.
fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize, facecolor='white', squeeze=False)
axs = axs.flatten()

for ax, feature in zip(axs, selected_categorical_features):
    # Category counts of this feature within each Status value.
    value_counts_by_status = {
        status: gbc_patients.loc[gbc_patients['Status'] == status, feature].value_counts()
        for status in status_categories
    }

    # Rows = categories, columns = Status values; absent combinations count as 0.
    data_to_plot = pd.DataFrame(value_counts_by_status).fillna(0)

    # Drop a missing-Status column if one exists.
    if pd.NA in data_to_plot.columns:
        data_to_plot.drop(pd.NA, axis=1, inplace=True)

    # Drop a literal 'nan' category row if one exists.
    if 'nan' in data_to_plot.index:
        data_to_plot.drop('nan', axis=0, inplace=True)

    missing_values_count = gbc_patients[feature].isnull().sum()

    data_to_plot.plot(kind='barh', stacked=False, ax=ax, colormap='Set2')

    ax.set_xlabel('Patient Count')
    # The y axis lists the feature's categories; Status is shown in the legend.
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} Distribution w.r.t Status')

    # Keep category labels horizontal.
    ax.tick_params(axis='y', labelrotation=0)

    # Missing-value count, placed near the top centre in axes coordinates.
    ax.text(0.5, 0.95, f'Missing Values: {missing_values_count}', horizontalalignment='center',
            verticalalignment='center', transform=ax.transAxes, fontsize=10, color='red')

# Remove the grid slots that have no feature assigned.
for unused_ax in axs[len(selected_categorical_features):]:
    fig.delaxes(unused_ax)

# Shared legend built from the first panel's bar series, with handles passed
# explicitly so each label is tied to its own series.
legend_handles, legend_labels = axs[0].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='lower center', ncol=max(1, len(legend_labels)),
           labelspacing=0.9, bbox_to_anchor=(0.5, -0.05))

fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Categorical features to break down by patient status
selected_categorical_features =  ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones', 
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage', 
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']
# Distinct values of the Status column (may include NaN; filtered where used below)
status_categories = gbc_patients['Status'].unique()

# Total number of rows in the dataset (not used by the loop below; kept as a notebook variable)
total_rows = len(gbc_patients)

# NOTE(review): the following cell repeats this loop almost verbatim; consider keeping only one of them.

# Draw one figure per categorical feature
for feature in selected_categorical_features:
    # Count the feature's categories separately for every status. Missing
    # statuses are skipped: `== NaN` never matches, so they would only add an
    # all-zero series to the chart.
    value_counts_by_status = {
        status: gbc_patients.loc[gbc_patients['Status'] == status, feature].value_counts()
        for status in status_categories
        if pd.notna(status)
    }

    # Rows = feature categories, columns = statuses; a category absent for a status counts as 0
    data_to_plot = pd.DataFrame(value_counts_by_status).fillna(0)

    # Drop a category literally labelled 'nan' (string) if one exists
    data_to_plot = data_to_plot.drop(index='nan', errors='ignore')

    # Number of patients with no recorded value for this feature
    missing_values_count = gbc_patients[feature].isnull().sum()

    # Grouped horizontal bars: one group per category, one bar per status
    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    data_to_plot.plot(kind='barh', stacked=False, ax=ax, colormap='Set2')

    # Without a legend the coloured bars cannot be mapped back to a status
    ax.legend(title='Status')

    # The y-axis carries the feature's categories, not the status
    ax.set_xlabel('Patient Count')
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} Distribution w.r.t Status')

    # Keep y-axis tick labels horizontal for readability
    ax.tick_params(axis='y', rotation=0)

    # Annotate the figure with the number of missing values
    ax.text(0.5, 0.95, f'Missing Values: {missing_values_count}', horizontalalignment='center', 
            verticalalignment='center', transform=ax.transAxes, fontsize=10, color='red')

    # Show the plot, then release the figure so the loop does not accumulate open figures
    fig.tight_layout()
    plt.show()
    plt.close(fig)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Categorical features whose distribution is compared across patient statuses
selected_categorical_features =  ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones', 
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage', 
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']
# Distinct values of the Status column (may include NaN; filtered where used below)
status_categories = gbc_patients['Status'].unique()

# Total number of rows in the dataset (not used by the loop below; kept as a notebook variable)
total_rows = len(gbc_patients)

# One figure per categorical feature
for feature in selected_categorical_features:

    # Per-status counts of each category of the feature. NaN statuses are
    # excluded because an equality test against NaN selects no rows and would
    # only contribute an empty, all-zero series.
    value_counts_by_status = {}
    for status in status_categories:
        if pd.isna(status):
            continue
        in_status = gbc_patients['Status'] == status
        value_counts_by_status[status] = gbc_patients.loc[in_status, feature].value_counts()

    # Rows = feature categories, columns = statuses; missing combinations become 0
    data_to_plot = pd.DataFrame(value_counts_by_status).fillna(0)

    # Remove a category literally labelled 'nan' (string) when present
    data_to_plot = data_to_plot.drop(index='nan', errors='ignore')

    # How many patients have no recorded value for this feature
    missing_values_count = gbc_patients[feature].isnull().sum()

    # Grouped horizontal bar chart: one group per category, one bar per status
    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    data_to_plot.plot(kind='barh', stacked=False, ax=ax, colormap='Set2')

    # A legend is required to tell which colour belongs to which status
    ax.legend(title='Status')

    # Axis labels and title; the y-axis shows the feature's categories, not the status
    ax.set_xlabel('Patient Count')
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} Distribution w.r.t Status')

    # Keep y-axis tick labels horizontal for readability
    ax.tick_params(axis='y', rotation=0)

    # Annotate the figure with the number of missing values
    ax.text(0.5, 0.95, f'Missing Values: {missing_values_count}', horizontalalignment='center', 
            verticalalignment='center', transform=ax.transAxes, fontsize=10, color='red')

    # Fit the layout, display, then close the figure so open figures do not pile up in the loop
    fig.tight_layout()
    plt.show()
    plt.close(fig)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os

# Categorical features whose per-status distribution is exported as an image
selected_categorical_features =  ['Thyroid', 'Hypertension', 'Family_cancer_history', 'ECOG', 'Gallstones', 
                                 'Pathology', 'Tumour grade', 'Cancer presentation', 'Tumour stage', 
                                 'Metastasis site', 'Surgery', 'Treatment administered', 'GENDER']
# Distinct values of the Status column (may include NaN; filtered where used below)
status_categories = gbc_patients['Status'].unique()

# Total number of rows in the dataset (not used by the loop below; kept as a notebook variable)
total_rows = len(gbc_patients)

# Directory to save the plots
output_dir = "plots"
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Render and save one figure per categorical feature
for feature in selected_categorical_features:

    # Count the feature's categories separately for every status. Missing
    # statuses are skipped: `== NaN` never matches, so they would only add an
    # all-zero series to the chart.
    value_counts_by_status = {
        status: gbc_patients.loc[gbc_patients['Status'] == status, feature].value_counts()
        for status in status_categories
        if pd.notna(status)
    }

    # Rows = feature categories, columns = statuses; a category absent for a status counts as 0
    data_to_plot = pd.DataFrame(value_counts_by_status).fillna(0)

    # Drop a category literally labelled 'nan' (string) if one exists
    data_to_plot = data_to_plot.drop(index='nan', errors='ignore')

    # Number of patients with no recorded value for this feature
    missing_values_count = gbc_patients[feature].isnull().sum()

    # Grouped horizontal bars: one group per category, one bar per status
    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    data_to_plot.plot(kind='barh', stacked=False, ax=ax, colormap='Set2')

    # The saved image must be readable on its own, so keep a legend for the statuses
    ax.legend(title='Status')

    # The y-axis carries the feature's categories, not the status
    ax.set_xlabel('Patient Count')
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} Distribution w.r.t Status')

    # Keep y-axis tick labels horizontal for readability
    ax.tick_params(axis='y', rotation=0)

    # Annotate the figure with the number of missing values
    ax.text(0.5, 0.95, f'Missing Values: {missing_values_count}', horizontalalignment='center', 
            verticalalignment='center', transform=ax.transAxes, fontsize=10, color='red')

    # Fit the layout first so long category labels are not clipped in the image,
    # then save through the figure object rather than pyplot's implicit current figure
    fig.tight_layout()
    output_path = os.path.join(output_dir, f'{feature}_distribution.png')
    fig.savefig(output_path)

    # Close this specific figure to release memory
    plt.close(fig)

print("Plots saved successfully.")
