In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

HABERMAN DATASET

* This dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings 
  hospital on the survival of patients who had undergone surgery for breast cancer
* no of patients - 306
* no of features - 4(including the class)

* Explanation of each feature: 
  1)Age at the time of operation 
  2)year of operation
  3)# of positive axillary nodes - The number of lymph nodes in the area of the armpit (axilla) to which the cancer has spread.
  4)Survival status - 1 for patient who has survived 5 or more years, 2 for patient who died within 5 years of the operation.
  
* Objective: Classify a new patient as belonging to status 1 or 2(one of the 2 classes) with the given data (3 features)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the Haberman dataset into a pandas DataFrame.
# Column names are supplied explicitly through `names`, so pandas reads every
# row of the file as data rather than treating the first row as a header.
haber_columns = ["age", "year", "nodes", "status"]
haber = pd.read_csv(
    "/kaggle/input/habermans-survival-data-set/haberman.csv",
    names = haber_columns,
)
haber

HIGH LEVEL STATISTICS FOR THE HABERMAN DATASET BELOW

In [None]:
# Number of data points and features: `shape` is a (rows, columns) tuple,
# and the column count includes the "status" class label.
haber.shape

In [None]:
# Number of classes and data points per class: one row per distinct "status"
# value together with how many patients carry that value.
haber["status"].value_counts()

In [None]:
# High level statistics: count, mean, std, min, quartiles and max for every
# numeric column (this includes the "status" label, whose statistics are not
# meaningful as a measurement).
haber.describe()

High Level Statistics and Observation:
* The total number of data points is 306
* The total number of features is 4 (including the class label)
* The number of classes is 2 (status of 1 and status of 2)
* The number of data points for class 1 is 225
* The number of data points for class 2 is 81
* The maximum age of the patient is 83.
* The highest # of nodes is 52.
* At least 25% of the patients did not have any positive axillary nodes (the 25th percentile of nodes is 0).
* 75% of the patients had 0 - 4 nodes.
* It is an imbalanced dataset as there is no 50/50 or 60/40 distribution of data. 
  There is approximately 75/25 distribution of data.

OBJECTIVE:

The objective is to do an EDA (Exploratory Data Analysis) on the given dataset and come up with univariate and multivariate analyses and observations about the dataset. These observations should help us classify a new patient as belonging to one of the above 2 classes (status 1 or status 2) with the given 3 features. In other words, they should help identify whether a patient would be able to survive 5 or more years after the operation.

UNIVARIATE ANALYSIS

In [None]:
# Histogram and PDF for age
# One overlaid histogram + density curve per survival status (hue).
# NOTE: sns.distplot is deprecated in newer seaborn releases; histplot/displot
# are the documented replacements if the environment is upgraded.
age_grid = sns.FacetGrid(haber, hue = "status", height = 5)
age_grid.map(sns.distplot, "age")
age_grid.add_legend()
plt.title("Histogram for Age")
plt.ylabel("density")
plt.show()

In [None]:
# Histogram and PDF for year
# One overlaid histogram + density curve per survival status (hue).
# NOTE: sns.distplot is deprecated in newer seaborn releases; histplot/displot
# are the documented replacements if the environment is upgraded.
year_grid = sns.FacetGrid(haber, hue = "status", height = 5)
year_grid.map(sns.distplot, "year")
year_grid.add_legend()
plt.title("Histogram for Year")
plt.ylabel("density")
plt.show()

In [None]:
# Histogram and PDF for nodes
# One overlaid histogram + density curve per survival status (hue).
# NOTE: sns.distplot is deprecated in newer seaborn releases; histplot/displot
# are the documented replacements if the environment is upgraded.
nodes_grid = sns.FacetGrid(haber, hue = "status", height = 5)
nodes_grid.map(sns.distplot, "nodes")
nodes_grid.add_legend()
plt.title("Histogram for Nodes")
plt.ylabel("density")
plt.show()

OBSERVATIONS FROM HISTOGRAMS:
* The age and year histograms and PDFs do not give any useful observations.
* The nodes PDF and histogram show that a lot of points lie between 0 and 5 nodes. In order to see what percentage of points lies in a certain range, we need to compute CDFs.

In [None]:
# 1.1 Computing PDF and CDF for "nodes" feature, status = 1
# Patients with status 1 (survived 5 or more years); `haber_1` is reused by
# the summary-statistics cells further down.
haber_1 = haber.loc[haber["status"] == 1]
# Raw per-bin counts (density = False, consistent with the other PDF/CDF
# cells). Dividing by the total below turns them into per-bin probabilities;
# with equal-width bins this is the same result the former density = True
# call produced after normalisation, up to floating-point rounding.
counts, bin_edges = np.histogram(haber_1["nodes"], bins = 23, density = False)
print(bin_edges)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(cdf)
# Plot against the right edge of each bin, so the CDF value at an edge is the
# cumulative share of patients in all bins up to that edge.
plt.plot(bin_edges[1:], pdf, label = "pdf")
plt.plot(bin_edges[1:], cdf, label = "cdf")
plt.xlabel("nodes of status 1")
plt.ylabel("probability of # of patients")
plt.legend()
plt.title("PDF and CDF for nodes with status 1")
# Fixed: this was `plt.show;`, which only references the function and never calls it.
plt.show()

In [None]:
# 1.2 Computing PDF and CDF for "nodes" feature, status = 2
# Patients with status 2 (died within 5 years); `haber_2` is reused by the
# summary-statistics cells further down.
haber_2 = haber.loc[haber["status"] == 2]
# Raw per-bin counts; dividing by the total below gives per-bin probabilities.
counts, bin_edges = np.histogram(haber_2["nodes"], bins = 26, density = False)
print(bin_edges)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(cdf)
# Plot against the right edge of each bin, so the CDF value at an edge is the
# cumulative share of patients in all bins up to that edge.
plt.plot(bin_edges[1:], pdf, label = "pdf")
plt.plot(bin_edges[1:], cdf, label = "cdf")
plt.xlabel("nodes of status 2")
plt.ylabel("probability of # of patients")
plt.legend()
plt.title("PDF and CDF for nodes with status 2")
# Fixed: this was `plt.show;`, which only references the function and never calls it.
plt.show()

In [None]:
# 1.3 Computing PDF and CDF for "nodes" feature
# Uses all patients (both status values combined).
# Raw per-bin counts; dividing by the total below gives per-bin probabilities.
counts, bin_edges = np.histogram(haber["nodes"], bins = 26, density = False)
print(bin_edges)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(cdf)
# Plot against the right edge of each bin, so the CDF value at an edge is the
# cumulative share of patients in all bins up to that edge.
plt.plot(bin_edges[1:], pdf, label = "pdf")
plt.plot(bin_edges[1:], cdf, label = "cdf")
plt.xlabel("nodes of status 1 & 2 combined")
plt.ylabel("probability of # of patients")
plt.legend()
plt.title("PDF and CDF for nodes with status 1 & 2")
# Fixed: this was `plt.show;`, which only references the function and never calls it.
plt.show()

In [None]:
# 2.1 Computing PDF and CDF for "age" feature
# Uses all patients (both status values combined).
# Raw per-bin counts; dividing by the total below gives per-bin probabilities.
counts, bin_edges = np.histogram(haber["age"], bins = 10, density = False)
print(bin_edges)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(cdf)
# Plot against the right edge of each bin, so the CDF value at an edge is the
# cumulative share of patients in all bins up to that edge.
plt.plot(bin_edges[1:], pdf, label = "pdf")
plt.plot(bin_edges[1:], cdf, label = "cdf")
plt.xlabel("age")
plt.ylabel("probability of # of patients")
plt.legend()
plt.title("PDF and CDF for Age")
# Fixed: this was `plt.show;`, which only references the function and never calls it.
plt.show()

In [None]:
# 2.2 Computing PDF and CDF for "year" feature
# Uses all patients (both status values combined).
# Raw per-bin counts; dividing by the total below gives per-bin probabilities.
counts, bin_edges = np.histogram(haber["year"], bins = 10, density = False)
print(bin_edges)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(cdf)
# Plot against the right edge of each bin, so the CDF value at an edge is the
# cumulative share of patients in all bins up to that edge.
plt.plot(bin_edges[1:], pdf, label = "pdf")
plt.plot(bin_edges[1:], cdf, label = "cdf")
plt.xlabel("year")
plt.ylabel("probability of # of patients")
plt.legend()
plt.title("PDF and CDF for Year")
# Fixed: this was `plt.show;`, which only references the function and never calls it.
plt.show()

OBSERVATIONS FROM PDF AND CDF
* In 1.1, we notice that about 80% of the patients with status 1 have between 0 and 4 nodes.
* In 1.2, we notice that about 70% of the patients with status 2 have fewer than 10 nodes.
* In 1.3, we notice that around 70% of all patients have fewer than 4 nodes.
* In 2.1, we see that only about 15% of the patients in the entire data are below the age of 40.
* In 2.2, we notice that the CDF is almost linear and increases with the year. About 30% of the operations were done from 1958 to 1960.

In [None]:
# Mean and standard deviation
# Node counts for status 1 and status 2, in that order. np.std is kept (rather
# than Series.std) because NumPy's default is ddof = 0.
nodes_by_status = (haber_1["nodes"], haber_2["nodes"])

print("Mean:")
for node_counts in nodes_by_status:
    print(np.mean(node_counts))
# Mean age over all patients, printed directly after the two node means.
print(np.mean(haber["age"]))

print("\nStandard Deviation:")
for node_counts in nodes_by_status:
    print(np.std(node_counts))

In [None]:
# Median, percentiles and median absolute deviation of the node counts,
# each printed for status 1 first and status 2 second.
nodes_by_status = (haber_1["nodes"], haber_2["nodes"])

print("Median:")
for node_counts in nodes_by_status:
    print(np.median(node_counts))

# 0th, 25th, 50th and 75th percentiles
quartile_points = np.arange(0, 100, 25)
print("\nQuantiles:")
for node_counts in nodes_by_status:
    print(np.percentile(node_counts, quartile_points))

# 0th, 20th, 40th, 60th and 80th percentiles
quintile_points = np.arange(0, 100, 20)
print("\n20th Percentile range")
for node_counts in nodes_by_status:
    print(np.percentile(node_counts, quintile_points))

from statsmodels import robust
print("\nMedian Absolute Deviation:")
for node_counts in nodes_by_status:
    print(robust.mad(node_counts))

In [None]:
# Box-and-whisker plots of each feature, split by survival status.

# Hand-built legend handles: one coloured patch per status value.
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color = "steelblue", label = "1")
orange_patch = mpatches.Patch(color = "orange", label = "2")
status_handles = [blue_patch, orange_patch]

# (column, figure title) pairs, drawn one figure at a time in this order.
box_plot_specs = (
    ("nodes", "Box plot for Nodes"),
    ("age", "Box plot for Age"),
    ("year", "Box plot for Year"),
)
for feature, plot_title in box_plot_specs:
    sns.boxplot(x = "status", y = feature, data = haber)
    plt.title(plot_title)
    plt.legend(title = "status", handles = status_handles)
    plt.show()

In [None]:
# Violin plots of each feature, split by survival status.

# Hand-built legend handles: one coloured patch per status value.
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color = "steelblue", label = "1")
orange_patch = mpatches.Patch(color = "orange", label = "2")
status_handles = [blue_patch, orange_patch]

# (column, figure title) pairs, drawn one figure at a time in this order.
violin_plot_specs = (
    ("nodes", "Violin plot for Nodes"),
    ("age", "Violin plot for Age"),
    ("year", "Violin plot for Year"),
)
for feature, plot_title in violin_plot_specs:
    sns.violinplot(x = "status", y = feature, data = haber)
    plt.title(plot_title)
    plt.legend(title = "status", handles = status_handles)
    plt.show()

OBSERVATIONS FROM BOX PLOTS AND VIOLIN PLOTS
* From the box plots, we see that 50% of the patients with status 1 have 0 nodes. The IQR is within 0 - 3 nodes. This confirms our earlier analysis that 80% of the patients that survived had a small number of nodes.
* For status 2, the IQR is within 0 - 11 nodes. The 50th percentile (median) is at 4 nodes, i.e. half of these patients have 0 - 4 nodes. The 25th percentile is at 1 node.
* From the box plots and the percentile calculation above, we see that at least 50% of the patients with status 1 have 0 nodes (about 112 patients). At least 20% of the patients with status 2 have 0 nodes (about 16 patients). This shows that there are some patients with 0 nodes who still didn't survive. Looking at these data, we come to the conclusion that most patients with 0 positive axillary nodes survived (about 85%).

MULTIVARIATE ANALYSIS

In [None]:
# Scatter plot of nodes (x) against age (y), coloured by survival status.
sns.set_style("whitegrid")
scatter_grid = sns.FacetGrid(haber, hue = "status", height = 5)
scatter_grid.map(plt.scatter, "nodes", "age")
scatter_grid.add_legend()
plt.title("Scatter plot of nodes and age")
plt.show()

In [None]:
# Pair plots: every pairwise combination of the three features, coloured by status.
# Close the current figure first so the pair plot starts from a clean state.
plt.close()
sns.set_style("whitegrid")
feature_columns = ["age", "year", "nodes"]
sns.pairplot(haber, hue = "status", vars = feature_columns, height = 3)
plt.suptitle("Pair plots for the 3 features in the dataset")
plt.show()

OBSERVATIONS FROM SCATTER PLOTS AND PAIR PLOTS
* From the above scatter plot, we see that patients below the age of 40 have a high chance of surviving 5 or more years after the operation. There are many blue points (about 90%) and very few orange points.
* Patients between the age of 50 and 60(both exclusive) with no positive axillary nodes have all survived 5 or more years after operation.

CONCLUSIONS:
* It is an imbalanced dataset as there is no 50/50 or 60/40 distribution of data. 
  There is approximately 75/25 distribution of data.
* There is no clear separation between the patients who survived 5 or more years after the operation and patients who died within 5 years of the operation.
* Approximately 85% of the patients with no positive axillary nodes(0 nodes) survived 5 or more years after the operation.
* In the given dataset we see that 90% of the patients under 40 years of age survived 5 or more years after the operation.