In [75]:
import pandas as pd
import numpy as np

# Label names used throughout the notebook: "any" first, followed by the five
# hemorrhage subtypes in alphabetical order.
hem_types = ["any"] + [
    "epidural", "intraparenchymal", "intraventricular", "subarachnoid", "subdural",
]

# Data exploration
In this notebook we'll look at the balance of the labels in our dataset, as well as the balance after undersampling the any=0 rows to match the number of any=1 rows.

## Full dataset

In [76]:
# Load the raw stage-1 training labels in long format: each row pairs an
# "<ID>_<subtype>" key with its Label value.
full = pd.read_csv(
    "../input/rsna-intracranial-hemorrhage-detection/stage_1_train.csv"
)
full.head(n=5)

Unnamed: 0,ID,Label
0,ID_63eb1e259_epidural,0
1,ID_63eb1e259_intraparenchymal,0
2,ID_63eb1e259_intraventricular,0
3,ID_63eb1e259_subarachnoid,0
4,ID_63eb1e259_subdural,0


Transform the dataset to wide format, i.e. one row per ID with each class as a separate binary column.

In [None]:
# Split "ID_<hash>_<subtype>" into the ID and the subtype name. rsplit with n=1
# cuts only at the last underscore, so the "ID_" prefix stays part of the ID.
id_parts = full["ID"].str.rsplit("_", n=1, expand=True)

# Long -> wide: one row per ID, one column per subtype holding its Label.
# - drop_duplicates: pivot raises a ValueError on repeated (ID, subtype) pairs.
# - pivot arguments are passed by keyword; pandas >= 2.0 no longer accepts
#   them positionally.
# - reset_index(drop=True) discards the ID index, which is not needed for the
#   counts computed below.
full = (
    full.assign(ID=id_parts[0], subtype=id_parts[1])
    .drop_duplicates(subset=["ID", "subtype"])
    .pivot(index="ID", columns="subtype", values="Label")
    .reset_index(drop=True)
)
size_full = len(full)
full.head(5)

Count all classes...

In [78]:
# Number of positive rows per label column: cast every label to bool, then sum
# down each column.
# NOTE(review): astype(bool) turns NaN into True, so missing labels (if any)
# would be counted as positives — confirm the pivoted frame has no NaN.
counts_full = full.astype(bool).sum(axis="index")

...and calculate the proportion of each of them in the whole dataset...

In [79]:
# Share of all rows that are positive for each label, in hem_types order.
# Counts are looked up by label name: integer keys on a string-labelled Series
# rely on a positional fallback that pandas has deprecated, and name lookup
# also keeps each proportion tied to the right label regardless of column order.
props_full = [counts_full[hem_type] / len(full) for hem_type in hem_types]

...and the proportion of each class relative to the number of any=1 rows.

In [80]:
# Share of the any=1 rows that are positive for each label, in hem_types order
# (the first entry, "any" relative to itself, is always 1.0).
# Counts are looked up by label name rather than by integer position, because
# positional integer keys on a string-labelled Series are deprecated in pandas.
props_full_any = [counts_full[hem_type] / counts_full["any"] for hem_type in hem_types]

In [81]:
# Print a plain-text summary of the label balance in the full dataset.
# Counts are read by label name (counts_full["any"], counts_full[hem_type])
# instead of by integer position, which pandas has deprecated for
# string-labelled Series. props_full / props_full_any are lists in hem_types
# order, so zip() pairs each label with its proportion.
# NOTE(review): the messages say "patients", but rows appear to be keyed by
# image ID rather than by patient — confirm the intended wording.
print("\nProbability that a patient in the dataset has any of the 5 hemorrhage types:")
print("Number of patients with hemorrhage:  {}".format(counts_full["any"]))
print("Total number of patients:           {}".format(size_full))
print("p(hemorrhage): %.2f%%" % (counts_full["any"] / size_full * 100))
print("\nNumber of each type of hemorrhage found in dataset and share of total hemorrhages [p(H type| any=1)]: ")
print("%21s | %7s | %9s" % ("hemorrhage type", "count", "portion"))
# Skip "any" (first entry): its share of itself is trivially 100%.
for hem_type, share in zip(hem_types[1:], props_full_any[1:]):
    print("%21s | %7d | %8.2f%%" % (hem_type, counts_full[hem_type], share * 100))
print("\nNumber of each type of hemorrhage found in dataset and share of the raw total [p(hem_type)]: ")
print("%21s | %7s | %9s" % ("hemorrhage type", "count", "portion"))
for hem_type, share in zip(hem_types, props_full):
    print("%21s | %7d | %8.2f%%" % (hem_type, counts_full[hem_type], share * 100))
print("\n")


Probability that a patient in the dataset has any of the 5 hemorrhage types:
Number of patients with hemorrhage:  97103
Total number of patients:           674258
p(hemorrhage): 14.40%

Number of each type of hemorrhage found in dataset and share of total hemorrhages [p(H type| any=1)]: 
      hemorrhage type |   count |   portion
             epidural |    2761 |     2.84%
     intraparenchymal |   32564 |    33.54%
     intraventricular |   23766 |    24.48%
         subarachnoid |   32122 |    33.08%
             subdural |   42496 |    43.76%

Number of each type of hemorrhage found in dataset and share of the raw total [p(hem_type)]: 
      hemorrhage type |   count |   portion
                  any |   97103 |    14.40%
             epidural |    2761 |     0.41%
     intraparenchymal |   32564 |     4.83%
     intraventricular |   23766 |     3.52%
         subarachnoid |   32122 |     4.76%
             subdural |   42496 |     6.30%




## Balanced dataset

In [82]:
# Load the balanced training table (per the introduction: any=0 rows
# undersampled to match the number of any=1 rows) and remember its row count.
data = pd.read_csv("../input/anns-train-df/train_balanced.csv")
size_bal = data.shape[0]
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,SOPInstanceUID,Modality,PatientID,StudyInstanceUID,SeriesInstanceUID,StudyID,ImagePositionPatient,ImageOrientationPatient,...,WindowCenter1,MultiWindowWidth,WindowWidth1,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,pct_cut
0,0,0,ID_231d901c1,CT,ID_b81a287f,ID_dd37ba3adb,ID_15dcd6057a,,-125.0,1.0,...,,,,1,0,0,0,1,0,"(0.1, 0.2]"
1,1,12,ID_019fca83f,CT,ID_a6882fde,ID_a06a9a18ce,ID_545f0fb9b1,,-125.0,1.0,...,,,,1,0,0,0,1,0,"(0.2, 0.3]"
2,2,19,ID_bab3d55d5,CT,ID_1ce79b2e,ID_55f72c7bad,ID_3ee2dc53e7,,-125.0,1.0,...,,,,1,1,0,0,1,0,"(0.2, 0.3]"
3,3,22,ID_929cbea19,CT,ID_657fbcb3,ID_c4d29f2047,ID_4f14b399c4,,-173.0,1.0,...,40.0,1.0,80.0,1,0,1,1,0,0,"(0.1, 0.2]"
4,4,25,ID_a5387e47d,CT,ID_8c9bf219,ID_341f6cbe22,ID_6356f1b6f7,,-128.5,1.0,...,,,,1,0,0,0,1,0,"(0.2, 0.3]"


In [83]:
# Keep only the six label columns, selected by name in hem_types order.
# This replaces "drop the first 44 columns, then pct_cut", which depended on
# the exact column layout of the CSV and broke when the cell was run twice
# (the second run dropped the label columns and then failed on 'pct_cut').
data = data[hem_types]
data.head(5)

Unnamed: 0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,1,0,0,0,1,0
1,1,0,0,0,1,0
2,1,1,0,0,1,0
3,1,0,1,1,0,0
4,1,0,0,0,1,0


In [84]:
# Number of positive rows per label column in the balanced dataset.
counts_bal = data.astype(bool).sum(axis=0)
# Proportions in hem_types order: relative to all rows, and relative to the
# any=1 rows. Counts are looked up by label name because positional integer
# keys on a string-labelled Series are deprecated in pandas.
props_bal = [counts_bal[hem_type] / len(data) for hem_type in hem_types]
props_bal_any = [counts_bal[hem_type] / counts_bal["any"] for hem_type in hem_types]

In [85]:
# Print a plain-text summary of the label balance in the balanced dataset.
# Counts are read by label name (counts_bal["any"], counts_bal[hem_type])
# instead of by integer position, which pandas has deprecated for
# string-labelled Series. props_bal / props_bal_any are lists in hem_types
# order, so zip() pairs each label with its proportion.
# NOTE(review): the messages say "patients", but the source table has one row
# per SOPInstanceUID alongside a separate PatientID column — confirm wording.
print("\nProbability that a patient in the dataset has any of the 5 hemorrhage types:")
print("Number of patients with hemorrhage:  {}".format(counts_bal["any"]))
print("Total number of patients:           {}".format(size_bal))
print("p(hemorrhage): %.2f%%" % (counts_bal["any"] / size_bal * 100))
print("\nNumber of each type of hemorrhage found in dataset and share of total hemorrhages [p(H type| any=1)]: ")
print("%21s | %7s | %9s" % ("hemorrhage type", "count", "portion"))
# Skip "any" (first entry): its share of itself is trivially 100%.
for hem_type, share in zip(hem_types[1:], props_bal_any[1:]):
    print("%21s | %7d | %8.2f%%" % (hem_type, counts_bal[hem_type], share * 100))
print("\nNumber of each type of hemorrhage found in dataset and share of the raw total [p(hem_type)]: ")
print("%21s | %7s | %9s" % ("hemorrhage type", "count", "portion"))
for hem_type, share in zip(hem_types, props_bal):
    print("%21s | %7d | %8.2f%%" % (hem_type, counts_bal[hem_type], share * 100))
print("\n")


Probability that a patient in the dataset has any of the 5 hemorrhage types:
Number of patients with hemorrhage:  97041
Total number of patients:           194082
p(hemorrhage): 50.00%

Number of each type of hemorrhage found in dataset and share of total hemorrhages [p(H type| any=1)]: 
      hemorrhage type |   count |   portion
             epidural |    2761 |     2.85%
     intraparenchymal |   32557 |    33.55%
     intraventricular |   23766 |    24.49%
         subarachnoid |   32114 |    33.09%
             subdural |   42448 |    43.74%

Number of each type of hemorrhage found in dataset and share of the raw total [p(hem_type)]: 
      hemorrhage type |   count |   portion
                  any |   97041 |    50.00%
             epidural |    2761 |     1.42%
     intraparenchymal |   32557 |    16.77%
     intraventricular |   23766 |    12.25%
         subarachnoid |   32114 |    16.55%
             subdural |   42448 |    21.87%


