In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import sys
sys.path.append("../src")

In [4]:
from d01_data.db import get_db_config_path, get_db_config, get_sqlalchemy_connection, read_table

In [5]:
# Resolve the DB config path, load the config, and open the connection object that
# the read_table(conn, ...) calls below use.
# NOTE(review): the exact behavior of these helpers lives in d01_data.db and is not visible here — confirm there.
path = get_db_config_path()
conf = get_db_config(path)
conn = get_sqlalchemy_connection(conf)

In [6]:
N_BINS = 20

In [None]:
measurement_abstract_rpt_df = read_table(conn, 'measurement_abstract_rpt')
measurement_abstract_rpt_df.head()

# 1. Read relevant tables into dataframes

* `measurement_abstract_rpt`: 1,568,350 final (reported) study measurements (more than measured because some are automatically calculated)
* `a_measgraphref`: 823,886 total (recorded) instance measurements (34,332 with bad `instanceidk` value, 37 with bad `indexinmglist` value)
* `a_measgraphic`: 486,884 instance frames

In [None]:
# Trim leading/trailing whitespace from the measurement name and unit name so that
# names can be matched exactly against the per-view measurement lists later on.
for text_column in ('name', 'unitname'):
    measurement_abstract_rpt_df[text_column] = measurement_abstract_rpt_df[text_column].str.strip()

### `a_measgraphref`

In [None]:
measgraphref_df = read_table(conn, 'a_measgraphref')
measgraphref_df.head()

In [None]:
# Remove the column whose meaning is not described in the Xcelera documentation
measgraphref_df = measgraphref_df.drop(columns=["srinstanceidk"])
measgraphref_df.head()

In [None]:
BAD_INSTANCE_IDKS = ["", "-1"]

In [None]:
len(measgraphref_df[measgraphref_df["instanceidk"].isin(BAD_INSTANCE_IDKS)])

In [None]:
# Drop rows whose instanceidk is listed in BAD_INSTANCE_IDKS, then cast the column to int.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
measgraphref_df = measgraphref_df[~measgraphref_df["instanceidk"].isin(BAD_INSTANCE_IDKS)].copy()
measgraphref_df["instanceidk"] = measgraphref_df["instanceidk"].astype(int)
len(measgraphref_df)

In [None]:
measgraphref_df['indexinmglist'].sort_values().unique()

In [None]:
BAD_INDEX_IN_MG_LIST_VALUES = [-1]

In [None]:
len(measgraphref_df[measgraphref_df["indexinmglist"].isin(BAD_INDEX_IN_MG_LIST_VALUES)])

In [None]:
# Keep only rows whose indexinmglist is not one of the known-bad values
has_bad_index = measgraphref_df["indexinmglist"].isin(BAD_INDEX_IN_MG_LIST_VALUES)
measgraphref_df = measgraphref_df[~has_bad_index]
len(measgraphref_df)

In [None]:
measgraphref_df["howentered"].unique()

In [None]:
(measgraphref_df["imagesopinstanceuid"] != "").sum()

In [None]:
(measgraphref_df.groupby("instanceidk").first()["imagesopinstanceuid"] != "").sum()

### `a_measgraphic`

In [None]:
measgraphic_df = read_table(conn, 'a_measgraphic')
measgraphic_df.head()

In [None]:
# Remove the columns whose meaning is not described in the Xcelera documentation
undocumented_columns = [
    "graphictoolidk",
    "longaxisindex",
    "measidk",
    "loopidk",
    "instancerecordtype",
]
measgraphic_df = measgraphic_df.drop(columns=undocumented_columns)
measgraphic_df.head()

# 2. Descriptive Statistics

* 362 unique measurements
* 24,912 unique studies with final (reported) measurements
* 24,471 unique studies with total (recorded) measurements
    * 128 outliers based on instances per study
* 254,159 unique instances with total (recorded) measurements
    * 2,357 outliers based on measurements per instance
    * 31,877 outliers based on frame counts per instance

### What are the most/least common final (reported) measurements?

In [None]:
counts = measurement_abstract_rpt_df['name'].value_counts()
counts

### How many measurements appear once/in single digits?

In [None]:
len(counts[counts==1].index)

In [None]:
len(counts[counts<10].index)

### What is the distribution of counts for final (reported) measurements?

In [None]:
plt.title("Counts for Final Measurements")
final_measurement_counts = measurement_abstract_rpt_df['name'].value_counts().values
plt.hist(final_measurement_counts, N_BINS);

### What is the distribution of counts for final (reported) measurements per study?

In [None]:
final_measurement_counts_per_study = measurement_abstract_rpt_df.groupby("studyid").size()
len(final_measurement_counts_per_study)

In [None]:
final_measurement_counts_per_study.head()

In [None]:
final_measurement_counts_per_study.describe()

In [None]:
plt.title("Counts for Final (Reported) Measurements Per Study")
plt.hist(final_measurement_counts_per_study.values, N_BINS);

### What is the distribution of counts for total (recorded) measurements per study?

In [None]:
total_measurement_counts_per_study = measgraphref_df.groupby("studyidk").size()
len(total_measurement_counts_per_study)

In [None]:
total_measurement_counts_per_study.head()

In [None]:
total_measurement_counts_per_study.describe()

In [None]:
plt.title("Counts for Total (Recorded) Measurements Per Study")
plt.hist(total_measurement_counts_per_study.values, N_BINS);

### What is the distribution of counts for total (recorded) measurements per instance?

In [None]:
total_measurement_counts_per_instance = measgraphref_df.groupby("instanceidk").size()
len(total_measurement_counts_per_instance)

In [None]:
total_measurement_counts_per_instance.head()

In [None]:
total_measurement_counts_per_instance.describe()

In [None]:
total_measurement_counts_per_instance = total_measurement_counts_per_instance.sort_values(ascending=False)

In [None]:
plt.title("Counts for Total (Recorded) Measurements Per Instance")
plt.hist(total_measurement_counts_per_instance.values, N_BINS);

In [None]:
plt.title("Boxplot for Total (Recorded) Measurements Per Instance")
result = plt.boxplot(total_measurement_counts_per_instance)

In [None]:
def get_outlier_thresholds(result):
    """Return the (lower, upper) whisker-cap positions of a single-box boxplot.

    Parameters
    ----------
    result : dict
        The dictionary returned by ``plt.boxplot``; only ``result['caps']`` is read.
        It must contain exactly two cap artists, otherwise unpacking raises ValueError.

    Returns
    -------
    tuple
        The first y-coordinate of each cap, in the order the caps are listed.
        NOTE(review): assumes matplotlib lists the lower cap before the upper cap — confirm.
    """
    cap_positions = []
    for cap in result['caps']:
        cap_positions.append(cap.get_ydata()[0])
    outlier_min, outlier_max = cap_positions
    return outlier_min, outlier_max

In [None]:
outlier_min, outlier_max = get_outlier_thresholds(result)
outlier_min, outlier_max

In [None]:
# TODO: filter outliers?
# Count instances whose measurement count falls outside the boxplot whisker caps
is_below_min = total_measurement_counts_per_instance < outlier_min
is_above_max = total_measurement_counts_per_instance > outlier_max
num_outliers = (is_below_min | is_above_max).sum()
num_outliers

### What is the distribution of counts for instances per study?

In [None]:
frame_counts_per_study_and_instance = measgraphref_df.groupby(["studyidk", "instanceidk"]).size()
len(frame_counts_per_study_and_instance)

In [None]:
frame_counts_per_study_and_instance.head()

In [None]:
# Each (studyidk, instanceidk) pair in the grouped index is one instance; count the pairs per study
instance_counts_per_study = frame_counts_per_study_and_instance.groupby(level='studyidk').size()
len(instance_counts_per_study)

In [None]:
instance_counts_per_study.head()

In [None]:
instance_counts_per_study.describe()

In [None]:
plt.title("Counts for Instances Per Study")
plt.hist(instance_counts_per_study.values, N_BINS);

In [None]:
plt.title("Boxplot for Instances Per Study")
result = plt.boxplot(instance_counts_per_study)

In [None]:
outlier_min, outlier_max = get_outlier_thresholds(result)
outlier_min, outlier_max

In [None]:
# TODO: filter outliers?
# Count studies whose instance count falls outside the boxplot whisker caps
is_below_min = instance_counts_per_study < outlier_min
is_above_max = instance_counts_per_study > outlier_max
num_outliers = (is_below_min | is_above_max).sum()
num_outliers

### What is the distribution of counts for frames per instance?

In [None]:
# Each distinct (instanceidk, indexinmglist) pair is counted as one frame; count the pairs per instance
rows_per_instance_and_index = measgraphref_df.groupby(["instanceidk", "indexinmglist"]).size()
frame_counts_per_instance = rows_per_instance_and_index.groupby(level="instanceidk").size()
len(frame_counts_per_instance)

In [None]:
frame_counts_per_instance.head()

In [None]:
frame_counts_per_instance.describe()

In [None]:
plt.title("Counts for Frames Per Instance")
plt.hist(frame_counts_per_instance.values, N_BINS);

In [None]:
plt.title("Boxplot for Frames Per Instance")
result = plt.boxplot(frame_counts_per_instance)

In [None]:
outlier_min, outlier_max = get_outlier_thresholds(result)
outlier_min, outlier_max

In [None]:
# TODO: filter outliers?
# Count instances whose frame count falls outside the boxplot whisker caps
is_below_min = frame_counts_per_instance < outlier_min
is_above_max = frame_counts_per_instance > outlier_max
num_outliers = (is_below_min | is_above_max).sum()
num_outliers

### What is the distribution of counts for measurements per frame?

In [None]:
measurement_counts_per_frame = measgraphref_df.groupby(["instanceidk", "indexinmglist"]).size()
len(measurement_counts_per_frame)

In [None]:
measurement_counts_per_frame.head()

In [None]:
measurement_counts_per_frame.describe()

In [None]:
plt.title("Counts for Measurements Per Frame")
plt.hist(measurement_counts_per_frame.values, N_BINS);

# 3. View groundtruth from measurements

* 823,886 total (recorded) instance measurements with frames
* 295,962 total (recorded) instance measurements with frames for measurements we care about
* 23,327 unique studies with total (recorded) measurements for measurements we care about
* 69,185 unique instances with total (recorded) measurements for measurements we care about
* 162 frames with conflicting views
    * 113 A4C & A2C
    * 49 A4C & PLAX
* 53,852 instances (potentially) with PLAX view
    * 27,838 instance frames with PLAX view
    * 26,014 instance frames potentially with PLAX view
* 38,880 instance frames with A4C view
* 20,908 instance frames with A2C view
* 45,782 instances frames 
* 113,640 total frames from all views

In [None]:
measgraphref_df.head()

In [None]:
# Keep only the columns needed to join measurements to frames and to reported measurements
measgraphref_columns = ['studyidk', 'measabstractnumber', 'instanceidk', 'indexinmglist']
measgraphref_df = measgraphref_df[measgraphref_columns]
measgraphref_df.head()

In [None]:
measgraphic_df.head()

In [None]:
# Keep only the join keys and the frame number
measgraphic_columns = ['instanceidk', 'indexinmglist', 'frame']
measgraphic_df = measgraphic_df[measgraphic_columns]
measgraphic_df.head()

In [None]:
measurement_abstract_rpt_df.head()

In [None]:
# Keep only the join keys and the measurement name, and rename studyid -> studyidk so the
# study key has the same column name as in measgraphref_df (needed for the merge on
# ['studyidk', 'measabstractnumber']). Only columns are renamed; the row index is left untouched.
measurement_abstract_rpt_df = (
    measurement_abstract_rpt_df[['studyid', 'measabstractnumber', 'name']]
    .rename(columns={"studyid": "studyidk"})
)
measurement_abstract_rpt_df.head()

In [None]:
# Inner-join recorded measurements to their frames on (instanceidk, indexinmglist)
merge_df = pd.merge(
    measgraphref_df,
    measgraphic_df,
    how='inner',
    on=['instanceidk', 'indexinmglist'],
)
print(merge_df.shape)
merge_df.head()

In [None]:
# Inner-join the measurement names on (studyidk, measabstractnumber)
merge_df = pd.merge(
    merge_df,
    measurement_abstract_rpt_df,
    how='inner',
    on=['studyidk', 'measabstractnumber'],
)
print(merge_df.shape)
merge_df.head()

In [None]:
# Measurement names (values of the `name` column) grouped by the view they are used to label.
MEASUREMENTS_PARASTERNAL_LONG_AXIS_VIEW = [
    'Diám raíz Ao',
    'Diám. Ao asc.',
    'Diám TSVI',
    'Dimensión AI',
]
# Names that only potentially indicate the parasternal long axis view.
POTENTIAL_MEASUREMENTS_PARASTERNAL_LONG_AXIS_VIEW = [
    'Diám TSVD',
    'DVItd',
    'DVIts',
    'SIVtd',
    'PPVItd',
]
MEASUREMENTS_APICAL_4_CHAMBER_VIEW = [
    'AVItd ap4',
    'VTD(el-ps4)',
    'VTD(MDD-ps4)',
    'VTD 4C',
    'AVIts ap4',
    'VTS(el-ps4)',
    'VTS(MDD-ps4)',
    'VTS 4C',
    'Vol. AI (MOD-sp4)',
]
MEASUREMENTS_APICAL_2_CHAMBER_VIEW = [
    'AVItd ap2',
    'VTD(el-ps2)',
    'VTD(MDD-ps2)',
    'VTD 2C',
    'AVIts ap2',
    'VTS(el-ps2)',
    'VTS(MDD-ps2)',
    'VTS 2C',
    'Vol. AI (MOD-sp2)',
]
# Every measurement name of interest, in the same group order as above.
ALL_MEASUREMENTS = [
    *MEASUREMENTS_PARASTERNAL_LONG_AXIS_VIEW,
    *POTENTIAL_MEASUREMENTS_PARASTERNAL_LONG_AXIS_VIEW,
    *MEASUREMENTS_APICAL_4_CHAMBER_VIEW,
    *MEASUREMENTS_APICAL_2_CHAMBER_VIEW,
]

In [None]:
# Keep only rows for the measurements of interest; copy so new columns can be added safely
is_relevant_measurement = merge_df['name'].isin(ALL_MEASUREMENTS)
filter_df = merge_df[is_relevant_measurement].copy()
print(filter_df.shape)
filter_df.head()

In [None]:
filtered_measurements = filter_df['name']
filtered_measurements.value_counts()

In [None]:
plt.title("Counts for Filtered Measurements")
plt.hist(filtered_measurements.value_counts().values, N_BINS);

In [None]:
# Number of distinct studies among the filtered measurements
filter_df.groupby('studyidk').ngroups

In [None]:
# Number of distinct instances among the filtered measurements
filter_df.groupby('instanceidk').ngroups

In [None]:
# Measurement names used to flag a row as end-diastolic.
MEASUREMENTS_END_DIASTOLIC = [
    'DVItd',
    'SIVtd',
    'PPVItd',
    'AVItd ap4',
    'VTD(el-ps4)',
    'VTD(MDD-ps4)',
    'VTD 4C',
    'AVItd ap2',
    'VTD(el-ps2)',
    'VTD(MDD-ps2)',
    'VTD 2C',
]
# Measurement names used to flag a row as end-systolic.
MEASUREMENTS_END_SYSTOLIC = [
    'DVIts',
    'AVIts ap4',
    'VTS(el-ps4)',
    'VTS(MDD-ps4)',
    'VTS 4C',
    'AVIts ap2',
    'VTS(el-ps2)',
    'VTS(MDD-ps2)',
    'VTS 2C',
]

In [None]:
# Flag each row by whether its measurement name is in the end-diastolic / end-systolic list
measurement_name = filter_df['name']
filter_df['is_end_diastolic'] = measurement_name.isin(MEASUREMENTS_END_DIASTOLIC)
filter_df['is_end_systolic'] = measurement_name.isin(MEASUREMENTS_END_SYSTOLIC)

In [None]:
(filter_df['is_end_diastolic'] & filter_df['is_end_systolic']).sum()

In [None]:
# Flag each row with the view group its measurement name belongs to (one boolean column per group)
view_flag_columns = {
    'is_plax': MEASUREMENTS_PARASTERNAL_LONG_AXIS_VIEW,
    'maybe_plax': POTENTIAL_MEASUREMENTS_PARASTERNAL_LONG_AXIS_VIEW,
    'is_a4c': MEASUREMENTS_APICAL_4_CHAMBER_VIEW,
    'is_a2c': MEASUREMENTS_APICAL_2_CHAMBER_VIEW,
}
for flag_column, measurement_names in view_flag_columns.items():
    filter_df[flag_column] = filter_df['name'].isin(measurement_names)

In [None]:
# Collapse the boolean view flags into a single label per row.
# Both the certain and the potential PLAX flags map to 'plax'; later entries win if a row has several flags set.
filter_df['view'] = ''
flag_to_label = [
    ('is_plax', 'plax'),
    ('maybe_plax', 'plax'),
    ('is_a4c', 'a4c'),
    ('is_a2c', 'a2c'),
]
for flag_column, view_label in flag_to_label:
    filter_df.loc[filter_df[flag_column], 'view'] = view_label

In [None]:
# Collapse to one row per (instanceidk, frame): groupby(...).first() keeps, for every
# remaining column, the first non-null value within each group.
# NOTE(review): when several measurements share a frame, only the first one's flags/view
# are kept here — confirm this is intended.
group_df = filter_df.groupby(['instanceidk', 'frame']).first()
# Drop the per-measurement columns from the per-frame table
group_df = group_df.drop(['measabstractnumber', 'name'], axis='columns')
print(group_df.shape)
group_df.head()

In [None]:
# Number of (instanceidk, indexinmglist) groups whose rows do not share exactly one view label
views_per_index = group_df.reset_index().groupby(['instanceidk', 'indexinmglist'])['view'].nunique()
views_per_index.ne(1).sum()

In [None]:
# Number of instances whose frames do not share exactly one view label
views_per_instance = group_df.reset_index().groupby('instanceidk')['view'].nunique()
views_per_instance.ne(1).sum()

In [None]:
# One row per instance: is_multiview is True when the instance's frames carry a number of
# distinct view labels other than one. Only the column is renamed; the row index is left
# as the default integer index produced by reset_index().
is_instance_multiview = (
    group_df.reset_index()
    .groupby('instanceidk')['view']
    .nunique()
    .ne(1)
    .reset_index()
    .rename(columns={"view": "is_multiview"})
)

In [None]:
# Attach the per-instance is_multiview flag to every frame row.
# group_df is indexed by (instanceidk, frame); merging on only one level of a MultiIndex
# drops the other level from the result, so move both levels into columns first to keep
# the frame number in the merged table.
group_df = group_df.reset_index().merge(is_instance_multiview, on='instanceidk')

In [None]:
frames_with_views_df = group_df
frames_with_views_df.head()

In [None]:
# For every multiview instance, collect the set of distinct view labels found on its frames
multiview_frames = frames_with_views_df[frames_with_views_df['is_multiview']]
conflict_sets = multiview_frames.groupby('instanceidk').agg({'view': lambda views: set(views)})
conflict_sets.head()

In [None]:
# Python sets are unhashable, so value_counts() cannot tally them directly.
# Turn each set of views into a sorted, hashable label (e.g. 'a2c & a4c') before counting.
conflict_sets['view'].map(lambda views: ' & '.join(sorted(views))).value_counts()

In [None]:
from collections import defaultdict

def get_view_counts(df):
    """Count the rows of `df` flagged for each view.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the boolean columns 'is_plax', 'maybe_plax', 'is_a4c' and 'is_a2c'.

    Returns
    -------
    collections.defaultdict
        Keys, in order: 'plax', 'plax_maybe', 'plax_total' (plax + plax_maybe),
        'a4c', 'a2c' and 'total' (plax_total + a4c + a2c).
    """
    n_plax = df['is_plax'].sum()
    n_plax_maybe = df['maybe_plax'].sum()
    n_a4c = df['is_a4c'].sum()
    n_a2c = df['is_a2c'].sum()
    n_plax_total = n_plax + n_plax_maybe

    view_counts = defaultdict(dict)
    view_counts.update([
        ('plax', n_plax),
        ('plax_maybe', n_plax_maybe),
        ('plax_total', n_plax_total),
        ('a4c', n_a4c),
        ('a2c', n_a2c),
        ('total', n_plax_total + n_a4c + n_a2c),
    ])
    return view_counts

In [None]:
# View counts split by cardiac phase flag, plus the overall totals
is_end_diastolic = frames_with_views_df['is_end_diastolic']
is_end_systolic = frames_with_views_df['is_end_systolic']

results = defaultdict(dict)
results['end_diastolic'] = get_view_counts(frames_with_views_df[is_end_diastolic])
results['end_systolic'] = get_view_counts(frames_with_views_df[is_end_systolic])
results['neither'] = get_view_counts(frames_with_views_df[~(is_end_diastolic | is_end_systolic)])
results['total'] = get_view_counts(frames_with_views_df)

In [None]:
# Build a table with one row per phase and one column per view count
results_df = pd.DataFrame.from_dict(results).transpose()
results_df.head()

In [None]:
# Write the per-frame view table to the intermediate data folder (path is relative to the
# notebook's working directory). The DataFrame index is written as well, since index=False is not passed.
frames_with_views_df.to_csv('../data/02_intermediate/frames_with_views.csv')