In [24]:
import sys
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

In [25]:
# Append the parent folder ("..") to sys.path so the project-local
# load_preprocess package can be imported from this notebook's directory.
# NOTE(review): get_inbreast is imported but not referenced in the cells shown here.
sys.path.append("..")
from load_preprocess.load_meta import get_cbis_test
from load_preprocess.load_meta import get_inbreast

In [26]:
# Load the CBIS test metadata through the project helper, requesting whole-image labels.
meta = get_cbis_test(whole_image_labels=True)

In [27]:
# Show the loaded label table (the rendered output elides the middle rows).
meta

Unnamed: 0,Subject ID,pathology,true_malignant,true_benign
0,Calc-Test_P_00038_LEFT_CC,BENIGN,0,1
1,Calc-Test_P_00038_LEFT_MLO,BENIGN,0,1
2,Calc-Test_P_00038_RIGHT_CC,BENIGN_WITHOUT_CALLBACK,0,1
4,Calc-Test_P_00038_RIGHT_MLO,BENIGN_WITHOUT_CALLBACK,0,1
6,Calc-Test_P_00041_LEFT_CC,BENIGN_WITHOUT_CALLBACK,0,1
...,...,...,...,...
699,Mass-Test_P_01825_RIGHT_MLO,BENIGN_WITHOUT_CALLBACK,0,1
700,Mass-Test_P_01833_RIGHT_MLO,MALIGNANT,1,0
701,Mass-Test_P_01865_LEFT_MLO,MALIGNANT,1,0
702,Mass-Test_P_01912_RIGHT_CC,MALIGNANT,1,0


In [28]:
# Read the CBIS test metadata file that carries the additional feature_* columns.
meta_features = pd.read_csv("../../data/cbis-ddsm/meta/cbis_test_with_features.csv")

In [29]:
# Join the label table with the feature table on "Subject ID"
# (default inner join: only IDs present in both frames are kept).
full_meta = pd.merge(meta, meta_features, on="Subject ID")

In [30]:
# Join in the metadata returned with whole_image_labels=False, then keep only
# the first row for each "Subject ID" so every image appears once.
full_meta = (
    full_meta
    .merge(get_cbis_test(whole_image_labels=False), on="Subject ID")
    .drop_duplicates(subset=["Subject ID"])
)

In [31]:
# Load the model predictions for the CBIS test set.
meta_preds = pd.read_csv("../../data/cbis-ddsm/results/end2end_cbis_test_results.csv")

# Derive the merge key from "Filename": take the second "/"-separated component
# and keep the text before its first ".".
second_path_component = meta_preds["Filename"].str.split("/").str[1]
meta_preds["Subject ID"] = second_path_component.str.split(".").str[0]

In [32]:
# Attach the prediction columns, matching on "Subject ID". merge() defaults to an
# inner join, so rows without a matching ID on both sides are dropped.
full_meta = full_meta.merge(meta_preds, on="Subject ID")

In [33]:
# List the columns available after the merges; the _x/_y suffixes mark a column
# name ("pathology") that was present on both sides of a merge.
full_meta.columns

Index(['Subject ID', 'pathology_x', 'true_malignant', 'true_benign',
       'feature_text', 'feature_mole_circle', 'feature_nipple_dot',
       'feature_scar_line', 'feature_triangle', 'feature_arrow',
       'feature_ruler', 'patient_id', 'breast_density', 'left or right breast',
       'image view', 'abnormality id', 'abnormality type', 'calc type',
       'calc distribution', 'assessment', 'pathology_y', 'subtlety',
       'image file path', 'cropped image file path', 'ROI mask file path',
       'mass shape', 'mass margins', 'Series UID', 'Collection',
       '3rd Party Analysis', 'Data Description URI', 'Study UID',
       'Study Description', 'Study Date', 'Series Description', 'Manufacturer',
       'Modality', 'SOP Class Name', 'SOP Class UID', 'Number of Images',
       'Unnamed: 14', 'File Size', 'File Location', 'Download Timestamp',
       'Filename', 'true_neg', 'true_pos', 'res_pred_neg', 'res_pred_pos',
       'vgg_pred_neg', 'vgg_pred_pos', 'hybrid_pred_neg', 'hybrid_pr

In [34]:
# Column Subset
full_meta = full_meta[
    [
        "true_malignant",
        "true_benign",
        "feature_text",
        "feature_nipple_dot",
        "feature_scar_line",
        "feature_ruler",
        "abnormality type",
        "hybrid_pred_pos_aug"
    ]
]

In [37]:
# Regressors for the multivariate linear regressions fitted with statsmodels.
# statsmodels does not add an intercept on its own, so an explicit constant
# column is added to the frame and listed as the last regressor.
Xs = [
    "feature_text",
    "feature_nipple_dot",
    "feature_scar_line",
    "feature_ruler",
    "const",
]
full_meta = sm.add_constant(full_meta)

  x = pd.concat(x[::order], 1)


In [38]:
# Inspect the frame after add_constant: a leading "const" column is present and
# the feature_* columns still contain NaN at this point.
full_meta

Unnamed: 0,const,true_malignant,true_benign,feature_text,feature_nipple_dot,feature_scar_line,feature_ruler,abnormality type,hybrid_pred_pos_aug
0,1.0,0,1,1.0,1.0,,,calcification,0.443646
1,1.0,0,1,1.0,,,,calcification,0.825486
2,1.0,0,1,1.0,1.0,,,calcification,0.686737
3,1.0,0,1,,1.0,,,calcification,0.240121
4,1.0,0,1,1.0,1.0,,,calcification,0.121573
...,...,...,...,...,...,...,...,...,...
640,1.0,0,1,1.0,,,,mass,0.018770
641,1.0,1,0,1.0,1.0,,,mass,0.746999
642,1.0,1,0,1.0,,,1.0,mass,0.445542
643,1.0,1,0,,,,,mass,0.550366


In [39]:
# Zero-fill only the feature_* indicator columns (they hold 1.0 or NaN;
# presumably NaN means the marker is absent - TODO confirm with the annotation file).
# A blanket fillna(0) would also turn a missing prediction or label into 0,
# which would silently bias the regressions below instead of surfacing the gap.
feature_columns = [col for col in full_meta.columns if str(col).startswith("feature_")]
full_meta = full_meta.fillna({col: 0 for col in feature_columns})

# Experiment: With binary label as response variable

## First, attempt to fit on whole set

In [40]:
# Regress the binary label on the regressors in Xs over the whole set.
# hasconst=True tells statsmodels the design matrix already includes the constant.
# NOTE(review): OLS on a 0/1 response is a linear probability model; the default
# nonrobust standard errors assume homoskedasticity - consider cov_type="HC3" or sm.Logit.
est = sm.OLS(
    endog=full_meta["true_malignant"],
    exog=full_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:         true_malignant   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     5.729
Date:                Fri, 13 May 2022   Prob (F-statistic):           0.000156
Time:                        21:21:50   Log-Likelihood:                -446.00
No. Observations:                 645   AIC:                             902.0
Df Residuals:                     640   BIC:                             924.3
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.0598      0

## Subsets

In [41]:
# Split the rows by the value of the "abnormality type" column.
abnormality_type = full_meta["abnormality type"]
calc_meta = full_meta[abnormality_type == "calcification"]
mass_meta = full_meta[abnormality_type == "mass"]

In [42]:
# Calcification rows only: binary label regressed on the regressors in Xs.
est = sm.OLS(
    endog=calc_meta["true_malignant"],
    exog=calc_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:         true_malignant   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     3.521
Date:                Fri, 13 May 2022   Prob (F-statistic):            0.00801
Time:                        21:21:52   Log-Likelihood:                -195.36
No. Observations:                 284   AIC:                             400.7
Df Residuals:                     279   BIC:                             419.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.1574      0

In [43]:
# Mass rows only: binary label regressed on the regressors in Xs.
est = sm.OLS(
    endog=mass_meta["true_malignant"],
    exog=mass_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:         true_malignant   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                  0.080
Method:                 Least Squares   F-statistic:                     8.829
Date:                Fri, 13 May 2022   Prob (F-statistic):           8.32e-07
Time:                        21:21:53   Log-Likelihood:                -237.82
No. Observations:                 361   AIC:                             485.6
Df Residuals:                     356   BIC:                             505.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text           0.0264      0

# Now with continuous ML prediction probs as target

In [48]:
# Re-inspect the frame used below; the feature_* columns now show 0.0 where they were NaN.
full_meta

Unnamed: 0,const,true_malignant,true_benign,feature_text,feature_nipple_dot,feature_scar_line,feature_ruler,abnormality type,hybrid_pred_pos_aug
0,1.0,0,1,1.0,1.0,0.0,0.0,calcification,0.443646
1,1.0,0,1,1.0,0.0,0.0,0.0,calcification,0.825486
2,1.0,0,1,1.0,1.0,0.0,0.0,calcification,0.686737
3,1.0,0,1,0.0,1.0,0.0,0.0,calcification,0.240121
4,1.0,0,1,1.0,1.0,0.0,0.0,calcification,0.121573
...,...,...,...,...,...,...,...,...,...
640,1.0,0,1,1.0,0.0,0.0,0.0,mass,0.018770
641,1.0,1,0,1.0,1.0,0.0,0.0,mass,0.746999
642,1.0,1,0,1.0,0.0,0.0,1.0,mass,0.445542
643,1.0,1,0,0.0,0.0,0.0,0.0,mass,0.550366


In [49]:
# Split the rows by the true label: true_malignant == 1 versus true_malignant == 0.
true_label = full_meta["true_malignant"]
mal_meta = full_meta[true_label == 1]
ben_meta = full_meta[true_label == 0]

## Malignant

In [53]:
# Malignant rows: model output (hybrid_pred_pos_aug) regressed on the regressors in Xs.
est = sm.OLS(
    endog=mal_meta["hybrid_pred_pos_aug"],
    exog=mal_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                             OLS Regression Results                            
Dep. Variable:     hybrid_pred_pos_aug   R-squared:                       0.049
Model:                             OLS   Adj. R-squared:                  0.035
Method:                  Least Squares   F-statistic:                     3.355
Date:                 Sat, 14 May 2022   Prob (F-statistic):             0.0106
Time:                         11:33:08   Log-Likelihood:                -62.905
No. Observations:                  264   AIC:                             135.8
Df Residuals:                      259   BIC:                             153.7
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.0

## Benign

In [54]:
# Benign rows: model output (hybrid_pred_pos_aug) regressed on the regressors in Xs.
est = sm.OLS(
    endog=ben_meta["hybrid_pred_pos_aug"],
    exog=ben_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                             OLS Regression Results                            
Dep. Variable:     hybrid_pred_pos_aug   R-squared:                       0.177
Model:                             OLS   Adj. R-squared:                  0.168
Method:                  Least Squares   F-statistic:                     20.19
Date:                 Sat, 14 May 2022   Prob (F-statistic):           4.45e-15
Time:                         11:34:15   Log-Likelihood:                -127.95
No. Observations:                  381   AIC:                             265.9
Df Residuals:                      376   BIC:                             285.6
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.2

# Attempts without subsetting by true label

## Whole set

In [45]:
# All rows, no split by true label: model output regressed on the regressors in Xs.
est = sm.OLS(
    endog=full_meta["hybrid_pred_pos_aug"],
    exog=full_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                             OLS Regression Results                            
Dep. Variable:     hybrid_pred_pos_aug   R-squared:                       0.097
Model:                             OLS   Adj. R-squared:                  0.091
Method:                  Least Squares   F-statistic:                     17.15
Date:                 Fri, 13 May 2022   Prob (F-statistic):           2.25e-13
Time:                         21:22:20   Log-Likelihood:                -257.96
No. Observations:                  645   AIC:                             525.9
Df Residuals:                      640   BIC:                             548.3
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.1

## Mass

In [46]:
# Mass rows only: model output regressed on the regressors in Xs.
est = sm.OLS(
    endog=mass_meta["hybrid_pred_pos_aug"],
    exog=mass_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                             OLS Regression Results                            
Dep. Variable:     hybrid_pred_pos_aug   R-squared:                       0.089
Model:                             OLS   Adj. R-squared:                  0.079
Method:                  Least Squares   F-statistic:                     8.713
Date:                 Fri, 13 May 2022   Prob (F-statistic):           1.02e-06
Time:                         21:30:12   Log-Likelihood:                -139.21
No. Observations:                  361   AIC:                             288.4
Df Residuals:                      356   BIC:                             307.9
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.1

## Calcification

In [47]:
# Calcification rows only: model output regressed on the regressors in Xs.
est = sm.OLS(
    endog=calc_meta["hybrid_pred_pos_aug"],
    exog=calc_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                             OLS Regression Results                            
Dep. Variable:     hybrid_pred_pos_aug   R-squared:                       0.133
Model:                             OLS   Adj. R-squared:                  0.121
Method:                  Least Squares   F-statistic:                     10.70
Date:                 Fri, 13 May 2022   Prob (F-statistic):           4.40e-08
Time:                         21:30:12   Log-Likelihood:                -110.96
No. Observations:                  284   AIC:                             231.9
Df Residuals:                      279   BIC:                             250.2
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.2