In [62]:
import sys
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

In [63]:
# Inserting parent folder in sys, to allow imports
sys.path.append("..")
from load_preprocess.load_meta import get_cbis_test
from load_preprocess.load_meta import get_inbreast

In [64]:
# Load the CBIS-DDSM test-set label table through the project loader.
# NOTE(review): whole_image_labels=True presumably yields one label row per
# image (Subject ID) — confirm in load_preprocess.load_meta.
meta = get_cbis_test(whole_image_labels=True)

In [65]:
# Preview the loaded label table (Subject ID, pathology and the two label columns).
meta

Unnamed: 0,Subject ID,pathology,true_malignant,true_benign
0,Calc-Test_P_00038_LEFT_CC,BENIGN,0,1
1,Calc-Test_P_00038_LEFT_MLO,BENIGN,0,1
2,Calc-Test_P_00038_RIGHT_CC,BENIGN_WITHOUT_CALLBACK,0,1
4,Calc-Test_P_00038_RIGHT_MLO,BENIGN_WITHOUT_CALLBACK,0,1
6,Calc-Test_P_00041_LEFT_CC,BENIGN_WITHOUT_CALLBACK,0,1
...,...,...,...,...
699,Mass-Test_P_01825_RIGHT_MLO,BENIGN_WITHOUT_CALLBACK,0,1
700,Mass-Test_P_01833_RIGHT_MLO,MALIGNANT,1,0
701,Mass-Test_P_01865_LEFT_MLO,MALIGNANT,1,0
702,Mass-Test_P_01912_RIGHT_CC,MALIGNANT,1,0


In [66]:
# Per-"Subject ID" feature table for the CBIS-DDSM test set; it is joined to the
# labels on "Subject ID" in the next step.
meta_features = pd.read_csv("../../data/cbis-ddsm/meta/cbis_test_with_features.csv")

In [67]:
# Inner join (the pandas default, spelled out here): only Subject IDs present in
# both the label table and the feature table are kept.
full_meta = meta.merge(
    meta_features,
    how="inner",
    on="Subject ID",
)

In [68]:
# Join the metadata loaded with whole_image_labels=False (presumably the source
# of the abnormality-level columns such as "abnormality type" — confirm), then
# keep only the first merged row for each Subject ID.
full_meta = (
    full_meta
    .merge(get_cbis_test(whole_image_labels=False), on="Subject ID")
    .drop_duplicates(subset=["Subject ID"])
)

In [69]:
# List the columns available after both merges (overlapping names get _x/_y suffixes).
full_meta.columns

Index(['Subject ID', 'pathology_x', 'true_malignant', 'true_benign',
       'feature_text', 'feature_mole_circle', 'feature_nipple_dot',
       'feature_scar_line', 'feature_triangle', 'feature_arrow',
       'feature_ruler', 'patient_id', 'breast_density', 'left or right breast',
       'image view', 'abnormality id', 'abnormality type', 'calc type',
       'calc distribution', 'assessment', 'pathology_y', 'subtlety',
       'image file path', 'cropped image file path', 'ROI mask file path',
       'mass shape', 'mass margins', 'Series UID', 'Collection',
       '3rd Party Analysis', 'Data Description URI', 'Study UID',
       'Study Description', 'Study Date', 'Series Description', 'Manufacturer',
       'Modality', 'SOP Class Name', 'SOP Class UID', 'Number of Images',
       'Unnamed: 14', 'File Size', 'File Location', 'Download Timestamp'],
      dtype='object')

In [70]:
# Keep only what the regressions below need: the two label columns, the four
# feature_* columns used as regressors, and the abnormality type used to split
# the data into subsets.
full_meta = full_meta[
    [
        "true_malignant",
        "true_benign",
        "feature_text",
        "feature_nipple_dot",
        "feature_scar_line",
        "feature_ruler",
        "abnormality type",
    ]
]

In [71]:
# Regressors for the multivariate linear regressions: the four selected
# feature_* columns plus the intercept column "const".
Xs = [
    "feature_text",
    "feature_nipple_dot",
    "feature_scar_line",
    "feature_ruler",
    "const",
]

# statsmodels does not add an intercept on its own, so add an explicit constant
# column ("const") to the frame; it is referenced through `Xs` when fitting.
full_meta = sm.add_constant(full_meta)

  x = pd.concat(x[::order], 1)


In [72]:
# Inspect the frame after the column subset and the added `const` column.
full_meta

Unnamed: 0,const,true_malignant,true_benign,feature_text,feature_nipple_dot,feature_scar_line,feature_ruler,abnormality type
0,1.0,0,1,1.0,1.0,,,calcification
1,1.0,0,1,1.0,,,,calcification
2,1.0,0,1,1.0,1.0,,,calcification
4,1.0,0,1,,1.0,,,calcification
6,1.0,0,1,1.0,1.0,,,calcification
...,...,...,...,...,...,...,...,...
699,1.0,0,1,1.0,,,,mass
700,1.0,1,0,1.0,1.0,,,mass
701,1.0,1,0,1.0,,,1.0,mass
702,1.0,1,0,,,,,mass


In [73]:
# Replace every remaining NaN with 0 before fitting (in the preview above the
# NaNs sit in the feature_* columns, i.e. NaN is treated as "feature absent").
# NOTE(review): fillna(0) applies to all columns, not only the feature_* ones —
# confirm that is intended.
full_meta = full_meta.fillna(0)

## First, attempt to fit on whole set

In [74]:
# Fit an ordinary-least-squares regression of `true_malignant` on the columns
# listed in `Xs`, using the whole table. `hasconst = True` tells statsmodels
# that `Xs` already contains the intercept column ("const").
est = sm.OLS(full_meta["true_malignant"], full_meta[Xs], hasconst = True).fit()
# Reading the summary: the R-squared (top-right) is the share of the variance
# in `true_malignant` that the regressors explain. The "P>|t|" column gives the
# (non Bonferroni corrected) p-value of each regressor *when all the other
# regressors are held constant*; a large p-value means that regressor shows no
# detectable linear association with malignancy beyond the others.
# NOTE(review): `true_malignant` looks binary (0/1 in the preview), which makes
# this a linear probability model — confirm OLS rather than a logit is intended.
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:         true_malignant   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     5.729
Date:                Tue, 03 May 2022   Prob (F-statistic):           0.000156
Time:                        10:13:49   Log-Likelihood:                -446.00
No. Observations:                 645   AIC:                             902.0
Df Residuals:                     640   BIC:                             924.3
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.0598      0

## Subsets

In [75]:
# Split the table by abnormality type so each kind can be fitted separately.
calc_meta = full_meta.loc[full_meta["abnormality type"].eq("calcification")]
mass_meta = full_meta.loc[full_meta["abnormality type"].eq("mass")]

In [77]:
# Calcification rows only: OLS of `true_malignant` on the `Xs` columns, with the
# intercept supplied through the "const" column (hence hasconst=True).
est = sm.OLS(
    endog=calc_meta["true_malignant"],
    exog=calc_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:         true_malignant   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     3.521
Date:                Tue, 03 May 2022   Prob (F-statistic):            0.00801
Time:                        10:15:24   Log-Likelihood:                -195.36
No. Observations:                 284   AIC:                             400.7
Df Residuals:                     279   BIC:                             419.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text          -0.1574      0

In [79]:
# Mass rows only: OLS of `true_malignant` on the `Xs` columns, with the
# intercept supplied through the "const" column (hence hasconst=True).
est = sm.OLS(
    endog=mass_meta["true_malignant"],
    exog=mass_meta[Xs],
    hasconst=True,
).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:         true_malignant   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                  0.080
Method:                 Least Squares   F-statistic:                     8.829
Date:                Tue, 03 May 2022   Prob (F-statistic):           8.32e-07
Time:                        10:16:12   Log-Likelihood:                -237.82
No. Observations:                 361   AIC:                             485.6
Df Residuals:                     356   BIC:                             505.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
feature_text           0.0264      0