# MNIST Standard Biological Images: Linear Regression

In [196]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import time
import re

In [197]:
def flatten_X_data(contained_array, x_data):
    """Flatten one array so it can be handed to a scikit-learn estimator.

    Arrays with three or more dimensions are treated as stacks of images and
    reshaped to ``(n_samples, values_per_sample)``. For 3-D and 4-D inputs a
    one-line description of the stack is printed.

    Arrays with fewer dimensions are treated as labels. A single-column
    ``(n, 1)`` array (or any 0-D/1-D array) is raveled to 1-D. A 2-D array
    with more than one column is returned unchanged, so that every row stays
    aligned with its sample instead of being spread over ``n * k`` entries.

    Parameters
    ----------
    contained_array : str
        Name of the array; used only in the printed description.
    x_data : numpy.ndarray
        The array to flatten.

    Returns
    -------
    numpy.ndarray
        The flattened (or, for multi-column 2-D input, unchanged) array.
    """
    x_shape = x_data.shape
    n_dims = len(x_shape)

    dim = ""
    if n_dims == 4:
        # A trailing axis of length 3 is reported as colour channels;
        # any other length is reported as a volumetric (3D) image.
        if x_shape[3] == 3:
            dim = "RGB images:"
        else:
            dim = "3D images: "
        dim = dim + str(x_shape[1]) + "x" + str(x_shape[2]) + "x" + str(x_shape[3])
    elif n_dims == 3:
        dim = "2D images: " + str(x_shape[1]) + "x" + str(x_shape[2])
    if dim != "":
        print(f"  {contained_array}: {x_shape[0]} {dim}")

    if n_dims > 2:
        # Collapse every axis after the sample axis into one feature axis.
        flat_sz = 1
        for axis_len in x_shape[1:]:
            flat_sz *= axis_len
        return x_data.reshape(-1, flat_sz)

    if n_dims == 2 and x_shape[1] > 1:
        # Several values per sample: raveling would break the one-row-per-
        # sample correspondence with the image array, so keep the 2-D shape.
        return x_data

    # Single-column (or already 1-D) labels become a plain 1-D vector.
    return np.ravel(x_data, order='C')

In [198]:
def load_and_flatten(fn):
    """Load every array from an ``.npz`` archive and flatten each one.

    Each contained array is passed through ``flatten_X_data``. Arrays whose
    name contains ``"labels"`` are collected as labels; all others are
    collected as data.

    Parameters
    ----------
    fn : str
        Path of the ``.npz`` archive to read.

    Returns
    -------
    tuple of (dict, dict)
        ``(data_arr, label_arr)``; each maps an array name to its flattened
        array, in the order the archive lists them.
    """
    data_arr = {}
    label_arr = {}

    # NOTE(security): allow_pickle=True permits unpickling object arrays,
    # which can execute arbitrary code. Only load archives from a trusted
    # source, or switch to allow_pickle=False if no object arrays are needed.
    #
    # The context manager closes the archive's file handle once all arrays
    # have been read into memory.
    with np.load(fn, allow_pickle=True) as archive:
        for contained_array in archive.files:
            arr = flatten_X_data(contained_array, archive[contained_array])
            if "labels" in contained_array:
                label_arr[contained_array] = arr
            else:
                data_arr[contained_array] = arr

    return data_arr, label_arr

In [199]:
def plt_regress(title, X_train, y_train, mdl):
    """Plot the model's predictions for ``X_train`` as a red line and show it.

    Parameters
    ----------
    title : str
        Figure title.
    X_train : array-like
        Values used both as the x-coordinates and as the input to
        ``mdl.predict``.
    y_train : array-like
        Not used by the current body.
    mdl : object
        Anything exposing ``predict(X)``.

    NOTE(review): the x-axis is labelled "Predict" and the y-axis "True",
    while the plotted quantities are ``X_train`` (x) and the predictions (y).
    Confirm whether the labels or the plotted data reflect the intent.
    """
    predicted = mdl.predict(X_train)
    plt.plot(X_train, predicted, color='red', label='Regression line')

    plt.title(title)
    plt.ylabel("True")
    plt.xlabel("Predict")
    plt.legend()
    plt.show()

In [200]:
def abline(slope, intercept):
    """Draw the dashed line ``y = intercept + slope * x`` on the current axes.

    The line is evaluated at the two ends of the current axes' x-limits.
    """
    x_ends = np.array(plt.gca().get_xlim())
    plt.plot(x_ends, intercept + slope * x_ends, '--')

In [201]:
def LinRegress(title, X_train, y_train, X_test, y_test):
    """Fit a ``LinearRegression`` model and report its R-squared on the test set.

    Prints the fit time, the scoring time, the R-squared score, and the
    shapes of the training arrays.

    Parameters
    ----------
    title : str
        Not used by the current body; the plotting calls that would consume
        it are disabled.
    X_train, y_train : numpy.ndarray
        Training features and targets passed to ``model.fit``.
    X_test, y_test : numpy.ndarray
        Held-out features and targets passed to ``model.score``.

    Returns
    -------
    float
        The value returned by ``model.score(X_test, y_test)``.
    """
    model = LinearRegression()

    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is wall-clock time and can jump or be coarse.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    print(f"  Fit time:     {round(elapsed, 2)} seconds")

    # Score on the test split.
    start = time.perf_counter()
    r_squared = model.score(X_test, y_test)
    elapsed = time.perf_counter() - start
    print(f"  Score time:     {round(elapsed, 2)} seconds")
    print(f"\nR-squared score: {r_squared}")

    print(X_train.shape, y_train.shape)
    # Plotting is currently disabled:
    #plt_regress(title, X_train, y_train, model)
    #abline(model.coef_[0], model.intercept_)

    return r_squared

In [202]:
def _select_split(arrays, split, position):
    """Return the array whose key starts with ``split``, else the one at ``position``."""
    for key, value in arrays.items():
        if key.startswith(split):
            return value
    # No key carries the split name: fall back to archive order.
    return arrays[list(arrays)[position]]


def LinearRegress(fn):
    """Run the linear-regression experiment on one ``.npz`` dataset file.

    Prints a banner, loads and flattens the archive with
    ``load_and_flatten``, picks the training and test splits, and delegates
    fitting and scoring to ``LinRegress``. Any validation split in the
    archive is not used.

    The splits are chosen by key prefix (``"train"`` / ``"test"``) so the
    result does not depend on the order in which the archive lists its
    arrays. If no key has the prefix, the first (train) and third (test)
    arrays are used.

    Parameters
    ----------
    fn : str
        Path of the ``.npz`` archive.

    Returns
    -------
    float
        The R-squared score returned by ``LinRegress``.
    """
    mdl_type = "Linear Regression"
    title = fn + ": " + mdl_type
    print("\n-------------------------------------------------------")
    print(f"{title}")
    features, labels = load_and_flatten(fn)

    X_train = _select_split(features, "train", 0)
    y_train = _select_split(labels, "train", 0)
    X_test = _select_split(features, "test", 2)
    y_test = _select_split(labels, "test", 2)

    r_squared = LinRegress(title, X_train, y_train, X_test, y_test)

    return r_squared

In [203]:
# NOTE: 'chestmnist.npz' is left out: image, label size length mismatch
files = [
    'bloodmnist.npz',
    'breastmnist.npz',
    'dermamnist.npz',
    'octmnist.npz',
    'organsmnist.npz',
    'nodulemnist3d.npz',
    'organmnist3d.npz',
    'retinamnist_128.npz',
]
# For a quick run: files = ['bloodmnist.npz', 'breastmnist.npz']

# R-squared score per dataset file, keyed by file name.
rscores = {}
for file_name in files:
    rsq = LinearRegress(file_name)
    rscores[file_name] = rsq



-------------------------------------------------------
bloodmnist.npz: Linear Regression
  train_images: 11959 RGB images:28x28x3
  val_images: 1712 RGB images:28x28x3
  test_images: 3421 RGB images:28x28x3
  Fit time:     1.16 seconds
  Score time:     0.01 seconds

R-squared score: 0.5465406264237447
(11959, 2352) (11959,)

-------------------------------------------------------
breastmnist.npz: Linear Regression
  train_images: 546 2D images: 28x28
  val_images: 78 2D images: 28x28
  test_images: 156 2D images: 28x28
  Fit time:     0.06 seconds
  Score time:     0.0 seconds

R-squared score: -2.1374319225069938
(546, 784) (546,)

-------------------------------------------------------
dermamnist.npz: Linear Regression
  train_images: 7007 RGB images:28x28x3
  val_images: 1003 RGB images:28x28x3
  test_images: 2005 RGB images:28x28x3
  Fit time:     0.96 seconds
  Score time:     0.01 seconds

R-squared score: -0.025539300975444323
(7007, 2352) (7007,)

---------------------------

In [204]:
# Print one line per dataset with its R-squared score.
print("\nLinear Regression R-Squared Score Summary")
for file_name in files:
    # Names of 12 characters or fewer get a second tab before the score.
    tb = "\t\t" if len(file_name) <= 12 else "\t"
    print(f"  {file_name}:{tb}{rscores[file_name]}")


Linear Regression R-Squared Score Summary
  bloodmnist.npz:	0.5465406264237447
  breastmnist.npz:	-2.1374319225069938
  dermamnist.npz:	-0.025539300975444323
  octmnist.npz:		0.11412123448005118
  organsmnist.npz:	0.019899817857234825
  nodulemnist3d.npz:	-0.1880624675111926
  organmnist3d.npz:	0.4455104158434551
  retinamnist_128.npz:	-0.09576155424730537
