In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import matplotlib.patheffects as PathEffects
from pathlib import Path
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Prefer an importable `pad` package; if there is none, fall back to the copy
# under ../src (a relative path, so it depends on the kernel's working directory).
try:
    import pad
except ModuleNotFoundError:
    # Put ../src at the front of the module search path, then retry the import.
    sys.path.insert(0, "../src")
    import pad

In [None]:
def check_if_dir_object_in(dir_object, directories):
    """Tell whether ``dir_object.path_to_dir`` is one of ``directories``.

    Every entry of ``directories`` and the object's own path are wrapped in
    ``pathlib.Path`` before comparing, so the match follows ``Path`` equality
    rather than raw string equality.

    Returns:
        bool: True when the object's directory is among ``directories``.
    """
    known_paths = tuple(map(Path, directories))
    own_path = Path(dir_object.path_to_dir)
    return own_path in known_paths

In [None]:
# Directory that receives every file this notebook writes (data.csv and the figures).
# The value is a placeholder: point it at a real location before running.
OUTPUT_DIRECTORY = "C:\\wherever\\you\\want"
pad.create_output_dir_if_needed(OUTPUT_DIRECTORY)  # presumably creates the directory when it is missing — confirm in pad

In [None]:
# Image directories labeled as positives (lactose present, going by the folder names).
POSITIVE_DIRECTORIES = [
    "C:\\Users\\pete.buttigieg\\old_lactose_pad_images",
    "C:\\Users\\pete.buttigieg\\fresh_lactose_pad_images",
    "C:\\Users\\pete.buttigieg\\10_percent_lactose_pad_images",
    "C:\\Users\\pete.buttigieg\\20_percent_lactose_pad_images",
    "C:\\Users\\pete.buttigieg\\50_percent_lactose_pad_images",
    "C:\\Users\\pete.buttigieg\\100_percent_lactose_pad_images"
]
# Image directories labeled as negatives (starch or 0% lactose, going by the folder names).
NEGATIVE_DIRECTORIES = [
    "C:\\Users\\pete.buttigieg\\old_starch_pad_images",
    "C:\\Users\\pete.buttigieg\\fresh_starch_pad_images",
    "C:\\Users\\pete.buttigieg\\0_percent_lactose_pad_images"
]
# Every labeled directory; used further down to drop loaded directories that
# appear in neither list.
# NOTE(review): the two separate lists are not referenced again in the visible
# cells — confirm where the per-sample "Target" column gets its value.
ALL_DIRECTORIES = POSITIVE_DIRECTORIES + NEGATIVE_DIRECTORIES

Here's an example settings file.

```
[
    {
        "directory": "C:\\Users\\pete.buttigieg\\10_percent_lactose_pad_images",
        "title": "10% Lactose",
        "box_settings": {
            "box_width": 40,
            "box_height": 150,
            "num_boxes": 12,
            "offset_top": 1450,
            "offset_left": 505,
            "horizontal_spacer": 115
        },
        "lanes_to_sample": [
            "A",
            "B",
            "C",
            "D",
            "E",
            "F",
            "G",
            "H",
            "I",
            "J",
            "K",
            "L"
        ]
    },
    {
        "directory": "C:\\Users\\pete.buttigieg\\20_percent_lactose_pad_images",
        "title": "20% Lactose",
        "box_settings": {
            "box_width": 40,
            "box_height": 150,
            "num_boxes": 12,
            "offset_top": 1520,
            "offset_left": 550,
            "horizontal_spacer": 120
        },
        "lanes_to_sample": [
            "A",
            "B",
            "C",
            "D",
            "E",
            "F",
            "G",
            "H",
            "I",
            "J",
            "K",
            "L"
        ]
    }
]
```

In [None]:
# Read the main settings file; its JSON content is a list with one entry per
# image directory.
settings_path = Path("C:\\Users\\pete.buttigieg\\settings.json")
settings = json.loads(settings_path.read_text())

You can utilize multiple settings files. I recommend one per directory.

In [None]:
# Read an additional settings file that lives next to the images it describes.
more_settings_path = Path("C:\\Users\\pete.buttigieg\\old_starch_pad_images\\settings.json")
more_settings = json.loads(more_settings_path.read_text())

In [None]:
# Merge the extra entries into `settings`. Entries that are already present are
# skipped, so re-running this cell cannot register the same directory twice
# (a bare `extend` appends the same entries again on every run).
settings.extend([entry for entry in more_settings if entry not in settings])

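Load the image directories described by the settings. Run this cell only once: the cells below filter and modify `image_dir_objects`, and re-running this cell replaces it with freshly loaded objects.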

In [None]:
# Load the directories described by `settings` (presumably one object per
# settings entry — confirm in pad). The cells below read path_to_dir,
# image_filenames, box_settings and lanes_to_sample from each returned object.
image_dir_objects = pad.load_multiple_directories(settings)

Only keep directories we labeled as true positives or true negatives.

In [None]:
# Keep only the directory objects whose path is one of the labeled directories.
# The labeled paths are converted to Path objects once, up front, and stored in
# a set, instead of rebuilding the whole list for every directory object.
labeled_paths = {Path(directory) for directory in ALL_DIRECTORIES}
image_dir_objects = [
    dir_object
    for dir_object in image_dir_objects
    if Path(dir_object.path_to_dir) in labeled_paths
]
print("retained {} directories".format(len(image_dir_objects)))

Keep only the most recent (i.e. most developed) image from each directory — the one with the latest timestamp. This assumes that each filename ends with a timestamp and that `image_filenames` is ordered so that the newest image comes last. Most cell phones name image files this way.

In [None]:
# Reduce every directory to the last entry of its filename list, so that only
# that one image is sampled afterwards.
for directory in image_dir_objects:
    newest_filename = directory.image_filenames[-1]
    # NOTE(review): this writes the private `_image_filenames` attribute,
    # presumably the storage behind `image_filenames` — confirm against pad.
    directory._image_filenames = [newest_filename]

In [None]:
# Sample each retained directory and attach the results to its directory object:
# the raw column data, the summary data, and a data frame built from the column data.
for directory in image_dir_objects:
    sampling_result = pad.sample_directory_and_create_output_data(
        image_filenames=directory.image_filenames,
        output_directory=OUTPUT_DIRECTORY,
        box_settings=directory.box_settings,
        lanes_to_sample=directory.lanes_to_sample,
        verbose=False,
        draw=False,
    )
    directory.column_data, directory.summary_data = sampling_result
    directory.column_data_frame = pad.create_column_data_frame(directory.column_data)

Create dataframe for analysis.

In [None]:
# Stack the per-directory frames row-wise into one frame. No ignore_index is
# passed, so each frame keeps its own row index and index values may repeat.
df = pd.concat([x.column_data_frame for x in image_dir_objects])

Save the data in case you don't want to process all those images again.

In [None]:
# Write the combined frame to data.csv in the output directory; index=False
# leaves the row index out of the file.
df.to_csv(Path(OUTPUT_DIRECTORY, "data.csv"), index=False)

You can reload the data from here.

In [None]:
# Reload the saved data. The analysis below reads the columns AverageBlue,
# AverageGreen, AverageRed, Target and Label from this frame.
# NOTE(review): no visible cell adds "Target" or "Label" — confirm that
# pad.create_column_data_frame provides them, or that data.csv was edited by hand.
df_with_label = pd.read_csv(Path(OUTPUT_DIRECTORY, "data.csv"))

In [None]:
# Display the reloaded frame as a quick visual check.
df_with_label

In [None]:
# Split the data into the feature matrix (the three colour-channel averages)
# and the class column.
features = ["AverageBlue", "AverageGreen", "AverageRed"]
target = ["Target"]
x = df_with_label[features].to_numpy()
y = df_with_label[target].to_numpy()  # 2-D with a single column, because `target` is a list

Standardize data.

In [None]:
# Fit a fresh StandardScaler on the feature matrix and replace `x` with the
# scaled values; the PCA cell below consumes this `x`.
x = StandardScaler().fit_transform(x)

In [None]:
# Project the standardized features onto two principal components and put the
# coordinates side by side with each sample's Target and Label.
component_names = ["Principal Component 1", "Principal Component 2"]
pca = PCA(n_components=len(component_names))
principal_components = pca.fit_transform(x)
pca_df = pd.concat(
    [
        pd.DataFrame(data=principal_components, columns=component_names),
        df_with_label["Target"],
        df_with_label["Label"],
    ],
    axis=1,
)

In [None]:
# Scatter plot of the two principal components, one colour per class.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("")
targets = ["Positive", "Negative"]
colors = ['g', 'r']
# The loop variable is deliberately not named `target`: that name holds the
# target-column list used when building `y`, and overwriting it here would
# break that cell on a re-run.
for target_name, color in zip(targets, colors):
    rows_to_keep = pca_df["Target"] == target_name
    ax.scatter(
        pca_df.loc[rows_to_keep, "Principal Component 1"],
        pca_df.loc[rows_to_keep, "Principal Component 2"],
        c=color,
        s=50,
        label=target_name,  # the legend entry is tied to this artist, not to call order
    )
ax.legend()
plt.tight_layout()
# Uncomment to save the figure:
# fig.savefig(Path(OUTPUT_DIRECTORY, "example.png"))

In [None]:
# Classifier inputs: the two PCA coordinates of every sample, and its class name.
component_columns = ["Principal Component 1", "Principal Component 2"]
sample_vector = pca_df.loc[:, component_columns].to_numpy()
label_vector = pca_df.loc[:, "Target"].to_numpy()

In [None]:
# Encode the class names as integers: "Negative" -> 0, anything else -> 1.
# The names are read from pca_df["Target"] rather than from `label_vector`
# itself. Re-encoding the already encoded vector would turn every entry into 1
# (the integer 0 is not equal to "Negative"), silently wiping out the negative
# class whenever this cell is run a second time.
label_vector = [0 if name == "Negative" else 1 for name in pca_df["Target"]]

In [None]:
# Publication figure: PCA scatter with one series per sample label, plus the
# decision boundary of a logistic regression fitted on the two components.
plt.rcParams["font.size"] = 10

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("")

# Series are drawn, and therefore listed in the legend, in this order.
# Labels that are missing from this list are not plotted at all.
legend_order = [
    "100% Lactose - Old PAD",
    "100% Lactose",
    "50% Lactose",
    "20% Lactose",
    "10% Lactose",
    "5% Lactose",
    "0% Lactose",
    "0% Lactose - Old PAD"
]

for label in legend_order:
    indicies_to_keep = pca_df["Label"] == label
    ax.scatter(
        pca_df.loc[indicies_to_keep, 'Principal Component 1'],
        pca_df.loc[indicies_to_keep, 'Principal Component 2'],
        s=50,
        label=label
    )

# Capture the limits chosen for the scatter before the boundary is drawn; the
# y limits are restored afterwards so the line cannot stretch the y axis.
x_limits = ax.get_xlim()
y_limits = ax.get_ylim()
x_values_for_line = np.linspace(x_limits[0], x_limits[1], 30)

lr = LogisticRegression()
lr.fit(sample_vector, label_vector)
params = lr.coef_[0]
intercept = lr.intercept_[0]

# The boundary is where params[0] * pc1 + params[1] * pc2 + intercept == 0.
if params[1] != 0:
    # Solve for pc2 as a function of pc1.
    y_values_for_line = (-1. / params[1]) * (params[0] * x_values_for_line + intercept)
    ax.plot(x_values_for_line, y_values_for_line, label="Decision Boundary")
elif params[0] != 0:
    # pc2 has no influence: the boundary is a vertical line, which cannot be
    # written as pc2 = f(pc1) without dividing by zero.
    ax.axvline(-intercept / params[0], label="Decision Boundary")
ax.set_ylim(y_limits)

# Figure size is specified in centimetres and converted to inches for matplotlib.
width_in_cm = 11
width_in_in = width_in_cm * 0.3937
height_in_cm = 10
height_in_in = height_in_cm * 0.3937
fig.set_size_inches(width_in_in, height_in_in)

# The anchor is given in figure coordinates (bbox_transform=fig.transFigure).
# NOTE(review): with loc="upper center" the legend is centred on x=0.0, the
# figure's left edge — confirm this placement is intended (x=0.5 would centre
# it under the figure).
ax.legend(
    bbox_to_anchor=(0.0, -0.14),
    frameon=False,
    ncol=2,
    bbox_transform=fig.transFigure,
    loc="upper center"
)

plt.tight_layout()

In [None]:
# Export the current figure to the output directory as an LZW-compressed TIFF
# at 600 dpi, cropped tightly around the drawn content.
tiff_path = Path(OUTPUT_DIRECTORY, "beautiful_publication_quality_figure.tiff")
fig.savefig(
    str(tiff_path),
    format="tiff",
    dpi=600,
    bbox_inches="tight",
    pil_kwargs={"compression": "tiff_lzw"},
)

In [None]:
# Human-readable tick labels for the feature axis of the PCA heatmap below.
# The order has to match `features`, because the columns of pca.components_
# follow the column order of the matrix the PCA was fitted on.
feature_labels = ["Average Blue", "Average Green", "Average Red"]

In [None]:
# Heatmap of pca.components_: one row per principal component, one column per
# feature, with every value also written into its cell.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
im = ax.imshow(pca.components_)

ax.set_yticks([0, 1])
ax.set_yticklabels(["Principal Component 1", "Principal Component 2"])
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(feature_labels)

# imshow centres the cell for (row, column) on the data coordinates
# (column, row), so centred text lands in the middle of each cell without
# hand-tuned offsets, and stays centred for wider (negative) values too.
# The loop names are deliberately not `x`/`y`: those hold the feature matrix and
# target array used by earlier cells and must survive this cell.
for row_index, component in enumerate(pca.components_):
    for column_index, loading in enumerate(component):
        text = ax.text(
            column_index,
            row_index,
            "{:.4f}".format(loading),
            ha="center",
            va="center",
        )
        # White outline keeps the number readable on dark cells.
        text.set_path_effects([PathEffects.withStroke(linewidth=3.5, foreground='w')])

fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Save the heatmap as PNG, EPS and TIFF under one consistently spelled base
# name ("explain_pca"), so the three files sort and match together.
fig.savefig(Path(OUTPUT_DIRECTORY, "explain_pca.png"), dpi=300)
fig.savefig(Path(OUTPUT_DIRECTORY, "explain_pca.eps"), format='eps')
fig.savefig(Path(OUTPUT_DIRECTORY, "explain_pca.tiff"), format='tiff', dpi=1200)