In [None]:
import pandas as pd
import numpy as np
import os
import urllib.request

# Download every image referenced in bold_data.xlsx into data/images.
# Each entry of the "image_urls" column is split on "|" into individual URLs;
# the last path segment of a URL is used as the local file name.
image_dir = "data/images"

data = pd.read_excel("data/bold_data.xlsx")
rows = np.array(data["image_urls"])

# Create the target directory on a first run so os.listdir() cannot fail.
os.makedirs(image_dir, exist_ok=True)
downloaded = os.listdir(image_dir)
# Set for O(1) membership checks; also updated while downloading so a URL
# that appears in several rows is only fetched once.
downloaded_names = set(downloaded)

# Download images from bold_data.xlsx
for r in rows:
    # Rows without a URL are read as NaN (a float), which has no .split().
    if not isinstance(r, str):
        continue

    for url in r.split("|"):
        url = url.strip()
        file_name = url.split("/")[-1]

        # Nothing usable to save under (empty entry or URL ending in "/")
        if not file_name:
            continue

        # Skip file if already downloaded
        if file_name in downloaded_names:
            continue

        target = f"{image_dir}/{file_name}"
        # Handle failures per URL so one bad link does not abandon the
        # remaining URLs of the same row.
        try:
            urllib.request.urlretrieve(url, target)
        except Exception as e:
            print(url, e)
            # Remove a partially written file; otherwise the next run would
            # treat it as already downloaded.
            if os.path.exists(target):
                os.remove(target)
            continue

        downloaded_names.add(file_name)
        print(url)

In [None]:
import torch
import os

# Load the pretrained ResNet-101 from the torchvision hub.
model = torch.hub.load("pytorch/vision:v0.10.0", "resnet101", pretrained=True)

# Keep every child module except the last one and chain the rest into a
# new sequential model.
feature_layers = list(model.children())[:-1]
model = torch.nn.Sequential(*feature_layers)

# Switch to evaluation mode (as the last expression this also displays the model).
model.eval()

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import transforms

# Run every downloaded image through `model` and store the resulting
# 2048-value feature vector together with the taxonomy labels of the matching
# record in bold_data.xlsx.
# Relies on `os`, `torch` and `model` defined by the earlier cells.
directory = "data/images"
images = os.listdir(directory)
# keep_default_na=False (the original passed the equally falsy "") stops
# pandas from turning its default NA strings into NaN.
data = pd.read_excel("data/bold_data.xlsx", keep_default_na=False)

label_columns = [
    "processid",
    "class_name",
    "order_name",
    "family_name",
    "genus_name",
    "species_name",
]

# The preprocessing pipeline does not depend on the image: build it once.
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# move the model to GPU for speed if available (once, not per image)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Collect rows in lists and build the arrays once at the end; stacking onto a
# growing array inside the loop copies all previous rows on every iteration.
features = []
label_rows = []

for image in images:
    # Strip the extension whatever its length (".jpg", ".jpeg", ...)
    image_id = os.path.splitext(image)[0]
    print(f"Processing {image_id}")

    # Match image with corresponding labels before doing any expensive work.
    # na=False makes non-string cells count as "no match".
    matches = data.loc[
        data["image_urls"].str.contains(image_id, regex=False, na=False)
    ]
    if matches.empty:
        print("Skipping", image_id)
        continue
    # Only the first matching record is used
    labels = matches[label_columns].iloc[0].tolist()

    try:
        # The context manager closes the file handle; convert("RGB") gives the
        # three channels that Normalize is configured for, also for
        # grayscale or RGBA files.
        with Image.open(f"{directory}/{image}") as input_image:
            input_tensor = preprocess(input_image.convert("RGB"))

        # create a mini-batch as expected by the model
        input_batch = input_tensor.unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(input_batch)
    except (RuntimeError, OSError) as e:
        # OSError covers files PIL cannot read; skip them instead of
        # aborting the whole run.
        print("Error:", image_id, e)
        continue

    # .cpu() is required before .numpy() when the model ran on the GPU
    features.append(output[0].cpu().numpy().reshape(2048))
    label_rows.append([image_id, *labels])

# reshape keeps the arrays two-dimensional even when nothing was processed
X_data = np.asarray(features, dtype=np.float64).reshape(-1, 2048)
Y_data = np.asarray(label_rows, dtype=object).reshape(-1, 7)

# Save X_data and Y_data
np.savetxt("data/X_data.txt", X_data, delimiter="\t", fmt="%s")
header = "\t".join(["id", "processid", "class", "order", "family", "genus", "species"])
np.savetxt(
    "data/Y_data.txt", Y_data, header=header, comments="", delimiter="\t", fmt="%s"
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from joblib import dump

# Split the extracted features into training and test sets, standardize them,
# reduce their dimensionality with PCA, and save the fitted transformers and
# the resulting data sets.
X = pd.read_csv("data/X_data.txt", delimiter="\t", header=None)

columns = ["class", "order", "family", "genus", "species"]
# NOTE(review): read_csv's default NA handling parses empty label fields as
# NaN, so they are written out as "nan" below - confirm that is intended.
Y = np.array(pd.read_csv("data/Y_data.txt", delimiter="\t")[columns])

# 80/20 split for training/test
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Standardize to mean of 0 and standard deviation of 1
# (fitted on the training split only, then applied to both splits)
scaler = StandardScaler()
scaler.fit(X_train)
dump(scaler, "scaler.joblib")
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce to 500 dimensions, or fewer when the training data cannot support
# 500 components (PCA rejects n_components > min(n_samples, n_features)).
n_components = min(500, *X_train_scaled.shape)
# random_state makes the fit reproducible when PCA picks its randomized
# solver, in line with the seeded split above.
pca = PCA(n_components=n_components, random_state=42)
pca.fit(X_train_scaled)
dump(pca, "pca.joblib")
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Save training and test data
np.savetxt("data/X_train_pca.txt", X_train_pca, delimiter="\t")
np.savetxt("data/X_test_pca.txt", X_test_pca, delimiter="\t")
np.savetxt("data/Y_train.txt", Y_train, delimiter="\t", fmt="%s")
np.savetxt("data/Y_test.txt", Y_test, delimiter="\t", fmt="%s")