# Analysis

In [1]:
import os
import bokeh
from bokeh.plotting import show
import pandas as pd
import numpy as np
import torch

import flowkit as fk
import seaborn as sns
import matplotlib.pyplot as plt

bokeh.io.output_notebook()

In [2]:
# Load the per-subject table into `data`. Later cells read the "H1", "H3", "B"
# and "FCS Raw" columns from it.
data = pd.read_csv(filepath_or_buffer="patient_data.csv")
data

Unnamed: 0,Subject Accession,H1,H3,B,FCS Raw,FCS Preprocessed,WSP File
0,SUB120420,64,16,16,Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227...,export_Panel_L1_A1_L1_110194_PBMC_10112011_A01...,Panel_L1_Run_2.804556.wsp
1,SUB120423,256,16,16,Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447...,export_Panel_L1_A2_L1_110197_PBMC_10112011_A02...,Panel_L1_Run_1.804538.wsp
2,SUB120445,64,8,32,Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999...,export_Panel_L1_A3_L1_110243_PBMC_10172011_A03...,Panel_L1_Run_4.804574.wsp
3,SUB120446,1024,256,64,Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604...,export_Panel_L1_A1_L1_110244_PBMC_10172011_A01...,Panel_L1_Run_5.804583.wsp
4,SUB120449,1024,128,64,Panel_L1_E3_L1_110247_PBMC_10172011_E03.580335...,export_Panel_L1_E3_L1_110247_PBMC_10172011_E03...,Panel_L1_Run_4.804574.wsp
5,SUB120450,64,32,16,Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572...,export_Panel_L1_E1_L1_110248_PBMC_10172011_E01...,Panel_L1_Run_5.804583.wsp
6,SUB120452,128,128,128,Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307...,export_Panel_L1_A2_L1_110250_PBMC_10172011_A02...,Panel_L1_Run_5.804583.wsp
7,SUB120457,128,128,32,Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138...,export_Panel_L1_E2_L1_110255_PBMC_10172011_E02...,Panel_L1_Run_5.804583.wsp
8,SUB120458,256,64,32,Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670...,export_Panel_L1_A3_L1_110256_PBMC_10172011_A03...,Panel_L1_Run_5.804583.wsp
9,SUB120459,128,32,64,Panel_L1_A1_L1_110257_PBMC_10172011_A01.580299...,export_Panel_L1_A1_L1_110257_PBMC_10172011_A01...,Panel_L1_Run_6.804592.wsp


In [3]:
#Sample Extraction Function
def extractFCSDF(fileName, source = "Raw"):
    """Load one FCS file with FlowKit and return its events as a DataFrame.

    Parameters
    ----------
    fileName : str
        Name of the FCS file inside the selected sub-directory.
    source : str, default "Raw"
        Selects the directory to read from: "Preprocessed" reads from
        ``FCS/Preprocessed``; any other value (including the default) reads
        from ``FCS/Raw``.

    Returns
    -------
    The result of ``fk.Sample(path).as_dataframe(source="raw")``. Note that
    the "raw" event source is requested from FlowKit regardless of which
    directory the file was loaded from.
    """
    fcs_dir = "FCS/Preprocessed" if source == "Preprocessed" else "FCS/Raw"
    fcs_sample = fk.Sample(os.path.join(fcs_dir, fileName))
    return fcs_sample.as_dataframe(source = "raw")

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Fit one logistic-regression classifier per titer column on per-subject mean
# channel values taken from the raw FCS files, then report accuracy and
# weighted F1 on a held-out split.
#
# NOTE(review): the targets look like titers (powers of two) and are treated
# here as unordered class labels; with this few subjects most classes have one
# or two examples. Consider modelling log2(titer) as a regression/ordinal
# target instead -- TODO confirm intent.
TARGET_COLUMNS = ['H1', 'H3', 'B']
RANDOM_STATE = 10

X_values = []
Y_values = []

# One feature vector per subject: the column-wise mean over all events.
for _, row in data.iterrows():
    sample_df = extractFCSDF(row['FCS Raw'], source="Raw")
    X_values.append(sample_df.mean(axis=0).values)
    Y_values.append([row[name] for name in TARGET_COLUMNS])

X = np.array(X_values)
y = np.array(Y_values)

# Every file must yield the same number of columns, otherwise X is not a
# rectangular matrix and the scaler/classifier below cannot use it.
assert X.ndim == 2 and X.shape[0] == y.shape[0], (
    f"expected a 2-D feature matrix with one row per subject, got X{X.shape}, y{y.shape}"
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

classifier = MultiOutputClassifier(
    LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
)
classifier.fit(X_train_scaled, y_train)

# Predict once for all targets; column i holds the predictions of
# classifier.estimators_[i].
y_pred = classifier.predict(X_test_scaled)

for i, target_name in enumerate(TARGET_COLUMNS):
    # Same quantity as estimators_[i].score(...): fraction of exact matches.
    accuracy = np.mean(y_pred[:, i] == y_test[:, i])
    print(f"Accuracy for {target_name}: {accuracy:.3f}")

# Calculate F1 score
for i, target_name in enumerate(TARGET_COLUMNS):
    print(y_pred[:, i])
    print(y_test[:, i])
    # zero_division=0 yields the same score as the default ("warn" acts as 0)
    # but without an UndefinedMetricWarning when a true label is never predicted.
    f1 = f1_score(y_test[:, i], y_pred[:, i], average='weighted', zero_division=0)
    print(f"F1 score for {target_name}: {f1:.3f}")




Accuracy for H1: 0.000
Accuracy for H3: 0.000
Accuracy for B: 0.500
[1024   64  256  256]
[128 256  64 128]
F1 score for H1: 0.000
[256   8  64  64]
[128  64  32 128]
F1 score for H3: 0.000
[32 64 32 32]
[ 32  64  16 128]
F1 score for B: 0.375
