# Data Parsing and Collection

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated HAI result manifest and keep only the columns used below.
HAI = pd.read_csv(
    'SDY404-DR54_Subject_2_HAI_result.txt',
    sep="\t",
)[[
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'File Detail',
    'Study Time Collected',
    'File Name',
]]
HAI.head()

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,Study Time Collected,File Name
0,SUB120417,"Older adult, not-frail",PV3222,HAI result,0,ALL_HAI_year2.388540.txt
1,SUB120417,"Older adult, not-frail",PV3225,HAI result,28,ALL_HAI_year2.388540.txt
2,SUB120418,"Older adult, frail",PV3222,HAI result,0,ALL_HAI_year2.388515.txt
3,SUB120418,"Older adult, frail",PV3225,HAI result,28,ALL_HAI_year2.388515.txt
4,SUB120419,"Older adult, pre-frail",PV3222,HAI result,0,ALL_HAI_year2.388540.txt


#### Desired parameters:
**Subject Phenotype:** Younger adult \
**Planned Visit Accession:** PV3225 (later time point)

In [3]:
# Keep only younger adults at the PV3225 visit (the later time point).
HAI = HAI.loc[
    (HAI['Subject Phenotype'] == 'Younger adult')
    & (HAI['Planned Visit Accession'] == 'PV3225')
]
HAI.head()

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,Study Time Collected,File Name
7,SUB120420,Younger adult,PV3225,HAI result,28,ALL_HAI_year2.388515.txt
13,SUB120423,Younger adult,PV3225,HAI result,28,ALL_HAI_year2.388540.txt
53,SUB120444,Younger adult,PV3225,HAI result,28,ALL_HAI_year2.388540.txt
55,SUB120445,Younger adult,PV3225,HAI result,28,ALL_HAI_year2.388515.txt
57,SUB120446,Younger adult,PV3225,HAI result,28,ALL_HAI_year2.388515.txt


In [4]:
# Show the distinct HAI result file names referenced by the selected rows.
display(HAI['File Name'].unique())

array(['ALL_HAI_year2.388515.txt', 'ALL_HAI_year2.388540.txt'],
      dtype=object)

### Flow Data Processing

In [5]:
# Read the tab-separated flow cytometry workspace manifest and keep the columns used below.
WSP = pd.read_csv(
    'SDY404-DR54_Subject_2_Flow_cytometry_workspace.txt',
    sep="\t",
)[[
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'File Detail',
    'File Name',
]]
WSP.head()

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
0,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_L1_Run_1.804538.wsp
1,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_P2_Run_1-regated.804546.wsp
2,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_P13_Run_1-regated.804545.wsp
3,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_P9-P10_Run_1.804544.wsp
4,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel P1.804543.wsp


In [6]:
# Read the tab-separated flow cytometry result manifest and keep the columns used below.
FCS = pd.read_csv(
    'SDY404-DR54_Subject_2_Flow_cytometry_result.txt',
    sep="\t",
)[[
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'File Detail',
    'File Name',
]]
FCS.head()

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
0,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_P13_A10_P13_110191_PBMC_10112011_A10.578...
1,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_P9_A1_P9_110191_PBMC_10112011_A01.578824...
2,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_P2_A7_P2_110191_PBMC_10112011_A07.579100...
3,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_L2_A4_L2_110191_PBMC_10112011_A04.579270...
4,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_L1_A1_L1_110191_PBMC_10112011_A01.579376...


#### Desired parameters:
**Subject Phenotype:** Younger adult \
**Planned Visit Accession:** PV3222 (pre-vaccination time point)

In [7]:
# Keep only workspace rows for younger adults at the PV3222 (pre-vaccination) visit.
WSP = WSP.loc[
    (WSP['Subject Phenotype'] == 'Younger adult')
    & (WSP['Planned Visit Accession'] == 'PV3222')
]
display(WSP.head())

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
111,SUB120420,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_2.804556.wsp
112,SUB120420,Younger adult,PV3222,Flow cytometry workspace,Panel P2 Run 2.804564.wsp
113,SUB120420,Younger adult,PV3222,Flow cytometry workspace,Panel_P13_Run_2-regated.804563.wsp
114,SUB120420,Younger adult,PV3222,Flow cytometry workspace,Panel_P9-P10_Run_2-regated.804562.wsp
115,SUB120420,Younger adult,PV3222,Flow cytometry workspace,Panel P1 Run 2.804561.wsp


In [8]:
# Keep only FCS result rows for younger adults at the PV3222 (pre-vaccination) visit.
FCS = FCS.loc[
    (FCS['Subject Phenotype'] == 'Younger adult')
    & (FCS['Planned Visit Accession'] == 'PV3222')
]
display(FCS.head())
# Row/column count of the filtered frame is the cell's output.
FCS.shape

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
120,SUB120420,Younger adult,PV3222,Flow cytometry result,Panel_L2_A4_L2_110194_PBMC_10112011_A04.578338...
121,SUB120420,Younger adult,PV3222,Flow cytometry result,Panel_L4_B_Cells_A1_L4_110194_PBMC_10112011_A0...
122,SUB120420,Younger adult,PV3222,Flow cytometry result,Panel_L3_A7_L3_110194_PBMC_10112011_A07.578627...
123,SUB120420,Younger adult,PV3222,Flow cytometry result,Panel_P9_A1_P9_110194_PBMC_10112011_A01.578763...
124,SUB120420,Younger adult,PV3222,Flow cytometry result,Panel_P13_A10_P13_110194_PBMC_10112011_A10.578...


(210, 5)

#### For Downloading Purposes

In [9]:
# Workspace (.wsp) files: sorted array of the distinct file names in the filtered manifest.
wsps = np.sort(WSP['File Name'].drop_duplicates().to_numpy())
display(wsps)

array(['Panel L2 Run 7.804602.wsp', 'Panel L2 Run 8.804611.wsp',
       'Panel P1 Run 2.804561.wsp', 'Panel P1 Run 4b.804579.wsp',
       'Panel P1 Run 5b.804588.wsp', 'Panel P1 Run 6.804597.wsp',
       'Panel P1 Run 7.804606.wsp', 'Panel P1 Run 8.804615.wsp',
       'Panel P1.804543.wsp', 'Panel P2 Run 2.804564.wsp',
       'Panel P9-P10 Run 6.804598.wsp', 'Panel_L1_Run_1.804538.wsp',
       'Panel_L1_Run_2.804556.wsp', 'Panel_L1_Run_4.804574.wsp',
       'Panel_L1_Run_5.804583.wsp', 'Panel_L1_Run_6.804592.wsp',
       'Panel_L1_Run_7-regated.804601.wsp',
       'Panel_L1_Run_8-regated.804610.wsp',
       'Panel_L2_Run_1-regated.804539.wsp',
       'Panel_L2_Run_2-regated.804557.wsp',
       'Panel_L2_Run_4-regated.804575.wsp',
       'Panel_L2_Run_5-regated.804584.wsp',
       'Panel_L2_Run_6-regated.804593.wsp', 'Panel_L3_Run_1.804540.wsp',
       'Panel_L3_Run_2.804558.wsp', 'Panel_L3_Run_4.804576.wsp',
       'Panel_L3_Run_5.804585.wsp', 'Panel_L3_Run_6.804594.wsp',
       'Panel

In [10]:
# FCS files: sorted array of the distinct file names in the filtered manifest.
fcss = np.sort(FCS['File Name'].drop_duplicates().to_numpy())
display(fcss)

array(['Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227.fcs',
       'Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604.fcs',
       'Panel_L1_A1_L1_110257_PBMC_10172011_A01.580299.fcs',
       'Panel_L1_A1_L1_110267_PBMC_10172011_A01.579602.fcs',
       'Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447.fcs',
       'Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307.fcs',
       'Panel_L1_A2_L1_110260_PBMC_10172011_A02.578398.fcs',
       'Panel_L1_A2_L1_110271_PBMC_10172011_A02.579258.fcs',
       'Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999.fcs',
       'Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670.fcs',
       'Panel_L1_A3_L1_110265_PBMC_10172011_A03.578294.fcs',
       'Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572.fcs',
       'Panel_L1_E1_L1_110258_PBMC_10172011_E01.579967.fcs',
       'Panel_L1_E1_L1_110268_PBMC_10172011_E01.578748.fcs',
       'Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138.fcs',
       'Panel_L1_E2_L1_110272_PBMC_10172011_E02.578501.fcs',
       'Panel_L1_E3_L1_1

## TODO: File Collection

In [11]:
import os
import bokeh
from bokeh.plotting import show
import pandas as pd

import flowkit as fk

# Configure Bokeh to render its output inline in the notebook.
bokeh.io.output_notebook()

In [12]:
#Finding the FCS files you want
def findFCS(wsp):
    """Return the wanted FCS files that belong to one workspace and mark them as claimed.

    Parameters
    ----------
    wsp
        Workspace file handed straight to ``fk.Workspace``.

    Returns
    -------
    list
        Sample IDs reported by the workspace that are still present in the
        module-level ``fcss`` collection, in the order the workspace reports them.

    Side effects
    ------------
    Rebinds the module-level ``fcss`` to a copy without the returned names, so a
    later call for another workspace does not claim the same file again.
    """
    global fcss
    workspace = fk.Workspace(wsp, ignore_missing_files=True, find_fcs_files_from_wsp=True)

    # Names that have not been claimed by an earlier call.
    remaining = set(fcss)
    needFiles = []
    for f in workspace.get_sample_ids():
        if f in remaining:
            needFiles.append(f)
            remaining.discard(f)

    # `fcss` is built with np.sort, i.e. it is a NumPy array and has no `.remove`
    # method. Filter it with a boolean mask instead; this keeps dtype and sort order.
    pool = np.asarray(fcss)
    keep = np.array([name in remaining for name in pool], dtype=bool)
    fcss = pool[keep]
    return needFiles

In [13]:
#Download FCS Files
def getFCS(needFiles):
    """Placeholder for the FCS download step.

    Not implemented yet: ``needFiles`` is ignored and ``None`` is returned.
    """
    # TODO: download every file listed in `needFiles`.
    return None

In [14]:
#Analyzing and Parsing Gates
fcs_filepath = ...  # placeholder; must be set to the FCS sample location before use
def analyzeWSP(wsp, needFiles):
    """Run the workspace's gating analysis for each requested sample (work in progress).

    Parameters
    ----------
    wsp
        Workspace file handed to ``fk.Workspace``.
    needFiles
        Iterable of sample IDs to analyze, e.g. the list returned by ``findFCS``.

    Returns
    -------
    None
        Nothing is extracted or saved yet; see the TODO notes in the loop.
    """
    session = fk.Workspace(wsp, fcs_samples = fcs_filepath, ignore_missing_files=True)
    for sample_id in needFiles:
        # NOTE(review): `analyze_samples` may return None, with results fetched
        # separately (e.g. `get_gating_results`) -- confirm against the FlowKit docs.
        analysis = session.analyze_samples(sample_id = sample_id)
        # TODO: extract the data we need from the analysis.
        # TODO: agree with Greg on how the data should be structured -- extract the
        #       first three gates for "preprocessing"; keep in mind gate names will change.
        # TODO: save to the GitHub repository under the same name but with a .csv extension.

In [15]:
def deleteFiles(wsp, needFiles):
    """Placeholder for the cleanup step that frees disk space.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # TODO: delete the workspace and its downloaded FCS files to reclaim space.
    return None

## TODO: Add HAI Titer
Add the HAI titer (and the day, if possible -- check whether it's all day 28) to the FCS file dataframe. Change the FCS file names to .csv and add a preprocessing column if necessary.

In [16]:
# Load both HAI titer tables named in the manifest and check whether they differ.
HAI1 = pd.read_csv("ALL_HAI_year2.388515.txt", sep="\t")
HAI2 = pd.read_csv("ALL_HAI_year2.388540.txt", sep="\t")
display(HAI1.shape, HAI2.shape, HAI1.equals(HAI2))
# The two tables are identical, so which File Name a row points to does not matter.
HAI1.head(10)

(138, 6)

(138, 6)

True

Unnamed: 0,Subject,Day,H1,H3,B,Age Group
0,110191,0,64,8,32,Old
1,110191,28,128,16,32,Old
2,110192,0,32,16,64,Old
3,110192,28,32,16,64,Old
4,110193,0,8,8,16,Old
5,110193,28,16,8,16,Old
6,110194,0,32,16,16,Young
7,110194,28,64,16,16,Young
8,110195,0,16,16,8,Old
9,110195,28,64,32,32,Old


In [17]:
# Re-read the full HAI manifest (all phenotypes and visits), keeping the subject/visit columns.
HAI = pd.read_csv(
    "SDY404-DR54_Subject_2_HAI_result.txt",
    sep="\t",
)[[
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'Study Time Collected',
]]
display(HAI.head())
HAI.shape

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,Study Time Collected
0,SUB120417,"Older adult, not-frail",PV3222,0
1,SUB120417,"Older adult, not-frail",PV3225,28
2,SUB120418,"Older adult, frail",PV3222,0
3,SUB120418,"Older adult, frail",PV3225,28
4,SUB120419,"Older adult, pre-frail",PV3222,0


(138, 4)

In [18]:
def oldyoung(x):
    """Collapse a subject phenotype label into a two-level age group.

    Returns "Young" only for the exact label "Younger adult"; every other
    value maps to "Old".
    """
    return "Young" if x == "Younger adult" else "Old"

# Relabel phenotypes as Young/Old and rename the columns to match the titer table.
# Built with assign/rename rather than item assignment and an in-place rename: HAI
# comes from a column selection, so writing into it can raise pandas'
# SettingWithCopyWarning.
HAI = (
    HAI
    .assign(**{'Subject Phenotype': HAI['Subject Phenotype'].apply(oldyoung)})
    .rename(columns={'Subject Phenotype': 'Age Group', 'Study Time Collected': 'Day'})
)
HAI.head(10)

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day
0,SUB120417,Old,PV3222,0
1,SUB120417,Old,PV3225,28
2,SUB120418,Old,PV3222,0
3,SUB120418,Old,PV3225,28
4,SUB120419,Old,PV3222,0
5,SUB120419,Old,PV3225,28
6,SUB120420,Young,PV3222,0
7,SUB120420,Young,PV3225,28
8,SUB120421,Old,PV3222,0
9,SUB120421,Old,PV3225,28


In [19]:
# Element-wise comparison of the manifest with the titer table on the two shared columns.
df = HAI[['Age Group', 'Day']].eq(HAI1[['Age Group', 'Day']])
# Age Group always matches, so it can be dropped; Day does not always match.
display(df['Age Group'].unique(), df['Day'].unique())

array([ True])

array([ True, False])

In [20]:
# Manifest rows whose Day differs from the titer table. A boolean mask with .loc is
# used because .iloc is positional and only worked with index labels here thanks to
# the default RangeIndex.
HAI.loc[~df['Day']]

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day
61,SUB120449,Young,PV3225,24
83,SUB120461,Young,PV3225,36
101,SUB120470,Young,PV3225,24
105,SUB120472,Young,PV3225,32
115,SUB120477,Old,PV3225,35
117,SUB120478,Old,PV3225,35


In [21]:
# Titer-table rows whose Day differs from the manifest. A boolean mask with .loc is
# used because .iloc is positional and only worked with index labels here thanks to
# the default RangeIndex.
HAI1.loc[~df['Day']]

Unnamed: 0,Subject,Day,H1,H3,B,Age Group
61,110247,28,1024,128,64,Young
83,110259,28,128,128,64,Young
101,110268,28,512,32,64,Young
105,110270,28,128,16,128,Young
115,110275,28,32,8,128,Old
117,110276,28,256,16,32,Old


In [22]:
# The manifest's collection day differs slightly from the titer table for a few
# subjects, but it is still the later (post-vaccination) time point, so the two
# sources are treated as describing the same rows. Day is taken from the HAI titer
# document, which holds the actual HAI data.
#
# The join is index-based, so both tables must share the same index.
assert HAI.index.equals(HAI1.index), "HAI manifest and titer table are not row-aligned"
# Drop the duplicated columns on temporary copies, not in place: HAI and HAI1 stay
# intact, so this cell and the comparison cells above can be re-run without a KeyError.
HAIall = HAI.drop(columns='Day').join(HAI1.drop(columns=['Subject', 'Age Group']))
HAIall

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day,H1,H3,B
0,SUB120417,Old,PV3222,0,64,8,32
1,SUB120417,Old,PV3225,28,128,16,32
2,SUB120418,Old,PV3222,0,32,16,64
3,SUB120418,Old,PV3225,28,32,16,64
4,SUB120419,Old,PV3222,0,8,8,16
...,...,...,...,...,...,...,...
133,SUB120486,Old,PV3225,28,32,16,16
134,SUB120487,Old,PV3222,0,8,8,64
135,SUB120487,Old,PV3225,28,8,8,64
136,SUB120488,Old,PV3222,0,64,8,16


In [23]:
# Keep only young subjects at the day-28 titer measurement.
HAIall = HAIall.loc[(HAIall['Age Group'] == 'Young') & (HAIall['Day'] == 28)]
display(HAIall.head())
HAIall.shape

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day,H1,H3,B
7,SUB120420,Young,PV3225,28,64,16,16
13,SUB120423,Young,PV3225,28,256,16,16
53,SUB120444,Young,PV3225,28,512,16,32
55,SUB120445,Young,PV3225,28,64,8,32
57,SUB120446,Young,PV3225,28,1024,256,64


(32, 7)

In [24]:
# Keep the subject key plus the three titer columns (the Y values: H1, H3, B) and
# renumber the rows from 0. Selecting the wanted columns, instead of dropping the
# others in place, avoids mutating a filtered slice and lets the cell be re-run
# without a KeyError.
HAIall = HAIall[['Subject Accession', 'H1', 'H3', 'B']].reset_index(drop=True)
HAIall.head()

Unnamed: 0,Subject Accession,H1,H3,B
0,SUB120420,64,16,16
1,SUB120423,256,16,16
2,SUB120444,512,16,32
3,SUB120445,64,8,32
4,SUB120446,1024,256,64
