In [37]:
%matplotlib inline
%run utils.py
import os.path as osp
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 1000)

In [174]:
# Collect the distinct PubMed ids whose abstract has a known (non-null) cell type key
valid_ids = (
    pd.read_csv(osp.join(DATA_DIR, 'pubmed_abstract_tcell_types.csv'))
    .dropna(subset=['type_key'])
    ['id']
    .unique()
)
len(valid_ids)

6171

In [175]:
# Load the protein mentions and keep only those from papers that also have
# cell type data (this limits the number of proteins that need resolving)
df = (
    pd.read_csv(osp.join(DATA_DIR, 'pubmed_abstract_proteins.csv'))
    .pipe(lambda proteins: proteins[proteins['id'].isin(valid_ids)])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65161 entries, 1 to 453611
Data columns (total 5 columns):
id       65161 non-null int64
start    65161 non-null int64
end      65161 non-null int64
value    65161 non-null object
class    65161 non-null object
dtypes: int64(3), object(2)
memory usage: 3.0+ MB


### Split Protein Symbols

In [573]:
def is_sequence(text):
    """Heuristically flag strings that chain several protein markers together.

    Inputs this is meant to catch include:
        CD127(low+/-) CD25(+) forkhead box protein 3
        CD25(+)FoxP3(+)CD127(low)CD4(+)
        CD8(+)CD28(-)T
        Foxp3(+)CD25(hi)CD127(lo)CD39(hi)
        CD4⁺CD25⁺Fxop3⁺
        FoxP3(+)RoRγt(+)IL17(+)

    Returns True when at least two marker-name hints or at least two
    separator hints are found in ``text``, otherwise False.
    """
    # Prefixes that also occur inside ordinary words, so they only count
    # when written in upper case (as they are in protein names)
    cased_hits = 0
    for prefix in ('CD', 'CCR', 'CCL', 'CXCR', 'ICOS', 'IL', 'TCR'):
        cased_hits += text.count(prefix)

    # Highly distinctive substring, counted regardless of case
    uncased_hits = text.upper().count('FOXP')

    # Expression-level suffixes and sign characters that commonly separate markers
    separator_hits = 0
    for token in ('lo', 'hi', 'Lo', 'Hi', '+', '(-)', '⁺', '⁻'):
        separator_hits += text.count(token)

    if cased_hits + uncased_hits >= 2:
        return True
    return separator_hits >= 2

In [574]:
# Identify values that look like sequences of marker proteins and show the most frequent ones
df.loc[df['value'].apply(is_sequence), 'value'].value_counts().head(150)

CD4(+)CD25(+)                                       356
CD4+CD25                                            343
CD4(+)CD25(-)                                       100
CD4(+)CD25(high)                                     82
CD4(+)CD25(+)Foxp3(+)                                80
CD4+CD25-                                            78
CD4(+) CD25(+)                                       73
CD4+CD25+Foxp3                                       48
CD4(+)CD25(+)FoxP3(+)                                47
CD4(+)Foxp3(+)                                       46
CD4+CD25high                                         35
CD4+CD25(high)                                       30
CD4+CD25+FoxP3                                       29
CD4+CD25+FOXP3                                       27
CD4(+)CD25(+)FOXP3(+                                 27
CD4(+)CD25(hi)                                       25
CD8+CD28-                                            25
CD4(+)FOXP3(+                                   

In [575]:
#re.findall('[(CD)|(CCR)|(CXCR)]\w+?(?=CD|CCR|CXCR|$)', 'CCD4CXCR5CCR4')
#re.findall('(?:CD|CCR|CXCR)\w+?(?=CD|CCR|CXCR|$)', 'CCD4CXCR5CCR4')
#re.findall('(?:CD|CCR|CXCR|ICOS)\w+?(?=CD|CCR|CXCR|ICOS|$)', 'mmmCD4CXCR5CCR4CD62L'.upper())

In [576]:
def split_compound(text):
    """Split a separator-free run of marker names into individual markers.

    The text is upper-cased and scanned for tokens that start with one of the
    known marker prefixes and run (lazily) until the next known prefix or the
    end of the string. Anything before the first recognized prefix is dropped.

    Example:
        'mmmCD4CXCR5CCR4CD62L' -> ['CD4', 'CXCR5', 'CCR4', 'CD62L']

    Returns a list of upper-cased marker strings (empty if nothing matches).
    """
    # Single source for the prefix alternation used in both the token start
    # and the lookahead, so the two cannot drift apart
    prefixes = 'CD|CCR|CCL|CXCR|ICOS|FOXP|IL|TCR'
    # Raw string: '\w' inside a normal string literal is an invalid escape sequence
    pattern = r'(?:%s)\w+?(?=%s|$)' % (prefixes, prefixes)
    # Every match is a prefix plus at least one word character, so matches are
    # never empty and never contain whitespace; no further filtering is needed
    return re.findall(pattern, text.upper())

def split_sequence(text):
    """Break a marker sequence string into its individual marker names.

    Example:
        'CD45RO+CD62L-CD4-CXCR5' -> ['CD45RO', 'CD62L', 'CD4', 'CXCR5']

    Returns a list of marker strings; it may be empty.
    """
    # The order of alternatives matters. Hyphen splits come last and only with
    # a lookahead (or at the very end) because hyphens are highly ambiguous.
    separators = '\\(.*?\\)|\\+|\\(|\\)|:|,|/|⁺|⁻|High|high|Low|low|Hi|hi|Lo|lo|-(?=CD)|-(?=CCR)|-(?=CXCR)|-(?=CCL)|-(?=IL)|-(?=TCR)|-$'
    fragments = (fragment.strip() for fragment in re.split(separators, text))

    # Discard single characters and anything containing a space (typically a phrase)
    candidates = [fragment for fragment in fragments if len(fragment) > 1 and ' ' not in fragment]

    # With the separators gone, a fragment may still be several markers glued
    # together (e.g. "CD4CXCR3CCR6"); split those by naming convention
    markers = []
    for fragment in candidates:
        if is_sequence(fragment):
            markers.extend(split_compound(fragment))
        else:
            markers.append(fragment)
    return markers

In [577]:
# Scratch inputs for spot-checking split_sequence; uncomment a different line to try another case
# split_sequence('CD1highCD2 + CD1lowCD99(-)CD19')
# split_sequence('CD4(+)CD25(+)CCR4(+)')
# split_sequence('CD62LhighCD25')
# split_sequence('CD4+CD25+highFoxp3+CD62L+')
# split_sequence('IFN-γ(+) IL-4(-)')
# split_sequence('CD19+CD24+CD38+TGF-β1')
# split_sequence('Tissue inhibitor of metalloproteinase 1')
# split_sequence('CD45RA(-)CD45RO(+)CD95(hi)CD62L(lo) ')
# split_sequence('FALC Lin(-)c-Kit(+)Sca-1(+)')
# split_sequence('CD45RO+CD62Ll(ow)CCR7(low)CD40L(high)ICOS(high)')
split_sequence('CD45RO+CD62L-CD4-CXCR5')

['CD45RO', 'CD62L', 'CD4', 'CXCR5']

In [578]:
def melt_sequences(df):
    """Expand rows whose 'value' is a marker sequence into one row per marker.

    Rows that is_sequence does not flag are passed through untouched. A flagged
    row is replaced by one copy per marker returned by split_sequence, with only
    'value' changed; if the split yields nothing the row is dropped entirely.

    Returns a new DataFrame built from the resulting rows.
    """
    rows = []
    for _, row in df.iterrows():
        text = row['value']
        if is_sequence(text):
            # The split may come back empty, in which case no row is emitted
            for marker in split_sequence(text):
                # Keep every other field as is and swap in the single marker
                expanded = row.copy()
                expanded['value'] = marker
                rows.append(expanded)
        else:
            rows.append(row)
    return pd.DataFrame(rows)
# Expand compound marker strings into one row per marker; the row count changes
# because each sequence row is replaced by zero or more single-marker rows
df_mlt = melt_sequences(df)
df_mlt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69949 entries, 1 to 453611
Data columns (total 5 columns):
id       69949 non-null int64
start    69949 non-null int64
end      69949 non-null int64
value    69949 non-null object
class    69949 non-null object
dtypes: int64(3), object(2)
memory usage: 3.2+ MB


In [579]:
# Ideally, there should be no "sequences" left, but is_sequence intentionally
# favors recall over precision while split_sequence does the opposite, so some
# remainder may show up here; it should be validated manually as unimportant
# (and ideally small or empty)
df_mlt['value'][df_mlt['value'].apply(is_sequence)]

Series([], Name: value, dtype: object)

In [580]:
#df_mlt['value'].value_counts().head(250)

### Normalize Protein Symbols

In [581]:
import re
# Ordered (substring, replacement) pairs that prep_protein applies with str.replace
# to the upper-cased symbol AFTER parentheses, signs, punctuation and whitespace have
# been removed. Keys must therefore not contain any of the stripped characters, and
# order matters (e.g. 'GAMMA' is rewritten before 'IFNG', 'MHCCLASSII' before 'MHCCLASSI').
pr_m_sub = [
    ('INTERFERON', 'IFN'),
    ('INTERLEUKIN', 'IL'),
    ('TRANSFORMINGGROWTHFACTOR', 'TGF'),
    ('TUMORNECROSISFACTOR', 'TNF'),
    ('TUMOURNECROSISFACTOR', 'TNF'),
    ('TCELLRECEPTOR', 'TCR'),
    ('FORKHEADBOXPROTEIN', 'FOXP'),
    ('FORKHEADBOXP', 'FOXP'),
    ('TOLLLIKERECEPTOR', 'TLR'),
    ('HUMANLEUKOCYTEANTIGEN', 'HLA'),
    ('MAJORHISTOCOMPATIBILITYCOMPLEX', 'MHC'),
    ('SIGNALTRANSDUCERANDACTIVATOROFTRANSCRIPTION', 'STAT'),
    # Commas and hyphens are stripped before these substitutions run, so the key is
    # spelled without the comma of "2,3"; with the comma it could never match
    ('INDOLEAMINE23DIOXYGENASE', 'IDO'),
    ('MHCCLASSII', 'MHCII'),
    ('HLACLASSII', 'MHCII'),
    ('MHCCLASSI', 'MHCI'),
    ('HLACLASSI', 'MHCI'),
    ('GRANZYMEB', 'GZMB'),
    ('GRANZB', 'GZMB'),

    # Greek letters: spelled-out names plus the capital forms that str.upper()
    # produces from lower-case Greek input. The capitals are written as escapes
    # because several of them are visually identical to Latin letters.
    ('ALPHA', 'α'), ('\u0391', 'α'),
    ('BETA', 'β'), ('\u0392', 'β'),
    ('GAMMA', 'γ'), ('\u0393', 'γ'),
    ('DELTA', 'δ'), ('\u0394', 'δ'),
    ('KAPPA', 'κ'), ('\u039a', 'κ'),
    ('ZETA', 'ζ'), ('\u0396', 'ζ'),

    # Reordered-synonyms
    ('IFNG', 'IFNγ'), ('γIFN', 'IFNγ'),

    # Final eliminations (plural before singular so no stray 'S' is left behind)
    ('RECEPTORS', ''), ('RECEPTOR', ''),
    ('LIGANDS', ''), ('LIGAND', ''),
    ('ANTIGENS', ''), ('ANTIGEN', ''),
    ('BRIGHT', ''), ('FAMILY', ''),
    ('CYTOKINES', ''), ('CYTOKINE', ''),
    ('CHEMOKINES', ''), ('CHEMOKINE', ''),
    ('LYMPHOKINES', ''), ('LYMPHOKINE', ''),
]

# Exact-match overrides applied to the fully substituted symbol
pr_m_lkp = dict([
    ('IGG', 'IgG'), ('IGA', 'IgA'), ('IGE', 'IgE'), ('IGD', 'IgD'), ('IGM', 'IgM'),
    ('PDL1', 'PD1'), ('SCD25', 'CD25'), ('CD8α', 'CD8'), ('CD8αα', 'CD8'),
    ('ILT2', 'CD85J'),
    ('LSELECTIN', 'CD62L'),
    ('INTEGRINα4β7', 'INTEGRINβ7'), ('INTEGRINαEβ7', 'INTEGRINβ7'),
    ('CCL4', 'MIP1β'),
    ('HVα7.2', 'Vα7.2'), ('IVα7.2', 'Vα7.2'), ('TCRVα7.2', 'Vα7.2'), ('Vα7.2Jα33', 'Vα7.2')
])

# Normalized strings that are discarded outright (prep_protein returns None for them)
pr_m_bl = [
    'TRANSCRIPTIONFACTOR', 'TRANSCRIPTIONFACTORS',
    'TREG', 'TREGS', 'TH', 'FOXP3TREG', 'ITREG',
    'IMMUNOGLOBULIN', 'IMMUNOGLOBULINS', 'IMMUNOGLOBULINE',
    'ASTHMA', 'INSULIN', 'ADHESIONMOLECULES',
    'GVHD'
]


def prep_protein(pr):
    """Normalize a raw protein mention into a canonical symbol.

    Steps, in order: trim; drop a trailing expression-level suffix
    (High/Low/Hi/Lo in the listed casings); upper-case; remove parenthesized
    groups, sign/punctuation characters and all whitespace; apply the ordered
    substitutions in ``pr_m_sub``; apply the exact-match overrides in
    ``pr_m_lkp``.

    Args:
        pr: raw mention text.

    Returns:
        The normalized symbol, or None when ``pr`` is not a string, when the
        result has at most one character, or when it is listed in ``pr_m_bl``.
    """
    # Missing values (None / NaN) carry no symbol to normalize
    if not isinstance(pr, str):
        return None

    pr = pr.strip()
    # The suffix match is case-sensitive, so it has to happen before upper-casing
    pr = re.sub('High$|high$|Low$|low$|Hi$|hi$|Lo$|lo$', '', pr)
    pr = pr.upper().strip()

    # Parenthesized groups first, then any leftover unbalanced parens and signs
    pr = re.sub(r'\(.*?\)', '', pr)
    pr = re.sub(r'[\+\-\(\)⁺⁻;:,]', '', pr)
    pr = re.sub(r'\s+', '', pr)
    for old, new in pr_m_sub:
        pr = pr.replace(old, new)
    pr = pr.strip()

    if pr in pr_m_lkp:
        pr = pr_m_lkp[pr].strip()

    if len(pr) <= 1 or pr in pr_m_bl:
        return None
    return pr

In [582]:
# Smoke test combining parenthesized signs, superscript signs, stray parens,
# whitespace, a spelled-out name and a trailing expression-level suffix
prep_protein('CD4(+)CD25(+)FOXP3(+) ⁺⁻ INTERFERONG )( sd CD103Hi ')

'CD4CD25FOXP3IFNγSDCD103'

In [583]:
# Normalize every raw value; prep_protein returns None for values it filters out,
# so value_norm is null on those rows
df_mlt['value_norm'] = df_mlt['value'].apply(prep_protein)
# Most frequent normalized symbols
df_mlt['value_norm'].value_counts().head(250)

CD4                         7615
FOXP3                       4761
CD25                        3521
CD8                         1847
IFNγ                        1604
IL10                        1531
IL2                         1284
IL17                        1260
IL4                         1078
TGFβ                         897
TCR                          727
CD3                          604
IL6                          561
IL12                         508
IL                           479
HLA                          449
CTLA4                        445
IL21                         404
TNFα                         403
CD127                        392
CD28                         364
IL17A                        353
PD1                          352
IL22                         343
IFN                          306
CXCR5                        291
IgE                          279
IL5                          279
IL23                         277
TGFβ1                        259
CCR4      

In [584]:
# The non-null count of value_norm shows how many rows survived normalization
df_mlt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69949 entries, 1 to 453611
Data columns (total 6 columns):
id            69949 non-null int64
start         69949 non-null int64
end           69949 non-null int64
value         69949 non-null object
class         69949 non-null object
value_norm    61427 non-null object
dtypes: int64(3), object(3)
memory usage: 3.7+ MB


### Resolve HGNC Lookups

In [585]:
# Load the HGNC mapping (columns: approved_symbol, key, value)
df_hgnc = pd.read_csv(osp.join(DATA_DIR, 'hgnc_map.csv'))
# Keys must be unique because the table is used as a key -> value lookup below.
# The check has to target df_hgnc itself; no `m_hgnc` is defined in this notebook.
assert df_hgnc['key'].is_unique, 'Duplicate keys found in hgnc_map.csv'
df_hgnc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2917 entries, 0 to 2916
Data columns (total 3 columns):
approved_symbol    2917 non-null object
key                2917 non-null object
value              2917 non-null object
dtypes: object(3)
memory usage: 68.4+ KB


In [586]:
# Label: the HGNC 'value' for the normalized symbol, falling back to the
# normalized symbol itself wherever HGNC has no entry for it
df_mlt['value_lbl'] = (
    df_mlt['value_norm']
    .map(df_hgnc.set_index('key')['value'])
    .pipe(lambda label: label.where(label.notnull(), df_mlt['value_norm']))
)
# Approved symbol: left null when the normalized symbol is unknown to HGNC
df_mlt['value_sym'] = df_mlt['value_norm'].map(df_hgnc.set_index('key')['approved_symbol'])
# After the fallback, the label must be present exactly where the normalized value is
assert df_mlt['value_lbl'].notnull().equals(df_mlt['value_norm'].notnull())
df_mlt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69949 entries, 1 to 453611
Data columns (total 8 columns):
id            69949 non-null int64
start         69949 non-null int64
end           69949 non-null int64
value         69949 non-null object
class         69949 non-null object
value_norm    61427 non-null object
value_lbl     61427 non-null object
value_sym     17196 non-null object
dtypes: int64(3), object(5)
memory usage: 4.8+ MB


### Add FlowRepository Annotations

In [615]:
# Labels (as they appear in value_lbl) that get flagged as "common" markers.
# NOTE(review): presumably the markers seen in FlowRepository panels; confirm provenance.
# Trailing comments give the more familiar name for CD-numbered entries.
m_flow_rep = [
    'CD3', 'CD4', 'CD8', 'CD45', 'CD45RA', 'CD45RO',
    'CD127', 'CD14', 'CD16', 'CD161', 'CD19', 'CD25', 'CD27', 'CD28',
    'CCR10', 
    'CD192', # CCR2
    'CD194', # CCR4
    'CD195', # CCR5
    'CD196', # CCR6
    'CD197', # CCR7
    'CDW199', # CCR9
    'CD278', # ICOS
    'CD103', 
    'CD107A', 
    'CD29', 'CD31', 'CD38', 'CD49A', 'CD49D',
    'CD56', 'CD57', 'CD62L', 'CD69', 'CD85', 'CD94', 'CD95',
    'HLADR', 'GMCSF', 'CX3CR1', 'IFNγ', 
    'IL10', 'IL17A', 'IL17F', 'IL2', 'IL22', 'IL4', 'IL5', 'IL8', 'IL9',
    'INTEGRINβ7', 'MIP1β', 'TCRγδ', 'TNFα', 'Vδ1', 'Vδ2', 'Vα7.2',
    'CD162', # Aliases: SELPLG, CD162, CLA, PSGL-1, PSGL1, selectin P ligand
    'CD279', # PD1
    'CD154', # CD40L
    'CD152', # CTLA4
    'CD183', # CXCR3
    'CD184', # CXCR4
    'CD185', # CXCR5
    'CD186', # CXCR6
    'GRZB', # Granzyme B
]
len(m_flow_rep)

65

In [614]:
# Inspect how each raw value whose normalized form starts with 'ICOS' was resolved.
# Nulls are replaced with 'NA' first because groupby drops rows with null keys by default.
mask = df_mlt['value_norm'].fillna('').str.startswith('ICOS')
#mask = df_mlt['value_norm'] == 'Vα7.2'
df_mlt[mask].fillna('NA').groupby(['value', 'value_norm', 'value_lbl', 'value_sym'])\
    .size().rename('count').reset_index()

Unnamed: 0,value,value_norm,value_lbl,value_sym,count
0,-ICOS,ICOS,CD278,ICOS,2
1,-ICOSL,ICOSL,CD275,ICOSLG,1
2,ICOS,ICOS,CD278,ICOS,131
3,ICOS ligand,ICOS,CD278,ICOS,4
4,ICOS(+),ICOS,CD278,ICOS,1
5,ICOS+,ICOS,CD278,ICOS,1
6,ICOS-,ICOS,CD278,ICOS,1
7,ICOS-L,ICOSL,CD275,ICOSLG,5
8,ICOS-ligand,ICOS,CD278,ICOS,2
9,ICOS1,ICOS1,ICOS1,,1


In [616]:
# Compare the occurrences of one marker before and after the HGNC lookup.
# `pr` is (substring searched in value_norm, substring searched in value_lbl);
# uncomment a different pair to check another marker.
print('Original')
#pr = ('PD1', 'CD279')
#pr = ('ICOS', 'CD278')
#pr = ('CD40L', 'CD154')
#pr = ('CTLA4', 'CD152')
#pr = ('CCR4', 'CD194')
#pr = ('CCR9', 'CDW199')
pr = ('ICOS', 'CD278')
#pr = ('CXCR3', 'CD183')
#pr = ('CXCR5', 'CD185')
#pr = ('CXCR6', 'CD186')
#pr = ('CCR7',)*2

c = 'value_norm'
# str.contains treats the pattern as a regular expression by default;
# null values are counted as non-matches via fillna(False)
print(df_mlt[df_mlt[c].str.contains(pr[0]).fillna(False)][c].value_counts())

print('After HGNC')
c = 'value_lbl'
print(df_mlt[df_mlt[c].str.contains(pr[1]).fillna(False)][c].value_counts())

Original
ICOS       143
ICOSL       33
ICOSPD1      2
ICOSLG       1
T1ICOST      1
ICOS1        1
RICOS        1
Name: value_norm, dtype: int64
After HGNC
CD278    143
Name: value_lbl, dtype: int64


In [617]:
# Flag rows whose resolved label is one of the common markers
df_mlt['is_common'] = df_mlt['value_lbl'].isin(m_flow_rep)
# Ensure that all common markers were identified at least once, and report
# which ones are absent so a failure is actionable
missing_common = sorted(set(m_flow_rep) - set(df_mlt.loc[df_mlt['is_common'], 'value_lbl']))
assert not missing_common, 'Common markers never found in value_lbl: {}'.format(missing_common)

### Export

In [619]:
# Column overview of the fully annotated frame before export
df_mlt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69949 entries, 1 to 453611
Data columns (total 9 columns):
id            69949 non-null int64
start         69949 non-null int64
end           69949 non-null int64
value         69949 non-null object
class         69949 non-null object
value_norm    61427 non-null object
value_lbl     61427 non-null object
value_sym     17196 non-null object
is_common     69949 non-null bool
dtypes: bool(1), int64(3), object(5)
memory usage: 4.9+ MB


In [621]:
# Export only the rows that ended up with a resolved label
df_exp = df_mlt.dropna(subset=['value_lbl'])
df_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61427 entries, 1 to 453611
Data columns (total 9 columns):
id            61427 non-null int64
start         61427 non-null int64
end           61427 non-null int64
value         61427 non-null object
class         61427 non-null object
value_norm    61427 non-null object
value_lbl     61427 non-null object
value_sym     17196 non-null object
is_common     61427 non-null bool
dtypes: bool(1), int64(3), object(5)
memory usage: 4.3+ MB


In [622]:
# Write the resolved protein table next to the input data (index omitted) and echo the path
path = osp.join(DATA_DIR, 'pubmed_abstract_proteins_resolved.csv')
df_exp.to_csv(path, index=False)
path

'/Users/eczech/tmp/nlp/data/pubmed_abstract_proteins_resolved.csv'