In [44]:
""" creates a by-sample breakdown of AA-level mutations for three genes of interest:
    EGFR, BRAF and KRAS. The goal here is to generate data for the top panel of figure 2 """

import re
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 999)

In [45]:
def char_strip(df):
    """ strip the weird list/dict punctuation out of the `mutations` column.

        Every string entry of `mutations` is split on commas and each piece
        has its runs of non-word characters removed, so the cell ends up
        holding a plain list of mutation names. Pieces that are empty after
        stripping (e.g. from the string "[]") are dropped. Entries that are
        not strings (missing values, or lists from a previous call) are
        passed through untouched, so the function can be re-run safely.

        Parameters
        ----------
        df : pd.DataFrame
            must contain a `mutations` column.

        Returns
        -------
        pd.DataFrame
            a new frame (the input is NOT modified) in which `mutations`
            holds lists and every remaining missing value, in any column,
            has been replaced by 0.
    """
    def _clean(entry):
        # only raw strings need parsing; NaN is handled by fillna() below
        if not isinstance(entry, str):
            return entry
        pieces = (re.sub(r'\W+', '', piece) for piece in entry.split(','))
        return [piece for piece in pieces if piece]

    # work on a copy and assign the whole column at once: the original
    # per-cell `df.mutations[idx] = ...` was a chained assignment
    cleaned = df.copy()
    cleaned['mutations'] = cleaned['mutations'].map(_clean)

    return cleaned.fillna(0)

In [46]:
def dict_to_csv(d):
    """ convert a dict of lists to a DataFrame, even if the dict values have
        an uneven number of elements.

        Each key becomes a column. Shorter lists are padded at the bottom
        with NaN up to the length of the longest list, so the number of rows
        is derived from the data instead of a hard-coded 27/28. The input
        dict and its lists are not modified.

        Parameters
        ----------
        d : dict
            maps column name -> sequence of values.

        Returns
        -------
        pd.DataFrame
            one column per key, in the dict's iteration order.
    """
    longest = max((len(vals) for vals in d.values()), default=0)

    # pad with plain Python lists so the NaN padding stays a real float NaN
    padded = {
        key: list(vals) + [np.nan] * (longest - len(vals))
        for key, vals in d.items()
    }

    return pd.DataFrame.from_dict(padded)

In [47]:
def build_dict(df, gene, d, meta_df=None):
    """ builds a by-sample dict for all the mutations found to a given gene.

        For every row of `df` that carries a list of mutations, the row's
        cell is mapped to its sample via the metadata table and each mutation
        is recorded as '<gene> <mutation>' under that sample. A mutation is
        stored at most once per sample, in first-seen order.

        Parameters
        ----------
        df : pd.DataFrame
            needs `cell` and `mutations` columns; `mutations` holds either a
            list/tuple of mutation names or a non-list placeholder (e.g. 0)
            for "no mutations", which is skipped.
        gene : str
            prefix prepended to every mutation name.
        d : dict
            sample -> list of labelled mutations; updated in place so it can
            be threaded through successive calls for different genes.
        meta_df : pd.DataFrame, optional
            metadata with `cell` and `sample_name` columns. Defaults to the
            notebook-level `meta` frame.

        Returns
        -------
        dict
            the same object as `d`.

        Raises
        ------
        IndexError
            if a cell that has mutations is absent from the metadata.
    """
    if meta_df is None:
        meta_df = meta  # fall back to the notebook-level metadata table

    # one pass over the metadata instead of a full-frame scan per row;
    # setdefault keeps the FIRST sample when a cell is listed more than once
    cell_to_sample = {}
    for meta_cell, sample_name in zip(meta_df['cell'], meta_df['sample_name']):
        cell_to_sample.setdefault(meta_cell, sample_name)

    for curr_cell, mutations in zip(df['cell'], df['mutations']):
        # rows without mutations need no metadata lookup at all
        if not isinstance(mutations, (list, tuple)):
            continue

        if curr_cell not in cell_to_sample:
            raise IndexError('cell %r not found in metadata' % (curr_cell,))
        sample = cell_to_sample[curr_cell]

        # same de-duplication whether or not the sample was seen before
        sample_muts = d.setdefault(sample, [])
        for mut in mutations:
            labelled = gene + ' ' + mut
            if labelled not in sample_muts:
                sample_muts.append(labelled)

    return d

In [48]:
# cell-level metadata; the unnamed first column holds the cell identifier
meta = (
    pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/metadata_all_cells_4.10.19.csv')
    .rename(columns={'Unnamed: 0':'cell'})
)

# per-gene AA mutation calls: headerless two-column files, cleaned on load
mut_cols = ['cell', 'mutations']
egfr_muts = char_strip(pd.read_csv('/Users/lincoln.harris/Desktop/egfr_all_AA.csv', names=mut_cols))
kras_muts = char_strip(pd.read_csv('/Users/lincoln.harris/Desktop/kras_all_AA.csv', names=mut_cols))
braf_muts = char_strip(pd.read_csv('/Users/lincoln.harris/Desktop/braf_all_AA.csv', names=mut_cols))

# accumulate all three genes into one sample -> mutations dict (EGFR, KRAS, BRAF order)
big_dict = {}
for gene_name, gene_df in (('EGFR', egfr_muts), ('KRAS', kras_muts), ('BRAF', braf_muts)):
    big_dict = build_dict(gene_df, gene_name, big_dict)

# one column per sample, written out for the figure
df = dict_to_csv(big_dict)
df.to_csv('top_panel_muts_by_sample_v1.csv', index=False)

In [49]:
# display the by-sample table (one column per sample) built by dict_to_csv
df

Unnamed: 0,LT_S11,LT_S21,LT_S75,LT_S66,LT_S69,LT_S57,LT_S08,LT_S71,LT_S50,LT_S63,...,LT_S53,LT_S56,LT_S51,LT_S58,LT_S52,LT_S42,LT_S80,LT_S34,LT_S78,LT_S49
0,EGFR R521K,EGFR R521K,EGFR N158N,EGFR T903T,EGFR Q787Q,EGFR T629T,EGFR T629T,EGFR K745_A750T,EGFR Q787Q,EGFR T903T,...,EGFR T903T,EGFR T903T,EGFR T629T,EGFR Q787Q,EGFR N158N,EGFR N158N,EGFR T903T,EGFR L858R,EGFR Q787Q,EGFR T903T
1,EGFR K745_A750T,EGFR Q787Q,EGFR Q787Q,EGFR N158N,EGFR T903T,EGFR R521K,EGFR G652G,EGFR T903T,EGFR R521K,EGFR N158N,...,EGFR Q787Q,EGFR N158N,EGFR R521K,EGFR T903T,EGFR Q787Q,,EGFR A237Y,EGFR D1014N,EGFR N158N,
2,EGFR T629T,EGFR T903T,EGFR T903T,EGFR R521K,EGFR T629T,EGFR K745_A750T,EGFR Q787Q,EGFR Q787Q,EGFR T903T,EGFR R1100S,...,EGFR D1014N,EGFR V300M,EGFR Q787Q,EGFR N158N,EGFR K745_A750T,,EGFR Q787Q,EGFR T903T,,
3,EGFR T903T,EGFR T629T,EGFR L387M,EGFR T629T,EGFR I1093M,EGFR Q787Q,EGFR R521K,EGFR L1034I,EGFR T629T,EGFR S442I,...,EGFR F856L,BRAF G643G,EGFR T903T,KRAS Q61H,EGFR T903T,,,EGFR T629T,,
4,EGFR L833V,EGFR L858R,KRAS Q61H,EGFR Q787Q,EGFR S921R,EGFR N158N,BRAF G643G,EGFR L1167V,EGFR I569I,EGFR L1167V,...,KRAS L19F,,EGFR N158N,,EGFR G42D,,,EGFR Q787Q,,
5,KRAS G13V,EGFR A237V,,EGFR V1142V,EGFR L1034I,EGFR T903T,,EGFR A21A,KRAS G12F,EGFR R521K,...,KRAS G13V,,,,KRAS Q61H,,,,,
6,BRAF W450L,EGFR A237Y,,EGFR G42D,EGFR K745_A750T,EGFR L1167V,,KRAS G13D,KRAS G12Y,EGFR V1142V,...,,,,,,,,,,
7,BRAF G643G,EGFR V536M,,EGFR F856L,EGFR R1052I,EGFR G42D,,BRAF L89L,KRAS G12C,EGFR R831H,...,,,,,,,,,,
8,,EGFR G42D,,EGFR D1014N,EGFR S811F,EGFR A21A,,,,EGFR G42D,...,,,,,,,,,,
9,,EGFR G598V,,EGFR L1167V,KRAS L19F,KRAS C118S,,,,KRAS C118S,...,,,,,,,,,,


In [None]:
#///////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////////////////

In [21]:
# are the cell names in the same order in all three per-gene frames?
# (prints one True/False per pairwise comparison)
for left, right in ((kras_muts, egfr_muts), (kras_muts, braf_muts), (braf_muts, egfr_muts)):
    print(left.cell.equals(right.cell))

True
True
True


In [22]:
# make a by-cell version of the dataframe: one row per cell, one column of
# mutation lists per gene.
# Build a NEW frame rather than aliasing egfr_muts: assigning new column names
# to an alias would also rename egfr_muts' own columns, removing
# `egfr_muts.mutations` for every other cell that uses it.
by_cell_df = pd.DataFrame({
    'cell': egfr_muts.cell,
    'egfr_muts': egfr_muts.mutations,
    'kras_muts': kras_muts.mutations,
    'braf_muts': braf_muts.mutations,
})

by_cell_df

Unnamed: 0,cell,egfr_muts,kras_muts,braf_muts
0,A10_B003523,0,0,0
1,H22_B003116,0,0,0
2,C14_B003777,0,0,0
3,I17_B003777,0,0,0
4,M16_B003518,0,0,0
5,H13_B003588,0,0,0
6,H9_B000860,0,0,0
7,J3_B003093,0,0,0
8,I19_1001000339,0,0,0
9,G17_B000862,0,0,0


In [23]:
# write the by-cell table to 'by_cell_df.csv' in the working directory, without the index column
by_cell_df.to_csv('by_cell_df.csv', index=False)

In [24]:
def get_unique_elements(colname, df=None):
    """ returns a list of the unique elements in a given column of mutation lists.

        Parameters
        ----------
        colname : str
            name of a column whose cells hold either a list/tuple of
            mutation names or a non-list placeholder (e.g. 0), which is
            skipped.
        df : pd.DataFrame, optional
            frame to read the column from. Defaults to the notebook-level
            `by_cell_df`.

        Returns
        -------
        list
            every distinct element found in the column's lists, in
            first-seen order. Elements must be hashable (mutation names
            are strings).
    """
    if df is None:
        df = by_cell_df  # fall back to the notebook-level by-cell table

    seen = set()   # O(1) membership test
    unique = []    # preserves first-seen order for the caller
    for entry in df[colname]:
        if not isinstance(entry, (list, tuple)):
            continue
        for sub_elm in entry:
            if sub_elm not in seen:
                seen.add(sub_elm)
                unique.append(sub_elm)

    return unique

In [25]:
# distinct mutations observed per gene across all cells
egfr_unique, kras_unique, braf_unique = (
    get_unique_elements(gene_col)
    for gene_col in ('egfr_muts', 'kras_muts', 'braf_muts')
)

In [26]:
# show the distinct EGFR mutations; swap in kras_unique or braf_unique
# below to inspect the other two genes
egfr_unique
#kras_unique
#braf_unique

['R521K',
 'T903T',
 'Q787Q',
 'T629T',
 'L858R',
 'N158N',
 'G652G',
 'K745_A750T',
 'S442I',
 'V1142V',
 'I1093M',
 'A237V',
 'A237Y',
 'D994D',
 'S921R',
 'L1167V',
 'G42D',
 'R1100S',
 'V536M',
 'L387M',
 'L1034I',
 'G598V',
 'D1014N',
 'R1052I',
 'S811F',
 'E709_T710D',
 'A21A',
 'F856L',
 'L833V',
 'V300M',
 'I91V',
 'I569I',
 'R831H']

In [7]:
# reload BRAF calls from the revised file and clean them in one step
braf_muts = char_strip(
    pd.read_csv(
        '/Users/lincoln.harris/Desktop/braf_AA_revised.csv',
        names=['cell', 'mutations'],
    )
)
braf_muts

Unnamed: 0,cell,mutations
0,A10_B003523,0
1,H22_B003116,0
2,C14_B003777,0
3,I17_B003777,0
4,M16_B003518,0
5,H13_B003588,0
6,H9_B000860,0
7,J3_B003093,0
8,I19_1001000339,0
9,G17_B000862,0


In [28]:
# reload EGFR calls from EGFR_AA.csv and clean them in one step
egfr_muts = char_strip(
    pd.read_csv(
        '/Users/lincoln.harris/Desktop/EGFR_AA.csv',
        names=['cell', 'mutations'],
    )
)
egfr_muts

Unnamed: 0,cell,mutations
0,A10_B003523,0
1,H22_B003116,0
2,C14_B003777,0
3,I17_B003777,0
4,M16_B003518,0
5,H13_B003588,0
6,H9_B000860,0
7,J3_B003093,0
8,I19_1001000339,0
9,G17_B000862,0


In [26]:
# which of these cells/samples have L858R?
# prints the sample name once per L858R hit, i.e. one line per matching cell entry
for idx, row in egfr_muts.iterrows():
    curr_cell, mutations = row.cell, row.mutations
    if mutations == 0:
        # 0 is the "no mutations" placeholder left by char_strip
        continue
    for mut in mutations:
        if mut != 'L858R':
            continue
        # look the cell up in the metadata to find its sample
        keep = meta.cell == curr_cell
        meta_row = meta[keep]
        curr_sample = list(meta_row.sample_name)[0]
        print(curr_sample)

LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S34
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21
LT_S21