In [3]:
import numpy as np
import pandas as pd

In [4]:
# Root folder of the project; later cells build their input/output paths from it.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the session.
dir ='/root/JupyterNotebook/kaggle-Redefining-Cancer-Treatment'

In [6]:
# Read the stage-2 variant tables from the project input folder.
train_path = dir+'/4 input for stage2/training_variants.csv'
test_path = dir+'/4 input for stage2/test_variants.csv'
train = pd.read_csv(train_path)
# The test table is given a constant Class column of 0.
test = pd.read_csv(test_path).assign(Class=0)

In [7]:
# Preview the first rows of the training variants table.
train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [8]:
# Preview the first rows of the test variants table (Class was set to 0 above).
test.head()

Unnamed: 0,ID,Gene,Variation,Class
0,3,WNT4,E216G,0
1,4,SUCLA2,G118R,0
2,6,CHEK2,E239K,0
3,7,CHST3,T141M,0
4,8,RNF6,G244D,0


In [9]:
# Report (rows, columns) of both tables, with labels padded to a common width.
for label, frame in (("Train shape", train), ("Test shape", test)):
    print(label.ljust(15), frame.shape)

Train shape     (3689, 4)
Test shape      (619, 4)


In [10]:
# Distinct gene names occurring in the training table.
genes_train = {gene for gene in train["Gene"]}
n_genes_train = len(genes_train)
print("%i train unique Genes" % n_genes_train)

269 train unique Genes


In [11]:
# Distinct gene names occurring in the test table.
genes_test = {gene for gene in test["Gene"]}
n_genes_test = len(genes_test)
print("%i test unique Genes" % n_genes_test)

167 test unique Genes


In [12]:
# Distinct variation strings occurring in the training table.
variations_train = {variation for variation in train["Variation"]}
n_variations_train = len(variations_train)
print("%i train unique Variations" % n_variations_train)

3309 train unique Variations


In [13]:
# Distinct variation strings occurring in the test table.
variations_test = {variation for variation in test["Variation"]}
n_variations_test = len(variations_test)
print("%i test unique Variations" % n_variations_test)

617 test unique Variations


In [14]:
# Genes that appear in both the training and the test table.
genes_shared = genes_train.intersection(genes_test)
n_genes_shared = len(genes_shared)
print("%i shared genes in train and test" % n_genes_shared)

35 shared genes in train and test


In [15]:
# Variation strings that appear in both the training and the test table.
variations_shared = variations_train.intersection(variations_test)
n_variations_shared = len(variations_shared)
print("%i shared variations in train and test" % n_variations_shared)

5 shared variations in train and test


In [16]:
# List the variation strings common to train and test (set order is arbitrary).
print(variations_shared)

{'E322K', 'D32H', 'G244D', 'G13R', 'Q61K'}


In [17]:
from collections import Counter
# Ten most frequent genes in the training table, as (gene, count) pairs.
gene_counts_train = Counter(train["Gene"])
top_gene_train = gene_counts_train.most_common(10)
print(top_gene_train)

[('BRCA1', 293), ('TP53', 182), ('EGFR', 157), ('PTEN', 140), ('BRCA2', 139), ('KIT', 110), ('BRAF', 104), ('ALK', 77), ('ERBB2', 76), ('PDGFRA', 67)]


In [18]:
# Ten most frequent genes in the test table, as (gene, count) pairs.
gene_counts_test = Counter(test["Gene"])
top_gene_test = gene_counts_test.most_common(10)
print(top_gene_test)

[('SCN4A', 26), ('TP53', 21), ('TSHR', 21), ('TP63', 16), ('LRP5', 13), ('SCN9A', 13), ('EGFR', 13), ('GNE', 13), ('ADAMTS13', 12), ('CLDN16', 12)]


In [19]:
# Ten most frequent variation strings in the training table, as (variation, count) pairs.
variation_counts_train = Counter(train["Variation"])
top_variation_train = variation_counts_train.most_common(10)
print(top_variation_train)

[('Truncating Mutations', 111), ('Deletion', 88), ('Amplification', 79), ('Fusions', 37), ('Overexpression', 6), ('Q61R', 4), ('G12V', 4), ('G12D', 3), ('G12C', 3), ('Q61H', 3)]


In [20]:
# Ten most frequent variation strings in the test table, as (variation, count) pairs.
variation_counts_test = Counter(test["Variation"])
top_variation_test = variation_counts_test.most_common(10)
print(top_variation_test)

[('G13R', 2), ('G13S', 2), ('Q300R', 1), ('R669H', 1), ('S180N', 1), ('S241A', 1), ('P63H', 1), ('R155P', 1), ('V1589M', 1), ('L274H', 1)]


# Next: derive additional features from the Variation column

In [21]:
# Stack train on top of test under a fresh 0..n-1 index so later
# column-wise joins line up row by row.
frames_to_stack = [train, test]
df_joint = pd.concat(frames_to_stack, ignore_index=True)
print("train+test rows:", len(df_joint))


train+test rows: 4308


In [22]:
# Inspect the last rows of the stacked frame (these come from the test table).
df_joint.tail()

Unnamed: 0,ID,Gene,Variation,Class
4303,982,TP63,S580P,0
4304,983,SCN4A,R672G,0
4305,984,BRAF,N581H,0
4306,985,TSHR,S281N,0
4307,986,ASS1,S180N,0


In [23]:
import re
# A plain substitution: one letter (or '*'), a position of 1-7 digits, one letter (or '*').
# The pattern is applied to lower-cased text.
rule_basic = re.compile(r'^([a-z\*])(\d{1,7})([a-z\*])$')

# Keyword tables. Within each table longer spellings come before their own
# substrings, and across tables 'delins' is tested before 'del' and 'ins', so the
# most specific keyword wins.
truncating_list = ['truncating mutations','trunc']
delins_list = ['insertions/deletions','deletion/insertion','delins']
# NOTE(review): 'fs'/'fs*' usually denote a frameshift in HGVS-style notation, yet
# they are labelled 'fusion' here -- confirm this is intended before changing it.
fusion_list = ['fusions','fusion','fus','fs*','fs']
deletion_list = ['deletions','deletion','del']
ins_list = ['insertions','insertion','ins']
amplification_list = ['amplification']
overexpression_list = ['overexpression']
splice_list = ['splice']
duplication_list = ['duplications','duplication','dup']
# lookup_list[i] holds the keywords that map to operation_list[i].
lookup_list = [truncating_list, delins_list, fusion_list, deletion_list,ins_list, 
               amplification_list, overexpression_list,splice_list, duplication_list ]
operation_list = ['truncating', 'delins','fusion', 'deletion', 'insertion', 'amplification', 
                   'overexpression','splice','duplication']
# Filler tokens that are blanked out of the leftover text.
unused_list = ['mutations','_','-']

def var_to_pattern(variation):
    """Split a variation string into a small dict of categorical features.

    Parameters
    ----------
    variation : str
        Raw variation text, e.g. ``'S580P'``, ``'Truncating Mutations'`` or
        ``'T599_V600insV'``. Matching is case-insensitive.

    Returns
    -------
    dict
        ``first_letter``    -- leading letter of a substitution, else ``'none'``
        ``location_number`` -- position of a substitution as ``int``, else ``0``
        ``last_letter``     -- trailing letter of a substitution, else ``'none'``
        ``operation``       -- ``'substitution'``, one of ``operation_list``, or ``'other'``
        ``other_text``      -- leftover text once the operation keyword and filler
                               tokens are removed, else ``'none'``

    Notes
    -----
    * ``'null'`` (any case) is treated as ``'*'``; a ``'*'`` on either side of a
      substitution leaves that letter as ``'none'``.
    * Only the first matching keyword (in ``lookup_list`` order) decides the operation.
    """
    # Lower-case first so the 'null' marker is recognised whatever its case.
    variation = variation.lower().replace('null','*')
    result = {'first_letter':'none','location_number':0,'last_letter':'none','operation':'other','other_text':'none'}
    words = variation.split(' ')
    m = rule_basic.match(words[0])
    if len(words) == 1 and m:
        # Substitution case.
        result['operation'] = 'substitution'
        if m.group(1) != '*':
            result['first_letter'] = m.group(1)
        # Stored as int so the column has one type (the default is the int 0).
        result['location_number'] = int(m.group(2))
        if m.group(3) != '*':
            result['last_letter'] = m.group(3)
        return result

    # Keyword case: the first table with a keyword contained in the text wins.
    temp_txt = variation
    for operation, keywords in zip(operation_list, lookup_list):
        hit = next((x for x in keywords if x in variation), None)
        if hit is not None:
            result['operation'] = operation
            temp_txt = variation.replace(hit,' ')
            break

    # Whatever remains after dropping filler tokens becomes other_text.
    for x in unused_list:
        temp_txt = temp_txt.replace(x,' ')
    temp_txt = temp_txt.strip()
    if temp_txt:  # empty or whitespace-only leftovers keep the 'none' sentinel
        temp_words = temp_txt.split(' ')
        # A short trailing token (fewer than 3 characters) is discarded when there
        # is more than one token.
        if len(temp_words) != 1 and len(temp_words[-1]) < 3:
            del temp_words[-1]
        result['other_text'] = ' '.join(temp_words)

    return result

In [24]:
# Parse every variation string into its feature dict (one dict per row).
variation_column = df_joint["Variation"]
pattern = variation_column.apply(var_to_pattern)

In [25]:
# Expand the per-row dicts into a frame with one column per dict key.
pattern_records = pattern.tolist()
pattern_df = pd.DataFrame(pattern_records)

In [26]:
# Attach the parsed feature columns to the right of the stacked train+test frame.
done_df = pd.concat([df_joint, pattern_df], axis="columns")

In [27]:
# Write the enriched table to the intermediate-results folder, without the index.
var_done_path = dir+'/5 middle for stage2/var_done.csv'
done_df.to_csv(var_done_path, index=False)

In [29]:
# Check the last rows of the enriched table, including the parsed feature columns.
done_df.tail()

Unnamed: 0,ID,Gene,Variation,Class,first_letter,last_letter,location_number,operation,other_text
4303,982,TP63,S580P,0,s,p,580,substitution,none
4304,983,SCN4A,R672G,0,r,g,672,substitution,none
4305,984,BRAF,N581H,0,n,h,581,substitution,none
4306,985,TSHR,S281N,0,s,n,281,substitution,none
4307,986,ASS1,S180N,0,s,n,180,substitution,none
