# Looking_up_synonyms_handling_code 

Drafting code to assist in making code text to place in `Standardizing_identifier_order_in_humap2-provided_csv.ipynb`.
This was to mainly deal with synonyms, but I added generating code text dealing with another special case (where gaps seem to be in Uniprot_ACCs column because UniProtKb identifier removed it seems) that I came across while iterating on those rows containing 'SPECIAL_unaccounted_gene' and finding out what situation is that made them special.

In [2]:
import pandas as pd
df = pd.read_csv('DRAFThumap2_complexes_20200809InOrderMatched.csv')
df.head()

Unnamed: 0,HuMAP2_ID,Confidence,Uniprot_ACCs,genenames
0,HuMAP2_00000,3,Q9BQS8 O95900,FYCO1 TRUB2
1,HuMAP2_00001,4,P68402 Q15102 P08133 Q15797 Q99426 Q9H4M9,PAFAH1B2 PAFAH1B3 ANXA6 SMAD1 TBCB EHD1
2,HuMAP2_00002,5,Q9UF11 A1KXE4 Q15038 Q6ZRY4 O43251 Q9Y6M7 Q930...,PLEKHB1 FAM168B DAZAP2 RBPMS2 RBFOX2 SLC4A7 RB...
3,HuMAP2_00003,5,O14974 Q8WUM9 Q9Y5Y0 Q15836 Q16563 Q14919 Q299...,PPP1R12A SLC20A1 FLVCR1 VAMP3 SYPL1 DRAP1 MICA...
4,HuMAP2_00004,4,Q8WV99 Q49A92 Q9NQT8 Q9H672 P20774,ZFAND2B C8orf34 KIF13B ASB7 OGN


In [3]:
pattern = fr'\bSPECIAL_unaccounted_gene\b' # Create a regex pattern with word boundaries
rows_with_term_df = df[df['Uniprot_ACCs'].str.contains(pattern, case=False, regex=True)]

In [4]:
rows_with_term_df.head()

Unnamed: 0,HuMAP2_ID,Confidence,Uniprot_ACCs,genenames
6,HuMAP2_00006,4,Q96C28 O43296 Q5CZA5 P07199 SPECIAL_unaccounte...,ZNF707 ZNF264 ZNF805 CENPB ZNF678
11,HuMAP2_00011,5,Q9BXX0 Q3ZCT1 O00339 Q99435 Q92832 SPECIAL_una...,EMILIN2 ZNF260 MATN2 NELL2 NELL1 C4orf48
34,HuMAP2_00035,5,Q9P2F8 Q5PRF9 O15063 Q8N5S9 SPECIAL_unaccounte...,SIPA1L2 SAMD4B GARRE1 CAMKK1 KIAA0355
56,HuMAP2_00058,5,Q05DH4 Q8N612 Q9H8T0 Q9UJC3 Q86VS8 Q13506 Q157...,FHIP1A FHIP1B AKTIP HOOK1 HOOK3 NAB1 NAB2 CTIF...
68,HuMAP2_00070,4,Q6P1J9 Q9GZS3 Q8WVC0 Q9HCK8 Q8N7H5 Q14241 Q6PD...,CDC73 SKIC8 LEO1 CHD8 PAF1 ELOA CTR9 WDR61


In [4]:
len(rows_with_term_df)

698

Only run this when making initial collection:

In [8]:
'''
# make look up dicts for the synonyms of the UniProt identifiers in the rows that have `SPECIAL_unaccounted_gene`
from unipressed import UniprotkbClient
import time
intermed_df = rows_with_term_df.copy()
intermed_df['Uniprot_ACCs'] = intermed_df['Uniprot_ACCs'].str.split()
expanded_df = intermed_df.explode(['Uniprot_ACCs']).copy()
accs = list(set(expanded_df['Uniprot_ACCs'].to_list()))
acc2synonyms_look_up_dict = {}
genename2synonyms_look_up_dict = {}
synonyms2genenames_look_up_dict = {}
synonyms2acc_look_up_dict = {}
for acc in accs:
    if acc != 'SPECIAL_unaccounted_gene':
        #print(acc) # ONLY FOR DEBUGGING. Uncomment when debugging
        uniprot_record = UniprotkbClient.fetch_one(acc)
        if 'genes' in uniprot_record:
            genename = '; '.join([x['geneName']['value'] for x in uniprot_record['genes']])
            synonyms = []
            if uniprot_record['genes']:
                for i in uniprot_record['genes']:
                    if 'synonyms' in i:
                        for s in i['synonyms']:
                            synonyms.append(s['value'])
            if synonyms:
                acc2synonyms_look_up_dict[acc] = synonyms
                genename2synonyms_look_up_dict[genename] = synonyms
                for syn in synonyms:
                    synonyms2genenames_look_up_dict[syn] = genename
                    synonyms2acc_look_up_dict[syn] = acc
            else:
                acc2synonyms_look_up_dict[acc] = 'None_reported'
                genename2synonyms_look_up_dict[genename] =  'None_reported'
            time.sleep(1.1) # don't slam the API
        else:
            print(f"No 'genes' in uniprot_record for {acc}. Here is the record:{uniprot_record}")
''';

No 'genes' in uniprot_record for Q6ZMK1. Here is the record:{'entryType': 'Inactive', 'primaryAccession': 'Q6ZMK1', 'uniProtkbId': 'CYHR1_HUMAN', 'annotationScore': 0.0, 'inactiveReason': {'inactiveReasonType': 'DEMERGED', 'mergeDemergeTo': ['P0DTL5', 'P0DTL6']}}
No 'genes' in uniprot_record for P04745. Here is the record:{'entryType': 'Inactive', 'primaryAccession': 'P04745', 'uniProtkbId': 'AMY1A_HUMAN', 'annotationScore': 0.0, 'inactiveReason': {'inactiveReasonType': 'DEMERGED', 'mergeDemergeTo': ['P0DTE7', 'P0DUB6', 'P0DTE8']}}
No 'genes' in uniprot_record for P0DN76. Here is the record:{'entryType': 'Inactive', 'primaryAccession': 'P0DN76', 'uniProtkbId': 'U2AF5_HUMAN', 'annotationScore': 0.0, 'inactiveReason': {'inactiveReasonType': 'DELETED', 'deletedReason': 'Deleted from Swiss-Prot'}, 'extraAttributes': {'uniParcId': 'UPI0000000C26'}}
No 'genes' in uniprot_record for A6NLF2. Here is the record:{'entryType': 'Inactive', 'primaryAccession': 'A6NLF2', 'uniProtkbId': 'ELB3D_HUMAN'

In [13]:
# if need to read in the substantial lookup dictionaries
'''
import pickle
with open("acc2synonyms_look_up_dict.pkl", "rb") as f:
        acc2synonyms_look_up_dict = pickle.load(f)
print(len(list(acc2synonyms_look_up_dict.keys())))
with open("genename2synonyms_look_up_dict.pkl", "rb") as f:
        genename2synonyms_look_up_dict = pickle.load(f)
print(len(list(genename2synonyms_look_up_dict.keys())))
with open("synonyms2genenames_look_up_dict.pkl", "rb") as f:
        synonyms2genenames_look_up_dict = pickle.load(f)
print(len(list(synonyms2genenames_look_up_dict.keys())))
with open("synonyms2acc_look_up_dict.pkl", "rb") as f:
        synonyms2acc_look_up_dict = pickle.load(f)
print(len(list(synonyms2acc_look_up_dict.keys())))
'''

# if need to save the substantial lookup dictionaries again for some reason
'''
import pickle
with open("acc2synonyms_look_up_dict.pkl", "wb") as f:
        pickle.dump(acc2synonyms_look_up_dict, f)
with open("genename2synonyms_look_up_dict.pkl", "wb") as f:
        pickle.dump(genename2synonyms_look_up_dict, f)
with open("synonyms2genenames_look_up_dict.pkl", "wb") as f:
        pickle.dump(synonyms2genenames_look_up_dict, f)
with open("synonyms2acc_look_up_dict.pkl", "wb") as f:
        pickle.dump(synonyms2acc_look_up_dict, f)
''';

In [56]:
# FOR ONLY USING DURING DEVELOPMENT TO CHECK, make sure set to something like `for acc in accs[:10]:` in where making look up dicts for synonyms above 
'''
print(acc2synonyms_look_up_dict)
print(genename2synonyms_look_up_dict)
print(synonyms2genenames_look_up_dict)
print(synonyms2acc_look_up_dict)
''';

{'O15063': ['KIAA0355']}
{'GARRE1': ['KIAA0355']}
{'KIAA0355': 'GARRE1'}
{'KIAA0355': 'O15063'}


In [1]:
import pickle
with open("look_up_dict_for_h2nh3_all_14188_Uniprot_ACCs.pkl", "rb") as f:
        merged_lookup_dict = pickle.load(f)
len(list(merged_lookup_dict.keys()))

14188

In [10]:
# With the look up dicts related to the synonyms generated for the UniProt identifiers in the rows that have `SPECIAL_unaccounted_gene`

all_presumed_synonyms = []
'''
EXAMPLE CODE THAT EACH PRESUME_SYNONYM SHOULD MAKE where top line gene name is what fixed handling uses and bottom is what the authors had in hu.MAP 2 text data file:
if matched_genename == 'GARRE1':
    matched_genename = 'KIAA0355' # makes it match what author provided file had for that one; they used its synonym
''' 
genenames_already_dealt_with = ['KIAA0355','WDR61','CCDC84','C17orf80','C3orf14','AKAP2','CYHR1','AMY1A','U2AF1L5','ELOA3D','TCP10L2','UGT2A1','TCP10','PRAMEF9','CBSL','HSP90AB4P','HSP90AB3P','HSP90AB2P','HSP90AA5P','H3-2','RIPK4','ZNF678','C4orf48','WASH6P','IGHG4','LINC01587','FAM90A26','FAM90A5P','MSL3P1','PI4KAP1','ZRSR2P1','DENND10P1','LILRB2','ALDH3B2','FAM153CP','FAM153B','FRG1BP','TUBB7P','CHCHD2P9','OFCC1','SNRPGP15' ,'IGHA1' ,'NME2P1' ,'DSCR4' ,'PABPC4L' ,'IGLC7' ,'SSX6P' ,'GPATCH4' ,'CEP170P1' ,'FRMD8P1' ,'CROCCP2' ,'OR4K3' ,'FBXL18' ,'HMGB1P1' ,'FBLL1' ,'WASH3P' ,'ABHD18' ,'PZP' ,'IGLC3' ,'HECTD4' ,'SOWAHA' ,'SIGLEC16' ,'TEX15' ,'PPP1R15B' ,'PRSS46P' ,'PLEKHA8P1' ,'SAC3D1' ,'KIR3DP1' ,'MRPL45' ,'NEDD8-MDP1' ,'HSPA7' ,'ARMCX4' ,'IGHM' ,'PIPSL' ,'APOBEC3D' ,'POTEKP' ,'IGKC' ,'CIRBP-AS1' ,'ZSCAN12' ,'APOA4' ,'IGHG1' ,'CTSL3P' ,'ZNF724' ,'PRKY' ,'NPIPB7' ,'DHRS4L2' ,'HLA-H' ,'IGHA2' ,'PRR5-ARHGAP8' ,'EP400P1' ,'PATJ' ,'LILRB3' ,'TRAC' ,'POM121C' ,'IGLC2' ,'LILRA6' ,'IGHG3' ,'IGHG2' ,'RUNDC1' ,'RPS26P11' ,'HSP90AA4P','LINC01667','LOC100507703','KLRA1P','MRRFP1','INTS4P2','C9orf106','GGT2','GK3P','RPSAP58','USP41'] # I had already added special handling for some and so  
# don't want unnecessary code generated; this includes one where genename removed from UniProt, where authors for some reason seemed to remove uniprot id and made gap, as well as a couple of synonyms I had added handling for already
# Plus I don't want to deal here with examineing with the ones with semi-colons in gene name becuase should be handled now.
genenames_already_dealt_with = [v.split(';')[0] for v in merged_lookup_dict.values() if ';' in v] + genenames_already_dealt_with    
syn_special_handling_dict = {}
syn_special_handling_code_text = ""
presumed_synonyms = []
for row in rows_with_term_df.itertuples():
    #print(row)# ONLY FOR DEBUGGING. Uncomment when debugging
    special_tag_text = "SPECIAL_unaccounted_gene"
    special_tag_num = 0
    
    if special_tag_text in row.Uniprot_ACCs:
        special_tag_num += row.Uniprot_ACCs.count(special_tag_text)
    if special_tag_num:
        presumed_synonyms_per_row = []
        genenames_text_list = row.genenames.split()
        for num in range(special_tag_num):
            collected_genename = genenames_text_list.pop(-1)
            # check for semi-colon ones here since they may mess up things if not handling
            if ';' in collected_genename:
                print(f"WARNING: `{genename}` has a semi-colon in it. A problem here or down the line?")
            if (collected_genename not in presumed_synonyms) and (collected_genename not in genenames_already_dealt_with):
                presumed_synonyms.append(collected_genename) # because want to only add if not already in list and check if has semi-colons, cannot simply use `presumed_synonyms.append(genenames_text_list.pop(-1))`
                presumed_synonyms_per_row.append(collected_genename) 
                # Now to find what UniProt identifier in the `Uniprot_ACCs` for
                # this row matches with the collected presumed synonym. 
                # That should then give the two pieces necessary to build the
                # code string for special handling during the accounting to 
                # build a balanced fixed row
                id_list = row.Uniprot_ACCs.split()
                corresponding_acc = None
                for current_id in id_list:
                    if (current_id != special_tag_text) and (collected_genename in acc2synonyms_look_up_dict[current_id]):
                        corresponding_acc = current_id
                if corresponding_acc == None:
                    print(f"WARNING: No match seen for {collected_genename} among {' '.join(id_list )}")
                else:
                    syn_special_handling_dict[collected_genename] = (merged_lookup_dict[corresponding_acc],collected_genename) # THIS 
                # IS THE MAIN POINT OF COLLECTION. First item collected in this tuple will be the gene name that is primary
                # gene name matching the UniProt, and the second item in the tuple will be the synonym authors used. AND
                # I can use those two points of information to make code text that will handle these and avoid adding
                # instances of `SPECIAL_unaccounted_gene` to the `Uniprot_ACCs` column when there shouldn't be 
                # because balance already handled but missed synonym when doing balacning/accounting steps.
# now that done collecting all the parts, use `syn_special_handling_dict` to
# make the `syn_special_handling_code_text`
print("\n\n\n####----------------------------------------------------------------####")
print("####----------------------------------------------------------------####")
print("BELOW IS THE CODE GENERATED WITH THE INFORMATION TO ADD SPECIAL HANDLING:")
print("####----------------------------------------------------------------####")
print("####----------------------------------------------------------------####\n\n")
for v in syn_special_handling_dict.values():
    syn_special_handling_code_text += f"if matched_genename == '{v[0]}':\n    matched_genename = '{v[1]}' # makes it match what author provided file had for that one; they used its synonym\n"
print(syn_special_handling_code_text)




####----------------------------------------------------------------####
####----------------------------------------------------------------####
BELOW IS THE CODE GENERATED WITH THE INFORMATION TO ADD SPECIAL HANDLING:
####----------------------------------------------------------------####
####----------------------------------------------------------------####


if matched_genename == 'FHIP1B':
    matched_genename = 'FAM160A2' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'FHIP1A':
    matched_genename = 'FAM160A1' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'IFT70B':
    matched_genename = 'TTC30B' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'IFT70A':
    matched_genename = 'TTC30A' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'IFT56':
 

(Note:
I later noted an issue with this section from that outout:

```python
if matched_genename == 'LARS1':
    matched_genename = 'LARS' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'EPRS1':
    matched_genename = 'EPRS' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'EPRS1':
    matched_genename = 'QARS' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'MARS1':
    matched_genename = 'MARS' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'IARS1':
    matched_genename = 'IARS' # makes it match what author provided file had for that one; they used its synonym
if matched_genename == 'DARS1':
    matched_genename = 'DARS' # makes it match what author provided file had for that one; they used its synonym
```

In particular, note that `if matched_genename == 'EPRS1':` gets listed twice!!
`P07814`, which is EPRS1 matches to synonym of `QARS` and so does `P47897`, which is `QARS1`.
It seems for `QARS` it picked the one the authots didn't use. **I had to hard code change subsequent to here for this one!!**
Want:
```python
if matched_genename == 'QARS1':
    matched_genename = 'QARS' # makes it match what author provided file had for that one; they used its synonym
```
I guess if I was redesigning, I'd add in a check make sure no two repeats for `if matched_genename == 'EPRS1':` in printout.   
I'm hoping I didn't miss anything else insidious like that.)

The 'WARNING:' ones at the top (OF THESE INTERMEDIATE RESULTS - I ADDED HANDLING OF MOST OF THESE SINCE THAT [read on in this markdown cell for referencing that] HAD BEEN RUN SO IT WON'T LOOK LIKE THAT IF YOU RUN IT NOW) are ones that the synonym handling I was adding doesn't seem to address. Mainly these seem to be cases where it looks like in the `Uniprot_ACCs` column of the author provided `humap2_complexes_20200809.txt` file they did some processing that seemed to end up leaving a gap but removed identifiers for some gene names in the `genenames` column. That is speculation based on the corresponding 'gaps' I see for those (or most of those) I investigated by hand. Anyways, to keep balanced, the genenames need something on the `Uniprot_ACCs` column side.
I did some of that by hand and once I figured out the pattern, I drafted code in the notebook `drafting_No_Match_handling.ipynb` to make additional code pieces I need for all the remaining to add in to `Standardizing_identifier_order_in_humap2-provided_csv.ipynb` for those, too.  
Putting the produced here for a more complete record without needing to go over to `drafting_No_Match_handling.ipynb`.

In [1]:
# Drafting code to handle additional drafting code for 'No Match' examples identified during synonyms work
# This is the string I got when running the code above at some point
s = '''WARNING: No match seen for LILRA6 among O75019 Q8IZ02 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for LILRB3 among O75019 Q8IZ02 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for NEDD8-MDP1 among Q9H2D1 Q9NWH7 SPECIAL_unaccounted_gene
WARNING: No match seen for GK3P among P32189 SPECIAL_unaccounted_gene
WARNING: No match seen for HMGB1P1 among P26583 P09429 P23497 SPECIAL_unaccounted_gene
WARNING: No match seen for NME2P1 among Q9H4I3 P22392 O00746 Q13232 P22392 SPECIAL_unaccounted_gene
WARNING: No match seen for MRPL45 among Q8TAE8 Q9UBX3 Q9NVS2 Q13405 Q6P161 Q96EL3 Q7Z7H8 Q9NWU5 Q7Z2W9 Q13084 Q9NQ50 Q8IXM3 Q86TS9 Q9BYC9 Q9P0M9 Q8TCC3 Q9NRX2 Q96DV4 Q9H0U6 P52815 Q9Y3B7 Q9H9J2 Q9HD33 Q96GC5 Q7Z7F7 Q9NP92 Q9NYK5 Q9BZE1 Q14197 Q16540 Q9NX20 Q4U2R6 Q9BQC6 Q9BYD1 Q9BYC8 P49406 Q9BQ48 Q96EH3 Q9NZE8 Q6P1L8 Q8N983 Q9H2W6 Q9Y6G3 O75394 Q96A35 Q8N5N7 Q9P015 P09001 Q9BYD6 Q5T653 Q9BYD3 Q9BYD2 SPECIAL_unaccounted_gene
WARNING: No match seen for CIRBP-AS1 among Q15042 Q02750 P36507 P15056 P01116 P10398 P04049 Q8IVT5 SPECIAL_unaccounted_gene
WARNING: No match seen for INTS4P2 among Q5TA45 Q96HW7 Q9NVH2 Q6P9B9 Q9NV88 Q9UL03 Q13315 SPECIAL_unaccounted_gene
WARNING: No match seen for PRKY among Q16644 O75582 O75676 Q16539 P31260 Q96CC6 Q15759 Q9BY84 Q5VYV7 Q9HBH9 P31269 P17482 O75147 Q32MK0 Q15256 Q9H1R3 Q9BUB5 SPECIAL_unaccounted_gene
WARNING: No match seen for TEX15 among P01008 Q99941 P43251 SPECIAL_unaccounted_gene
WARNING: No match seen for GGT2 among P19440 SPECIAL_unaccounted_gene
WARNING: No match seen for PRR5-ARHGAP8 among P85298 Q8WXA8 SPECIAL_unaccounted_gene
WARNING: No match seen for PATJ among Q9Y2J4 Q9HAP6 Q9NUP9 Q8N3R9 Q9BUF7 O75970 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for SNRPGP15 among P63162 P62306 SPECIAL_unaccounted_gene
WARNING: No match seen for PZP among Q96S86 Q641Q3 Q9UJH8 Q08397 Q8WUX2 P22352 P01023 SPECIAL_unaccounted_gene
WARNING: No match seen for PRSS46P among Q9UHI8 Q9UI38 SPECIAL_unaccounted_gene
WARNING: No match seen for IGHG3 among P35219 SPECIAL_unaccounted_gene
WARNING: No match seen for RPS26P11 among Q9BU19 Q02539 SPECIAL_unaccounted_gene
WARNING: No match seen for ABHD18 among Q3V5L5 SPECIAL_unaccounted_gene
WARNING: No match seen for IGHA1 among O75556 Q8NFR7 P02788 P61626 SPECIAL_unaccounted_gene
WARNING: No match seen for EP400P1 among Q96HT8 Q15014 Q9UBU8 Q9Y605 Q86YC2 SPECIAL_unaccounted_gene
WARNING: No match seen for DSCR4 among P02655 P01350 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for CROCCP2 among P02655 P01350 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for SOWAHA among Q53LP3 Q9HCM7 Q8WXX7 P61962 SPECIAL_unaccounted_gene
WARNING: No match seen for POM121C among Q96HA1 SPECIAL_unaccounted_gene
WARNING: No match seen for LINC01667 among O00481 P78410 O00478 SPECIAL_unaccounted_gene
WARNING: No match seen for CTSL3P among P07711 SPECIAL_unaccounted_gene
WARNING: No match seen for KIR3DP1 among Q14954 P43631 Q14953 P43629 SPECIAL_unaccounted_gene
WARNING: No match seen for AKAP2 among Q9Y2D5 Q96SN8 P31323 P13861 P57105 Q68DQ2 P22612 P17612 P22694 Q9UKA4 Q12802 Q86UN6 Q99996 Q92667 P24588 Q76N32 P11137 P51817 Q96DH6 SPECIAL_unaccounted_gene
WARNING: No match seen for FBLL1 among P49184 SPECIAL_unaccounted_gene
WARNING: No match seen for IGHA2 among Q96Q45 P11678 SPECIAL_unaccounted_gene
WARNING: No match seen for GPATCH4 among Q8NDF8 SPECIAL_unaccounted_gene
WARNING: No match seen for ZSCAN12 among P14384 SPECIAL_unaccounted_gene
WARNING: No match seen for USP41 among Q9UMW8 SPECIAL_unaccounted_gene
WARNING: No match seen for TRAC among Q15545 Q86UD1 SPECIAL_unaccounted_gene
WARNING: No match seen for RUNDC1 among P09917 SPECIAL_unaccounted_gene
WARNING: No match seen for PABPC4L among Q8NEY8 Q86U42 Q8WXF0 Q96MU7 O75494 Q96N46 Q8IYB3 P62995 Q16629 Q13247 Q13243 Q13595 Q9UQ35 Q9UHC7 O00507 Q07955 Q13242 P84103 Q08170 Q9BRL6 Q13427 P49761 P49759 P49760 Q9NWH9 SPECIAL_unaccounted_gene
WARNING: No match seen for CCDC84 among Q86UT8 P54105 P62316 Q969L4 P63162 P14678 P62306 P62304 P62308 Q8IYT2 Q96GM8 SPECIAL_unaccounted_gene
WARNING: No match seen for HSP90AA4P among Q8TDR2 Q14004 Q9Y243 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for FRMD8P1 among Q9BZ67 SPECIAL_unaccounted_gene
WARNING: No match seen for HSPA7 among Q5T124 O15229 SPECIAL_unaccounted_gene
WARNING: No match seen for IGLC7 among B9A064 P15814 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for IGLC2 among B9A064 P15814 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for IGLC3 among B9A064 P15814 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for APOBEC3D among Q8IUX4 Q96E39 SPECIAL_unaccounted_gene
WARNING: No match seen for HECTD4 among Q08380 Q8NCJ5 O75592 Q9Y223 SPECIAL_unaccounted_gene
WARNING: No match seen for CEP170P1 among Q5SW79 SPECIAL_unaccounted_gene
WARNING: No match seen for WASH3P among Q9Y3C0 Q9UM21 O75143 O75385 SPECIAL_unaccounted_gene
WARNING: No match seen for SIGLEC16 among Q96LC7 SPECIAL_unaccounted_gene
WARNING: No match seen for C17orf80 among Q969F1 Q9BSJ5 Q9H825 Q9UKD1 Q9Y692 P16070 SPECIAL_unaccounted_gene
WARNING: No match seen for FBXL18 among Q96A19 Q96EW2 Q9Y620 P43355 Q8IWZ3 O15480 O15481 Q8N7X4 Q07617 SPECIAL_unaccounted_gene
WARNING: No match seen for ARMCX4 among Q9H3H5 P15812 SPECIAL_unaccounted_gene
WARNING: No match seen for SAC3D1 among Q8NF86 SPECIAL_unaccounted_gene
WARNING: No match seen for RPSAP58 among Q14657 Q86X02 P25208 P08865 SPECIAL_unaccounted_gene
WARNING: No match seen for C9orf106 among P05814 SPECIAL_unaccounted_gene
WARNING: No match seen for POTEKP among Q9NVL8 Q8TC41 Q9ULC3 P02774 SPECIAL_unaccounted_gene
WARNING: No match seen for SSX6P among Q16385 Q16385 Q99909 Q7RTT5 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for APOA4 among P69905 P19823 P02652 Q8N9F7 P02656 Q9H1J1 Q14624 P19652 P02763 P04217 P69891 P68871 P02768 P00751 P02790 P02042 P00915 P01024 P02787 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for LOC100507703 among P04439 SPECIAL_unaccounted_gene
WARNING: No match seen for IGKC among O75635 O00204 Q9UMR2 P26447 P16050 Q9UBH0 P29373 Q9Y6R7 P01024 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for IGHG2 among O75635 O00204 Q9UMR2 P26447 P16050 Q9UBH0 P29373 Q9Y6R7 P01024 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for IGHG1 among O75635 O00204 Q9UMR2 P26447 P16050 Q9UBH0 P29373 Q9Y6R7 P01024 SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene SPECIAL_unaccounted_gene
WARNING: No match seen for OR4K3 among Q9NQG5 P19387 O15514 P24928 P30876 P62487 P52434 P19388 P61218 P36954 P52435 O94762 Q96P16 P53803 P62875 Q5VT52 P0CAP1 Q9Y5B0 Q8IXW5 Q92576 Q9HCN4 Q9GZY4 Q9UHW5 SPECIAL_unaccounted_gene
WARNING: No match seen for ZNF724 among Q3ZCQ8 Q9UEG4 Q6ZMS4 SPECIAL_unaccounted_gene
WARNING: No match seen for IGHM among Q86TB9 Q6PJG9 SPECIAL_unaccounted_gene
WARNING: No match seen for NPIPB7 among Q9Y689 O95136 P07360 SPECIAL_unaccounted_gene
WARNING: No match seen for PLEKHA8P1 among Q96JA3 SPECIAL_unaccounted_gene
WARNING: No match seen for DHRS4L2 among Q9BTZ2 SPECIAL_unaccounted_gene
WARNING: No match seen for PPP1R15B among P43115 SPECIAL_unaccounted_gene
WARNING: No match seen for HLA-H among Q13296 P13747 O95672 SPECIAL_unaccounted_gene
WARNING: No match seen for MRRFP1 among Q96E11 SPECIAL_unaccounted_gene
WARNING: No match seen for PIPSL among Q96I34 Q99755 O60331 P53611 Q86XD5 Q9H832 Q8N414 Q15788 P10114 O75781 Q6PGQ7 SPECIAL_unaccounted_gene
WARNING: No match seen for C3orf14 among Q9NW75 Q9C073 Q9NZM6 Q7RTV5 Q9NVR0 P29459 Q96BQ1 Q8NDH6 Q9HBI5 Q96ST8 Q9NRD5 Q05084 Q13563 Q9BX67 Q9Y624 SPECIAL_unaccounted_gene
WARNING: No match seen for KLRA1P among Q8NDV1 O15466 Q9NPC4 P30511 SPECIAL_unaccounted_gene
'''

In [2]:
# Make list
text_signal = "WARNING: No match seen for "
genenames_with_no_match = []
lines_with_identifier = 0
for l in s.split('\n'):
    if text_signal in l:
        #print(l) #For DEBUGGING; uncomment when debugging, make for loop line like `for l in s.split('\n')[:10]` to limit
        lines_with_identifier += 1
        gn = l.split(text_signal,1)[1].split()[0]
        genenames_with_no_match.append(gn)
print(f"There were {lines_with_identifier} lines with the text '{text_signal}' and\n{len(genenames_with_no_match)} identifiers mined from those lines.") #sanity check
# limit those to unique to remove possible duplicates because why waste time repeating lookup more than once.
genenames_with_no_match = list(set(genenames_with_no_match))
print(f"After step to remove any possible duplicates, there were {len(genenames_with_no_match)} unique identifiers mined from those lines.")

75 identifiers mined from those lines.
After step to remove any possible duplicates, there were 75 unique identifiers mined from those lines.


In [3]:
from unipressed import IdMappingClient
import time
request = IdMappingClient.submit(
    source="GeneCards", dest="UniProtKB", ids=genenames_with_no_match
)
time.sleep(len(genenames_with_no_match)*1.1)
results_list = list(request.each_result())

In [6]:
# See if any missed.
# Turns out some like `PRR5-ARHGAP8` map to several, in this example: 'B1AHC3','B1AHC4','H0Y9T8'
# and so cannot just use `results_list` directly.
# Let's just take first instance listed for those with more than one:
raw_results_list = results_list.copy()
results_list = [next((d for d in results_list if d['from'] == x), x) for x in {d['from'] for d in results_list}]
print(f"Number of gene names submitted: {len(genenames_with_no_match)}")
print(f"Number of gene names mapped to Human Genes: {len(results_list)}")
gns_mapped = [x['from'] for x in results_list]
missing = list(set(genenames_with_no_match) - set (gns_mapped))
print(f"Those {len(missing)} not mapped so far: {missing}")

Number of gene names submitted: 75
Number of gene names mapped to Human Genes: 61
Those 14 not mapped so far: ['LINC01667', 'LOC100507703', 'CCDC84', 'AKAP2', 'KLRA1P', 'MRRFP1', 'C17orf80', 'INTS4P2', 'C9orf106', 'GGT2', 'GK3P', 'C3orf14', 'RPSAP58', 'USP41']


In [17]:
# to show what this looks like for understanding code below
print(results_list[:4])

[{'from': 'SNRPGP15', 'to': 'A8MWD9'}, {'from': 'IGHA1', 'to': 'P01876'}, {'from': 'NME2P1', 'to': 'O60361'}, {'from': 'DSCR4', 'to': 'P56555'}]


14 is a much more manageable list to deal with by hand.

Before doing that going to get the pieces of 

For piecing together the handling for these I need these three pieces for each, like these for gene name `OFCC1`:
OFCC1 Q8IZS5


special_genename_lookup_dict['OFCC1'] = 'Q8IZS5'


`,'OFCC1'`

In [20]:
# Make the piece #1 pieces
[print(f"{d['from']} {d['to']}")for d in results_list];

SNRPGP15 A8MWD9
IGHA1 P01876
NME2P1 O60361
DSCR4 P56555
PABPC4L P0CB38
IGLC7 A0M8Q6
SSX6P Q7RTT6
GPATCH4 Q5T3I0
CEP170P1 Q96L14
FRMD8P1 Q9BZ68
CROCCP2 Q86T23
OR4K3 Q96R72
FBXL18 Q96ME1
HMGB1P1 B2RPK0
FBLL1 A6NHQ2
WASH3P C4AMC7
ABHD18 Q0P651
PZP P20742
IGLC3 P0DOY3
HECTD4 Q9Y4D8
SOWAHA Q2M3V2
SIGLEC16 A6NMB1
TEX15 Q9BXT5
PPP1R15B Q5SWA1
PRSS46P E5RG02
PLEKHA8P1 O95397
SAC3D1 A6NKF1
KIR3DP1 A0A0G2JN01
MRPL45 Q9BRJ2
NEDD8-MDP1 E9PL57
HSPA7 P48741
ARMCX4 Q5H9R4
IGHM P01871
PIPSL A2A3N6
APOBEC3D Q96AK3
POTEKP Q9BYX7
IGKC P01834
CIRBP-AS1 Q8TBR5
ZSCAN12 O43309
APOA4 P06727
IGHG1 P01857
CTSL3P Q5NE16
ZNF724 A8MTY0
PRKY O43930
NPIPB7 O75200
DHRS4L2 Q6PKH6
HLA-H P01893
IGHA2 P01877
PRR5-ARHGAP8 B1AHC3
EP400P1 Q6ZTU2
PATJ Q8NI35
LILRB3 O75022
TRAC P01848
POM121C A8CG34
IGLC2 P0DOY2
LILRA6 Q6PI73
IGHG3 P01860
IGHG2 P01859
RUNDC1 Q96C34
RPS26P11 Q5JNZ5
HSP90AA4P Q58FG1


In [22]:
# Make the piece #2 pieces, like `special_genename_lookup_dict['OFCC1'] = 'Q8IZS5'`
itermed = "\n".join([f"special_genename_lookup_dict[`{d['from']}`] = `{d['to']}`" for d in results_list])
print(itermed.replace("`","'"))

special_genename_lookup_dict['SNRPGP15'] = 'A8MWD9'
special_genename_lookup_dict['IGHA1'] = 'P01876'
special_genename_lookup_dict['NME2P1'] = 'O60361'
special_genename_lookup_dict['DSCR4'] = 'P56555'
special_genename_lookup_dict['PABPC4L'] = 'P0CB38'
special_genename_lookup_dict['IGLC7'] = 'A0M8Q6'
special_genename_lookup_dict['SSX6P'] = 'Q7RTT6'
special_genename_lookup_dict['GPATCH4'] = 'Q5T3I0'
special_genename_lookup_dict['CEP170P1'] = 'Q96L14'
special_genename_lookup_dict['FRMD8P1'] = 'Q9BZ68'
special_genename_lookup_dict['CROCCP2'] = 'Q86T23'
special_genename_lookup_dict['OR4K3'] = 'Q96R72'
special_genename_lookup_dict['FBXL18'] = 'Q96ME1'
special_genename_lookup_dict['HMGB1P1'] = 'B2RPK0'
special_genename_lookup_dict['FBLL1'] = 'A6NHQ2'
special_genename_lookup_dict['WASH3P'] = 'C4AMC7'
special_genename_lookup_dict['ABHD18'] = 'Q0P651'
special_genename_lookup_dict['PZP'] = 'P20742'
special_genename_lookup_dict['IGLC3'] = 'P0DOY3'
special_genename_lookup_dict['HECTD4'] = 'Q9Y4D8'
s

In [16]:
# Make the piece #3 pieces (thought it'd be the easiest and so I did first, but because of number of quotes involved in code and what I need in putput I cannot do it in one list comprehension)
itermed = " ".join([f",`{d['from']}`" for d in results_list])
print(itermed.replace("`","'"))

,'SNRPGP15' ,'IGHA1' ,'NME2P1' ,'DSCR4' ,'PABPC4L' ,'IGLC7' ,'SSX6P' ,'GPATCH4' ,'CEP170P1' ,'FRMD8P1' ,'CROCCP2' ,'OR4K3' ,'FBXL18' ,'HMGB1P1' ,'FBLL1' ,'WASH3P' ,'ABHD18' ,'PZP' ,'IGLC3' ,'HECTD4' ,'SOWAHA' ,'SIGLEC16' ,'TEX15' ,'PPP1R15B' ,'PRSS46P' ,'PLEKHA8P1' ,'SAC3D1' ,'KIR3DP1' ,'MRPL45' ,'NEDD8-MDP1' ,'HSPA7' ,'ARMCX4' ,'IGHM' ,'PIPSL' ,'APOBEC3D' ,'POTEKP' ,'IGKC' ,'CIRBP-AS1' ,'ZSCAN12' ,'APOA4' ,'IGHG1' ,'CTSL3P' ,'ZNF724' ,'PRKY' ,'NPIPB7' ,'DHRS4L2' ,'HLA-H' ,'IGHA2' ,'PRR5-ARHGAP8' ,'EP400P1' ,'PATJ' ,'LILRB3' ,'TRAC' ,'POM121C' ,'IGLC2' ,'LILRA6' ,'IGHG3' ,'IGHG2' ,'RUNDC1' ,'RPS26P11' ,'HSP90AA4P'


In [13]:
# for development; to keep kernel active
import time

def executeSomething():
    #code here
    print ('.')
    time.sleep(480) #60 seconds times 8 minutes

while True:
    executeSomething()

.
.
.


KeyboardInterrupt: 