In [1]:
from rapidfuzz import fuzz
from rapidfuzz import process
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
df = pd.read_parquet("../preds_strainselect.pqt")

In [3]:
df = df.drop(columns=["label_rel","label"])

In [4]:
df = df.rename(columns={"score":"ner_score",})

In [58]:
df["ner"].unique()

array(['ISOLATE', 'MEDIUM', 'PHENOTYPE', 'ORGANISM', 'COMPOUND', 'EFFECT',
       'DISEASE', 'SPECIES'], dtype=object)

In [79]:
words = df[df["ner"]=="ORGANISM"].word_qc.value_counts()
query_words = words[(words > 4)].index
all_words = words.index

In [80]:
cutoff = 95

In [81]:
result = process.cdist(query_words, all_words, scorer=fuzz.token_sort_ratio, score_cutoff=cutoff, workers=10)
indices = np.argwhere(result >= cutoff)

In [82]:
word_indices = list(zip(all_words[indices[:,0]], all_words[indices[:,1]]))
matchesdf = pd.DataFrame(word_indices)

In [83]:
scores = result[indices[:,0], indices[:,1]]
matchesdf['score'] = scores

In [84]:
unique_matches = matchesdf[matchesdf[0] != matchesdf[1]]

In [85]:
word_counts = df.word_qc.value_counts()
unique_matches.loc[:, 'total_count_0'] = unique_matches[0].map(word_counts)
unique_matches.loc[:, 'total_count_1'] = unique_matches[1].map(word_counts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_matches.loc[:, 'total_count_0'] = unique_matches[0].map(word_counts)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_matches.loc[:, 'total_count_1'] = unique_matches[1].map(word_counts)


In [86]:
unique_matches.loc[:, 'consensus_word'] = unique_matches.apply(lambda x: x[0] if x['total_count_0'] > x['total_count_1'] else x[1], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_matches.loc[:, 'consensus_word'] = unique_matches.apply(lambda x: x[0] if x['total_count_0'] > x['total_count_1'] else x[1], axis=1)


In [87]:
unique_matches

Unnamed: 0,0,1,score,total_count_0,total_count_1,consensus_word
8,balb/c mice,balb/cj mice,95.652176,4082,39,balb/c mice
9,balb/c mice,balbc mice,95.238098,4082,10,balb/c mice
10,balb/c mice,balb/ca mice,95.652176,4082,3,balb/c mice
11,balb/c mice,"balb/c, mice",95.652176,4082,2,balb/c mice
12,balb/c mice,bablb/c mice,95.652176,4082,1,balb/c mice
...,...,...,...,...,...,...
13423,severe combined immunodeficient,severe combined immunodeficient (,96.875000,5,1,severe combined immunodeficient
13424,severe combined immunodeficient,severe combined immune-deficient,95.238098,5,1,severe combined immunodeficient
13430,codium fragile,"codium fragile,",96.551727,5,1,codium fragile
13433,t. subterraneum,t. subterraneaum,96.774193,5,1,t. subterraneum


In [88]:
unique_matches.sort_values(by="score", ascending=False).iloc[-30:]

Unnamed: 0,0,1,score,total_count_0,total_count_1,consensus_word
4281,c6/36 mosquito cells,c3/36 mosquito cells,95.0,15,4,c6/36 mosquito cells
4590,conventional chickens,conventional chicks,95.0,14,3,conventional chickens
4544,g. mellonella larva,g. mellonella) larvae,95.0,14,2,g. mellonella larva
6815,rhodotorula glutinis,rhodotorula glutinus,95.0,9,1,rhodotorula glutinis
579,raw264.7 macrophages,raw246.7 macrophages,95.0,137,1,raw264.7 macrophages
13292,phytophthora capsica,phytophthora capsici,95.0,5,45,phytophthora capsici
6518,fusarium moniliforme,fusarium moniloforme,95.0,9,1,fusarium moniliforme
3096,biserrula pelecinus,biserrula pelecinus l,95.0,22,2,biserrula pelecinus
9021,pseudosciaene crocea,pseudosciaena crocea,95.0,7,8,pseudosciaena crocea
9037,myoporum bontioides,myoporum bontioides a,95.0,7,2,myoporum bontioides


In [44]:
# Create a dictionary to group words based on common connections
grouped_words = defaultdict(list)
for _, row in unique_matches.iterrows():
    grouped_words[row['consensus_word']].append(row)

# Create a dictionary to map each consensus word to all connected words
consensus_to_words = defaultdict(set)

# Iterate through each group to check their abundances and select the consensus word
for group_key, group_values in grouped_words.items():
    # Calculate the total count for each word in the group
    total_counts = {word: sum(unique_matches[unique_matches[0] == word]['total_count_0']) + sum(unique_matches[unique_matches[1] == word]['total_count_1']) for word in [row[0] for row in group_values] + [row[1] for row in group_values]}
    # Select the word with the highest total count as the consensus word
    consensus_word = max(total_counts, key=total_counts.get)
    
    # Add all words in the group to the set of the consensus word
    for row in group_values:
        consensus_to_words[consensus_word].update([row[0], row[1]])

# Save the mapping to a file
import json
with open('consensus_to_words_mapping.json', 'w') as f:
    json.dump(
        {k: list(v) for k, v in consensus_to_words.items()},
        f,
        indent=4
    )



In [45]:
# Replace the "word_qc" column in the df dataframe using the consensus_to_words dictionary
df['word_qc_group'] = df['word_qc'].apply(lambda x: next((k for k, v in consensus_to_words.items() if x in v), x))

In [46]:
len(consensus_to_words)

1369

In [47]:
df

Unnamed: 0,formatted_text,text,end_strain,entity_group_strain,score_strain,start_strain,word_strain,end,entity_group,ner_score,...,score_rel,word_strain_qc,word_qc,score_full,vertex,vertex_type,StrainSelectID,score_partial,score_parts,word_qc_group
0,"(@STRAIN$), isolated from a @ISOLATE$ that was...","(A18), isolated from a deep-water sediment sam...",4,B,0.931742,1,a18,49.0,B,0.988911,...,0.999886,a18,deep-water sediment sample,,,,,,,deep-water sediment sample
1,"1), Planothidium sp. (@STRAIN$), which was iso...","1), Planothidium sp. (st. 2), which was isolat...",27,B,0.740087,22,st. 2,66.0,B,0.989779,...,0.999889,st. 2,peatland,,,,,,,peatland
2,A FAW strain (@STRAIN$ 89034‐R) resistant to M...,A FAW strain (MON 89034‐R) resistant to MON 89...,17,B,0.521139,14,mon,140.0,B,0.826722,...,0.999880,mon,field,,,,,,,field
3,A mesophilic @ISOLATE$ culture @STRAIN$-22 (La...,A mesophilic cheese culture CHN-22 (Lactococcu...,31,B,0.828548,28,chn,19.0,B,0.944847,...,0.999896,chn,cheese,,,,,,,cheese
4,A negative control (DNA extracted from @ISOLAT...,A negative control (DNA extracted from water) ...,134,B,0.924880,113,l. interrogans atcc^®,44.0,B,0.832074,...,0.736157,l. interrogans atcc^®,water,,,,,,,water
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11676,The effectiveness of @STRAIN$ was confirmed by...,The effectiveness of S. boulardii CNCM I-745 w...,44,B,0.951697,21,s. boulardii cncm i-745,264.0,B,0.589171,...,0.689224,s. boulardii cncm i-745,acute gastroenteritis,,,,,,,acute gastroenteritis
11677,The lack of protection by the strains of @STRA...,The lack of protection by the strains of L. ac...,42,B,0.571035,41,l,175.0,B,0.663700,...,0.966521,l,nec,,,,,,,nec
11678,The lack of protection by the strains of L. ac...,The lack of protection by the strains of L. ac...,61,B,0.508241,60,l,175.0,B,0.663700,...,0.954678,l,nec,,,,,,,nec
11679,"There have been reports of improved cognition,...","There have been reports of improved cognition,...",208,B,0.717988,205,vsl,140.0,B,0.661192,...,0.992038,vsl,cirrhosis,,,,,,,cirrhosis


after doing the replacement would be nice to redo the clustering again to see if there are some that should be changed manually and do this in the replacement part in `rel_pred.smk`