In [6]:
# COMPUTES IONS MOST SIMILAR TO A GIVEN ION BASED ON THEIR WEIGHT (INTENSITY, TFIDF, LSI) IN EACH REGION

# SET DATASET NAME
ds_name = 'wb xenograft in situ metabolomics test - rms_corrected'

# model param : tfidf or lsi or corpus
model = "corpus"
# fdr threshold <50%, DO NOT CHANGE this parameter although it means nothing!
threshold = 50

import os
from os import path
import pandas as pd
import gensim
from gensim import corpora
import pickle

base_path = '/opt/data/'
ann_path = path.join(base_path, 'pixel-annot-export')
cor_path = path.join(base_path, 'pixel-corpora/pixel_'+model+'.mm')
map_path = path.join(base_path, 'pixel-ind2ord')

ds_df_path = path.join(ann_path, 'ds_df.msgpack')
ds_df = pd.read_msgpack(ds_df_path)
ds_ind = ds_df[ds_df['name'] == ds_name].index.tolist()[0]

pind2ord_path = path.join(map_path, '{}.pkl'.format(ds_ind))
with open(pind2ord_path, 'rb') as f: pind2ord = pickle.load(f)

pixel_df_path = path.join(ann_path, 'pixel_df_list/{}.msgpack'.format(ds_ind))
pixel_df = pd.read_msgpack(pixel_df_path)
pixel_ids = [pind2ord[i] for i in pixel_df.p_ind.unique() if i in pind2ord]

pixel_corpus = gensim.corpora.MmCorpus(cor_path)

print("Number of pixels in dataset = %d" % (len(pixel_ids)))
ion_counter = 0
ion_idx = {}
ion_corpus = []
for pid in pixel_ids:
    for (ion,val) in pixel_corpus[pid]:
        if not ion in ion_idx: 
            ion_idx[ion] = ion_counter
            ion_corpus.append([])
            ion_counter+=1
        ion_corpus[ion_idx[ion]].append((pid,val))
print("Ion corpus generated, size = ", len(ion_corpus))

from gensim import similarities
from gensim.similarities import MatrixSimilarity
from gensim import models

## tfidf, doesn't shown advantages compared to intensities, but slower
#tfidf_model = gensim.models.TfidfModel(ion_corpus)
#tfidf_corpus = tfidf_model[ion_corpus]
#sim_index = gensim.similarities.docsim.MatrixSimilarity(tfidf_corpus)

# intensities
sim_index = gensim.similarities.docsim.MatrixSimilarity(ion_corpus)
print('Ion similarity computed')

Number of pixels in dataset = 25781
Ion corpus generated, size =  222
Ion similarity computed


In [7]:
# SET SUMFORMULA NAME, ADDUCT
sf = 'C6H9NO5'
adduct = '+Na'

ion_df_path = path.join(ann_path, 'ion_df.msgpack')
ion_df = pd.read_msgpack(ion_df_path)
ion_ind = ion_df[ion_df.formula == sf][ion_df.adduct == adduct].index.tolist()[0]

sim_matrix = sim_index[ion_corpus[ion_idx[ion_ind]]]
                    
ion_sim = [(index, row.formula+row.adduct, sim_matrix[ion_idx[index]]) for index, row in ion_df.iterrows() if index in ion_idx]
ion_sim.sort(key=lambda x: x[2],reverse=True)




KeyError: 43736

In [None]:
# NUMBER OF MOST SIMILAR IONS
n = 40
# quantile param
q = 99.9

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

def hotspot_removal(img):
    perc99_v = np.percentile(img, q=99.9)
    img[img > perc99_v] = perc99_v
    return img

counter = 0
csv_ions = []
for (ion_ind, ion_name, value) in ion_sim:
    if counter == n: break
    counter+=1
    print('%s %s: %f' % (ion_df.at[ion_ind,'formula'], ion_df.at[ion_ind,'adduct'], value))
    csv_ions.append('%s,%s,%f' % (ion_df.at[ion_ind,'formula'], ion_df.at[ion_ind,'adduct'], value))
    
    ion_sub_df = pixel_df[pixel_df.ion_ind == ion_ind]
    
    max_x = pixel_df['x'].max()
    max_y = pixel_df['y'].max()
    arr = np.zeros([max_x+1, max_y+1])

    for index, row in ion_sub_df.iterrows():
        arr[row['x'].astype(np.int64)][row['y'].astype(np.int64)] = row['int']
        
    arr = np.rot90(arr, 1)
    arr = hotspot_removal(arr)
    plt.pcolormesh(arr,cmap='viridis')
    plt.axes().set_aspect('equal', 'datalim')
    plt.axes().axis('off')
    plt.show()

In [None]:
for e in csv_ions: print(e)