In [1]:
import pandas as pd
import torch
from transformers import CLIPUserModel
import pickle as pkl
from transformers import CLIPTokenizerFast
import os
from tqdm import tqdm
import numpy as np
import seaborn as sns

In [3]:
class RankingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings1, user_id, doc_id):
        self.encodings1 = encodings1
        self.user_id = user_id
        self.doc_id = doc_id

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings1.items()}
        item['user_id'] = torch.tensor(self.user_id[idx])
        item['doc_id'] = torch.tensor(self.doc_id[idx])
        return item

    def __len__(self):
        return len(self.user_id)
    
    
def getRankedPids(model, val_dataset):
    model.cuda()
    model.eval()
    dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=256)

    logits, pids = list(), list()
    with torch.no_grad():
        for b in tqdm(dataloader):
            b = {k:v.cuda() for k, v in b.items()}
            output = model(**b)

            logits.append(output['logits_per_image'].cpu().numpy()[0,:])
            pids.append(b['doc_id'].cpu().numpy())
            
            
    logits = list(np.concatenate(logits))
    pids = list(np.concatenate(pids))
    _, sorted_pids = zip(*sorted(zip(logits, pids),reverse=True))
    
    return sorted_pids 

def getTopNArticles(df, sorted_pids,user_id, n=100):
    userPids = set(df[df.user_id == user_id].PID)
    sample_pids = list()
    i = 0
    for l in sorted_pids:
        if l not in userPids:
            sample_pids.append(l)
            i += 1
        if i == n:
            break
            
    X = df[df.PID.isin(sample_pids)]
    X = X.set_index("PID").loc[sample_pids]
    return X

In [4]:
#Load train/val data
train_dataset = pkl.load(open("../book_data/rankingDataset/train_dataset.pkl", "rb"))
test_dataset = pkl.load(open("../book_data/rankingDataset/test_dataset.pkl", "rb"))

In [5]:

model = CLIPUserModel.from_pretrained("openai/clip-vit-base-patch32")

#model.load_state_dict(torch.load("../results/checkpoint-9500/pytorch_model.bin"))
model.load_state_dict(torch.load("../results/checkpoint-22000/pytorch_model.bin"))




Some weights of CLIPUserModel were not initialized from the model checkpoint at openai/clip-vit-base-patch32 and are newly initialized: ['user_embedder.weight', 'user_projection.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [6]:
#Play around

In [7]:
#Load DFs
df = pd.read_csv("../clipData/whole.csv")
train = pd.read_csv("../clipData/train.csv")
test = pd.read_csv("../clipData/test.csv")

#Load Mappers
uid2i = pkl.load(open("../clipData/uid2i.pkl", "rb"))
i2uid = pkl.load(open("../clipData/i2uid.pkl", "rb"))

uid2name = pkl.load(open("../clipData/uid2name.pkl", "rb"))
name2uid = pkl.load(open("../clipData/name2uid.pkl", "rb"))

#Remove duplicates
df_no_dup = df.drop_duplicates(subset ="PID",
                     keep = False, inplace = False)

#Fix data
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
pids, encodings = list(df_no_dup.PID), tokenizer(list(df_no_dup.Abstract), truncation=True, padding=True)

In [8]:
#Find userid from name

In [9]:
df[df.name.str.contains("Käll, L")].head(2)

Unnamed: 0.1,Unnamed: 0,user_id,name,AllName,NameList,UidList,Title,Name,Journal,Year,...,Keywords,Categories,ResearchSubjects,Abstract,FullTextLink,PublicationType,ContentType,PID,JSON,i
2709,3042,u1gqsept,"Käll, Lukas","['McIlwain, Sean', 'Tamura, Kaipo', 'Kertesz-F...","['Käll, Lukas']",['u1gqsept'],Crux : Rapid Open Source Protein Tandem Mass S...,"McIlwain, Sean;Tamura, Kaipo;Kertesz-Farkas, A...",Journal of Proteome Research,2014.0,...,"['Peptide Identification', 'Proteomics Data', ...",['Bioinformatik (beräkningsbiologi) (10203)'],,Efficiently and accurately analyzing big prote...,,Artikel i tidskrift,Refereegranskat,761918,"{""AllName"":[""McIlwain, Sean"",""Tamura, Kaipo"",""...",1569
2739,3074,u1gqsept,"Käll, Lukas","['Ashwood, C.', 'Bittremieux, W.', 'Deutsch, E...","['Käll, Lukas']",['u1gqsept'],Proceedings of the EuBIC-MS 2020 Developers’ M...,"Ashwood, C.;Bittremieux, W.;Deutsch, E. W.;Don...",Etudes Lawrenciennes,2020.0,...,"['Benchmark development', 'Bioinformatics', 'C...","['Analytisk kemi (10401)', 'Bioinformatik och ...",,The 2020 European Bioinformatics Community for...,,Artikel i tidskrift,Refereegranskat,1591617,"{""AllName"":[""Ashwood, C."",""Bittremieux, W."",""D...",1569


In [10]:
#let's use Johan Boye
user_id = "u1gqsept"
uids = [uid2i[user_id]] * len(pids)

In [11]:
#Create dataset
val_dataset =  RankingDataset(encodings, uids, pids)

In [12]:
sorted_pids = getRankedPids(model, val_dataset)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 136/136 [00:13<00:00, 10.13it/s]


In [13]:
X = getTopNArticles(df_no_dup, sorted_pids, user_id, 50)

In [14]:

for i, row in X.iterrows():
    print(row.Title, "\n")
    print(row.AllName.rstrip("[").lstrip("']"),"\n")
    print(row.Abstract, "\n")
    print("------- \n")

Statistical and machine learning methods to analyze large-scale mass spectrometry data 

['The, Matthew'] 

Modern biology is faced with vast amounts of data that contain valuable information yet to be extracted. Proteomics, the study of proteins, has repositories with thousands of mass spectrometry experiments. These data gold mines could further our knowledge of proteins as the main actors in cell processes and signaling. Here, we explore methods to extract more information from this data using statistical and machine learning methods.First, we present advances for studies that aggregate hundreds of runs. We introduce MaRaCluster, which clusters mass spectra for large-scale datasets using statistical methods to assess similarity of spectra. It identified up to 40% more peptides than the state-of-the-art method, MS-Cluster. Further, we accommodated large-scale data analysis in Percolator, a popular post-processing tool for mass spectrometry data. This reduced the runtime for a draft h

In [17]:
model = model.cuda()
model.eval()
dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=256)
acc_list = list()
with torch.no_grad():
    for b in tqdm(dataloader):
        b = {k:v.cuda() for k, v in b.items()}
        output = model(**b)
        acc = sum([p==v for p,v in zip(torch.argmax(output['logits_per_image'],0), np.arange(256))]) / 256
        acc_list.append(acc.cpu().numpy()[None])
        

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 96/96 [00:09<00:00, 10.34it/s]
