In [1]:
import os

# INPUT / OUTPUT LOCATIONS
# Every path used by this notebook lives under a single run directory;
# change `base_dir` to point the notebook at a different run.
base_dir = '/home/katya/Dropbox/metabolite annotation/170720 all .8 0 .8'

# Input table (one row per dataset) and output file for pairwise similarities.
dnodes_file = os.path.join(base_dir, 'Dnodes.csv')
sim_file = os.path.join(base_dir, 'dataset_similarity.csv')

# Where generated corpora, models, similarity indexes and pickled helper data
# are saved. Set any of these to None to skip saving that artefact.
corpora_dir = os.path.join(base_dir, 'corpora')
model_dir = os.path.join(base_dir, 'models')
sim_dir = os.path.join(base_dir, 'sim_index')
# Directory name spelling ('mics') kept as-is so the output location does not change.
misc_dir = os.path.join(base_dir, 'mics')

# SET THE MODELS TO BE TRAINED
tfidf = True
lsi = True
lda = False

# Number of topics for LSI and LDA
n_topics = 100

# Threshold for molecule values
t = 0.5
# Threshold for molecule frequencies
# NOTE(review): `f` is not referenced by any other cell visible here - confirm it is still needed.
f = 0.8

# `t` converted to the integer scale int((1 - x) * 100) that is also applied to
# the molecule values when the corpus is built.
val_t = int((1 - float(t)) * 100)

# CREATE A CORPUS
# Containers filled while the input table is read.
corpus = []          # one list of (molecule index, value) pairs per document
doc_ids = []         # first column of every data row
molecule_names = []  # taken from the header row
first = True         # True until the header row has been consumed

import gensim
from gensim import corpora
import csv
import os
import pickle

# Create every configured output directory that does not exist yet
# (a directory set to None / empty is skipped).
for out_dir in (corpora_dir, model_dir, sim_dir, misc_dir):
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Read the input table. The header row supplies the molecule names; every
# other row becomes one document. Molecule columns are row[44:-2], i.e. they
# start at column index 44 and the last two columns are excluded.
with open(dnodes_file, 'r') as csvfile:
    rowreader = csv.reader(csvfile, delimiter=',')
    for row in rowreader:
        if first:
            molecule_names = row[44:-2]
            print('Number of sumformulas = ' + str(len(molecule_names)))
            first = False
        else:
            doc_ids.append(row[0])
            # Rescale each raw value x to int((1 - x) * 100); int() truncates.
            molecule_values = [int((1-float(mv))*100) for mv in row[44:-2]]
            # Keep only molecules whose rescaled value exceeds the threshold,
            # stored as sparse (molecule index, value) pairs.
            corpus.append([(idx, value)
                           for idx, value in enumerate(molecule_values)
                           if value > val_t])

print('Number of documents = %d' % len(corpus))

if corpora_dir:
    corpora.MmCorpus.serialize(os.path.join(corpora_dir, 'corpus.mm'), corpus)

# Plain dict mapping molecule index -> molecule name, passed to the models as id2word.
dictionary = dict(enumerate(molecule_names))

if misc_dir:
    # The file handles are deliberately NOT named `f`: that name holds the
    # molecule-frequency threshold and must not be overwritten here.
    with open(os.path.join(misc_dir, 'dataset_names.pkl'), 'wb') as pkl_file:
        pickle.dump(doc_ids, pkl_file)
    with open(os.path.join(misc_dir, 'dictionary.pkl'), 'wb') as pkl_file:
        pickle.dump(dictionary, pkl_file)



Number of sumformulas = 6046
Number of documents = 1421


In [2]:
# RUN MODELING

from gensim import models

def _save_model(model, filename):
    """Save `model` as `filename` inside `model_dir`, if a model directory is configured."""
    if model_dir:
        model.save(os.path.join(model_dir, filename))


def _save_corpus(vectors, filename):
    """Serialize `vectors` as `filename` inside `corpora_dir`, if a corpora directory is configured."""
    if corpora_dir:
        corpora.MmCorpus.serialize(os.path.join(corpora_dir, filename), vectors)


# TF-IDF is trained on the raw corpus. LSI is trained on the TF-IDF corpus,
# so it only runs when `tfidf` is enabled as well.
if tfidf:
    tfidf_model = models.TfidfModel(corpus, id2word=dictionary)
    tfidf_corpus = tfidf_model[corpus]
    _save_model(tfidf_model, 'model.tfidf')
    _save_corpus(tfidf_corpus, 'corpus_tfidf.mm')
    tfidf_model = None  # drop the notebook-level reference once persisted

    if lsi:
        lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=n_topics)
        lsi_corpus = lsi_model[tfidf_corpus]
        _save_model(lsi_model, 'model.lsi')
        _save_corpus(lsi_corpus, 'corpus_lsi.mm')
        lsi_model = None

# LDA is trained directly on the raw corpus, independently of TF-IDF.
if lda:
    lda_model = models.ldamulticore.LdaMulticore(
        corpus,
        num_topics=n_topics,
        id2word=dictionary,
        iterations=3000,
        passes=10,
        workers=4,
    )
    lda_corpus = lda_model[corpus]
    _save_corpus(lda_corpus, 'corpus_lda.mm')
    _save_model(lda_model, 'model.lda')
    lda_model = None

In [3]:
# RUN SIMILARITY MODELING

from gensim.similarities import MatrixSimilarity

# For each enabled model, build a similarity index over that model's corpus
# and query the index with the same corpus to obtain an all-pairs document
# similarity matrix. Each *_sims matrix must be produced by the index built
# in the SAME vector space: querying the TF-IDF index with LSI/LDA vectors
# yields meaningless numbers (and fails outright when `tfidf` is disabled).
if tfidf:
    tfidf_index = MatrixSimilarity(tfidf_corpus)
    tfidf_sims = tfidf_index[tfidf_corpus]
    if sim_dir: tfidf_index.save(os.path.join(sim_dir,'tfidf.index'))

if lsi:
    lsi_index = MatrixSimilarity(lsi_corpus)
    lsi_sims = lsi_index[lsi_corpus]
    if sim_dir: lsi_index.save(os.path.join(sim_dir,'lsi.index'))

if lda:
    lda_index = MatrixSimilarity(lda_corpus)
    lda_sims = lda_index[lda_corpus]
    if sim_dir: lda_index.save(os.path.join(sim_dir,'lda.index'))

In [4]:
# OUTPUT SIMILARITY RESULTS INTO FILE
# One row per unordered document pair (i < j). A model that was not run gets
# '-' in its column. The `with` block guarantees the file is flushed and
# closed even if writing fails part-way through.
with open(sim_file, 'w') as simoutput_file:
    simoutput_file.write('ID1,ID2,tfidf_similarity,lsi_similarity,lda_similarity\n')

    n_docs = len(doc_ids)
    for i in range(n_docs):
        for j in range(i + 1, n_docs):
            fields = [doc_ids[i], doc_ids[j]]
            fields.append(str(tfidf_sims[i][j]) if tfidf else '-')
            # LSI similarities are shifted and halved: (s + 1) / 2 maps [-1, 1] onto [0, 1].
            fields.append(str((lsi_sims[i][j]+1)/2) if lsi else '-')
            fields.append(str(lda_sims[i][j]) if lda else '-')
            simoutput_file.write(','.join(fields) + '\n')