In [5]:
import os

# ---------------------------------------------------------------------
# RUN CONFIGURATION
# ---------------------------------------------------------------------

# Root folder that every other path of this run is derived from.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
base_dir = '/home/katya/Dropbox/metabolite annotation/170720 all .8 0 .8/'

# Sub-folders of the run.
corpora_dir = os.path.join(base_dir,'corpora')
# NOTE(review): 'mics' looks like a typo of 'misc'; kept unchanged because it
# has to match the folder name that exists on disk -- confirm.
misc_dir = os.path.join(base_dir,'mics')

# Group definitions (inputs) and the molecules-per-group table (output).
groups_file = os.path.join(base_dir,'datasets_groups.csv')
metadata_file = os.path.join(base_dir,'Dnodes.csv')
mol4groups_file = os.path.join(base_dir,'molecules4groups/molecules4metadata.csv')

# Class annotation of the groups. Set classes4groups_dir to None to skip
# the class output entirely.
classes4groups_dir = os.path.join(base_dir,'classes4groups/metadata_groups')
sf2mol_file = os.path.join(base_dir,'Anodes01.csv')
mol2classes_file = '/home/katya/Dropbox/metabolite annotation/ClassesInfo/HMDB_36_classyfire_21_annotations.csv'

# Model name; it selects the corpus file 'corpus_<model>.mm'.
model = "tfidf"

# True: write the average value after every molecule name in the output.
weights = True

# True: build the groups from the metadata columns of metadata_file,
# False: use the manually defined groups in groups_file.
metadata = True

#######################################################################

import csv
import pickle

# Files derived from the configuration above.
corpus_file = os.path.join(corpora_dir,'corpus_'+model+'.mm')
dict_file = os.path.join(misc_dir,'dictionary.pkl')
dsnames_file = os.path.join(misc_dir,'dataset_names.pkl')

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that were produced by a trusted run of this pipeline.
with open(dict_file, 'rb') as f:
    dictionary = pickle.load(f)
with open(dsnames_file, 'rb') as f:
    doc_ids = pickle.load(f)

# READ FILE WITH GROUPS
# groups maps a group name to the list of dataset positions (indices into
# doc_ids) that belong to that group.
groups = {}
group_names = []
if not metadata:
    # Manually defined groups: the header row names the groups (from the
    # second column on); every following row starts with a dataset name and
    # holds one integer flag per group, a value > 0 meaning "is a member".
    # newline='' is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(groups_file, 'r', newline='') as csvfile:
        rowreader = csv.reader(csvfile, delimiter=',')
        group_names = next(rowreader, [])[1:]
        for row in rowreader:
            if not row:
                # blank line in the file, nothing to record
                continue
            for i in range(1, len(row)):
                if int(row[i]) > 0:
                    groups.setdefault(group_names[i-1], []).append(doc_ids.index(row[0]))
else:
    # Metadata-defined groups: every distinct value found in one of the
    # selected metadata columns becomes its own group, named
    # '<column name>::<value>'.
    metadata_column_range = (7,27)  # first and last (inclusive) column used
    first_col, last_col = metadata_column_range
    with open(metadata_file, 'r', newline='') as csvfile:
        rowreader = csv.reader(csvfile, delimiter=',')
        group_names = next(rowreader, [])[first_col:last_col + 1]
        for row in rowreader:
            if not row:
                # blank line in the file; row[0] would not exist
                continue
            doc_id = doc_ids.index(row[0])
            for i, md in enumerate(row[first_col:last_col + 1]):
                g_name = group_names[i] + '::' + md
                groups.setdefault(g_name, []).append(doc_id)

In [6]:
# READ CLASSES DATA
# Builds two lookups that classes4group() is later called with:
#   sf2classes: sum formula -> list of (class name, share of the formula's
#               molecules that carry this class)
#   classes2sf: class name  -> list of sum formulas containing the class
# Nothing is read when classes4groups_dir is None/empty.
if classes4groups_dir:
    if not os.path.exists(classes4groups_dir): os.makedirs(classes4groups_dir)

    # sum formula -> molecule ids, taken from the "comp_ids" column, which
    # holds a comma-separated list inside one CSV field.
    # newline='' is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly.
    sf2mols = {}
    with open(sf2mol_file, 'r', newline='') as csvfile:
        rowreader = csv.reader(csvfile, delimiter=',')
        header = next(rowreader, None)
        if header is not None:
            mol_names_index = header.index("comp_ids")
            for row in rowreader:
                if not row:
                    # blank line in the file
                    continue
                sf2mols[row[0]] = row[mol_names_index].split(',')

    # molecule id -> class names (column 0 is the molecule, column 2 the
    # class name; column 1 is not used here).
    mol2classes = {}
    with open(mol2classes_file, 'r', newline='') as csvfile:
        rowreader = csv.reader(csvfile, delimiter=',')
        next(rowreader, None)  # skip the header row
        for row in rowreader:
            if not row:
                # blank line in the file
                continue
            mol_name = row[0]
            class_name = row[2]
            mol2classes.setdefault(mol_name, []).append(class_name)

    sf2classes = {}
    classes2sf = {}
    for sf_name in sf2mols:
        sf2classes[sf_name] = []
        cl_tmp = {}  # class name -> number of hits among this formula's molecules
        for mol_name in sf2mols[sf_name]:
            if mol_name in mol2classes:
                for class_name in mol2classes[mol_name]:
                    # NOTE(review): a sum formula is appended once per matching
                    # molecule, so the same formula can occur several times in
                    # classes2sf[class_name] and inflate the class size that
                    # classes4group() reports -- confirm this is intended.
                    classes2sf.setdefault(class_name, []).append(sf_name)
                    cl_tmp[class_name] = cl_tmp.get(class_name, 0) + 1
            else:
                print('No classes for molecule: ' + mol_name)
        for c in cl_tmp:
            sf2classes[sf_name].append((c,(cl_tmp[c]/len(sf2mols[sf_name]))))

    # The intermediate lookups are no longer needed; drop their contents.
    sf2mols = {}
    mol2classes = {}
            
def classes4group(sf2c, c2sf, sf_group):
    """Summarise which classes are represented in a weighted group of sum formulas.

    Parameters:
        sf2c: mapping sum formula -> iterable of (class name, share) pairs.
        c2sf: mapping class name -> sized collection of sum formulas; only
            its length is used, as the class size.
        sf_group: iterable of (sum formula, weight) pairs.

    Returns:
        A list with one tuple per class met in the group, in order of first
        appearance:
        (class name, class size, number of group entries carrying the class,
         sum of weight*share over those entries divided by the class size,
         list of the sum formulas of those entries).

    Raises:
        KeyError: if a sum formula is missing from sf2c or a class from c2sf.
    """
    # Collect, per class, every (formula, weight, share) hit of the group.
    hits_by_class = {}
    for formula, weight in sf_group:
        for class_name, share in sf2c[formula]:
            hits_by_class.setdefault(class_name, []).append((formula, weight, share))

    summary = []
    for class_name, hits in hits_by_class.items():
        # Accumulate in hit order with plain additions so the floating-point
        # result does not depend on how a summation helper rounds.
        score = 0
        for _, weight, share in hits:
            score += weight*share

        class_size = len(c2sf[class_name])
        formulas = [formula for formula, _, _ in hits]
        summary.append((class_name, class_size, len(hits), score / class_size, formulas))

    return summary

In [7]:
import numpy
import gensim
from gensim import corpora

corpus = gensim.corpora.MmCorpus(corpus_file)

# For every group: average each molecule's corpus value over the group's
# datasets, write the molecules sorted by that average (highest first) as one
# line of mol4groups_file and, if requested, write a class summary file.
# Both files are opened with `with` so they are flushed and closed even when
# the run is interrupted or fails part-way.
with open(mol4groups_file, 'w') as ofile:
    for group_name in sorted(groups):
        ofile.write('"'+group_name+'"')
        group_members = groups[group_name]

        # molecule name -> values found in the datasets of this group
        molecules2values = {}
        for dataset_id in group_members:
            for (mol_id,value) in corpus[dataset_id]:
                molecules2values.setdefault(dictionary[mol_id], []).append(value)

        molecules2average = []
        for mol_name, values in molecules2values.items():
            # A dataset in which the molecule does not occur counts as 0, so
            # the mean is always taken over every dataset of the group.
            values.extend([0] * (len(group_members) - len(values)))
            molecules2average.append((mol_name,numpy.mean(values)))
        molecules2average.sort(key=lambda x: x[1],reverse=True)

        # OUTPUT MOLECULES4GROUPS
        sf_group = []
        for (m,a) in molecules2average:
            sf_group.append((m,a))
            ofile.write(','+m)
            if weights: ofile.write(','+str(a))
        ofile.write('\n')

        # OUTPUT CLASSES4GROUPS
        if classes4groups_dir:
            c4g_list = classes4group(sf2classes, classes2sf, sf_group)

            # '/' cannot appear in a file name, so it is replaced by '!'.
            gfile_path = os.path.join(classes4groups_dir, group_name.replace('/','!') + '.tsv')
            with open(gfile_path, 'w') as gfile:
                gfile.write('CLASS_NAME\tSIZE\t#FOUND\tKA_MEASURE_NORM\tSUMFORMULAS\n')
                for el in c4g_list:
                    # el = (class, size, #found, measure, [sum formulas])
                    gfile.write('%s\t%d\t%d\t%f' % el[:-1])
                    for sf in el[-1]: gfile.write('\t%s' % sf)
                    gfile.write('\n')

KeyboardInterrupt: 