In [6]:
import os
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
import topicwizard

TFIDF = False
PORT = 5151

root = "../scripts/2_post_analyses/"
data_dir = os.path.abspath(os.path.join(root, "..", "..", "data"))
results_dir = os.path.abspath(os.path.join(root, "..", "..", "results", "topicmodeling"))
if not os.path.exists(results_dir):
    os.makedirs(results_dir, exist_ok=True)

df = pd.read_csv(os.path.join(data_dir, "ml_datasets_matrix", "df_only_pk_only_adme_genes.csv"))

print(df.columns)

prot_names_df = pd.read_csv(os.path.join(data_dir, "other", "pgkb_gene_uniprot_mapping.tsv"), sep="\t")

prot2gene = {}
for v in prot_names_df.values:
    prot2gene[v[2]] = v[1]

df.rename(columns=prot2gene, inplace=True)

inchikeys = list(df["Unnamed: 0"])
df = df.drop(columns=["Unnamed: 0"], inplace=False)

# Create documents

# 1-9: weight 1
# 10-49: weight 2
# 50+: weight 3
documents_list = []
cols = list(df.columns)
for v in df.values:
    document = []
    for i, x in enumerate(v):
        if x == 0:
            continue
        elif x < 10:
            n = 1
        elif x < 50:
            n = 2
        else:
            n = 3
        document += [cols[i]] * n
    documents_list += [" ".join(document)]

def create_documents():
    f0 = open("{0}/compound_docs.txt".format(results_dir), "w")
    f1 = open("{0}/inchikeys.txt".format(results_dir), "w")
    for i, g in enumerate(documents_list):
        f0.write(g + "\n")
        f1.write(inchikeys[i] + "\n")
    f0.close()
    f1.close()

create_documents()

# Topic modeling

num_topic_trials = [10, 11, 12, 13, 14, 15, 16]

def tokenizer(x):
    return x.split(" ")

if not TFIDF:
    vectorizer = CountVectorizer(lowercase=False, tokenizer=tokenizer)
else:
    vectorizer = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)

def get_texts():
    with open("{0}/compound_docs.txt".format(results_dir), "r") as f:
        texts = [x.rstrip() for x in f.readlines()]
    return texts

def get_doc_names():
    with open("{0}/inchikeys.txt".format(results_dir), "r") as f:
        doc_names = [x.rstrip() for x in f.readlines()]
    return doc_names

texts = get_texts()
doc_names = get_doc_names()
 
def fit_topic_pipeline(num_topics):
    mdl = NMF(n_components=num_topics, max_iter=1000)
    topic_pipeline = Pipeline(
       [
          ("vec", vectorizer),
          ("mdl", mdl),
       ]
    )
    texts = get_texts()
    doc_names = get_doc_names()
    topic_pipeline.fit(texts)
    return topic_pipeline

NUM_TOPICS = 10
topic_pipeline = fit_topic_pipeline(num_topics=NUM_TOPICS)
vec = topic_pipeline["vec"]
mdl = topic_pipeline["mdl"]
    
data_for_wizard = {"corpus": texts, "pipeline": topic_pipeline, "document_names": doc_names}

joblib.dump(data_for_wizard, os.path.join(results_dir, "data_for_wizard.joblib"))

data_for_wizard = joblib.load(os.path.join(results_dir, "data_for_wizard.joblib"))

Index(['Unnamed: 0', 'O00337', 'O15244', 'O15245', 'O15438', 'O15439',
       'O15440', 'O15528', 'O60656', 'O60706', 'O75469', 'O75751', 'O75828',
       'O94956', 'O95342', 'O95477', 'P02795', 'P04798', 'P05091', 'P05177',
       'P05181', 'P07099', 'P08183', 'P08263', 'P08684', 'P09211', 'P09488',
       'P0DMM9', 'P10632', 'P10635', 'P11245', 'P11509', 'P11712', 'P16152',
       'P16435', 'P16662', 'P18440', 'P19224', 'P20813', 'P20815', 'P21266',
       'P22309', 'P22310', 'P24462', 'P28845', 'P30711', 'P31513', 'P32320',
       'P33260', 'P33261', 'P33527', 'P35503', 'P35504', 'P35869', 'P36537',
       'P41235', 'P41440', 'P46721', 'P47989', 'P50225', 'P50226', 'P51580',
       'P54855', 'P78329', 'Q01740', 'Q06278', 'Q07973', 'Q12882', 'Q14973',
       'Q14994', 'Q4U2R8', 'Q5T3U5', 'Q86VW1', 'Q8TCC7', 'Q92887', 'Q9HAS3',
       'Q9HAW7', 'Q9HAW8', 'Q9HAW9', 'Q9HB55', 'Q9NPD5', 'Q9UIG8', 'Q9UNQ0',
       'Q9Y6L6'],
      dtype='object')



The parameter 'token_pattern' will not be used since 'tokenizer' is not None'

