In [1]:
import os
import pandas as pd
import CQHDimensionalPhenotyper as cqh

# Score the corpus

In [2]:
# Read the corpus metadata and keep only the PMID column as an integer
# array; rows without a PMID are dropped before the integer cast.
pmids = (
    pd.read_csv("../data/metadata.csv", encoding="latin-1", index_col=None)["PMID"]
    .dropna()
    .astype(int)
    .values
)
# Display how many PMIDs were loaded.
pmids.shape

(18155,)

In [3]:
# Score every document in the corpus on each domain.
# `scores` maps each domain name to a list of per-document values, in the
# same order as `pmids`.
domains = ["negative", "positive", "cognitive", "social", "arousal_regulatory"]
scores = {domain: [] for domain in domains}
for pmid in pmids:
    # Use a context manager so each file handle is closed right after the
    # read; the corpus is large enough that leaked handles can add up.
    with open("../../nlp/corpus/{}.txt".format(pmid), "r") as corpus_file:
        raw_text = corpus_file.read()
    # Count terms in the document, then reduce the counts; the reduced
    # result is indexed by domain name below.
    results = cqh.count_document(raw_text)
    results = cqh.reduce_counts(results)
    for domain in domains:
        scores[domain].append(results[domain])

In [4]:
# Build a table of domain scores with one row per PMID, then write it to
# disk with the columns in the order given by `domains`.
scores_path = "data/scores_cqh.csv"
df = pd.DataFrame(data=scores, index=pmids)
df.to_csv(scores_path, columns=domains)

# Format the term lists

In [5]:
# Flatten the per-domain token lists into a two-column table (DOMAIN, TOKEN)
# and save it. `labels` supplies the output name for each entry of `domains`,
# matched by position.
doms, tkns = [], []
labels = ["NEGATIVE_VALENCE", "POSITIVE_VALENCE", "COGNITIVE_SYSTEMS", "SOCIAL_PROCESSES", "AROUSAL_REGULATION"]
# zip() would silently stop at the shorter sequence, so fail loudly if a
# domain has no corresponding label.
if len(labels) < len(domains):
    raise ValueError("fewer labels than domains")
for domain, label in zip(domains, labels):
    dom_tkns = cqh.DOMAIN_TOKEN_MAP[domain]
    tkns.extend(dom_tkns)
    # Repeat the label once per token so the two columns stay aligned.
    doms.extend([label] * len(dom_tkns))
seed_df = pd.DataFrame({"DOMAIN": doms, "TOKEN": tkns})
# index=False is the documented way to omit the row index from the CSV.
seed_df.to_csv("lists/lists_cqh.csv", index=False)