In [23]:
import sys, os, time
import cPickle as pickle
from glob import glob

In [2]:
from sqlalchemy import create_engine, MetaData, select

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Database connection used by every query in this notebook.
# The connection URL (which embeds the user name and password) can be supplied
# through the WOS_DB_URL environment variable so that credentials do not have
# to be edited into the notebook; the literal below is the original default.
db_url = os.environ.get('WOS_DB_URL', 'mysql+mysqldb://jason:password@localhost/wos')
engine = create_engine(db_url)
# Reflect the existing schema so tables can be looked up by name in metadata.tables.
metadata = MetaData(engine)
metadata.reflect()

In [5]:
def parse_tree_one_line(line):
    """Extract ``(paper_id, cl)`` from one data line of a tree file.

    The line is stripped and split on single spaces.  The first field is
    returned as ``cl``; the third field, with surrounding double quotes
    removed, is returned as ``paper_id``.

    Raises IndexError when the line has fewer than three fields.
    """
    fields = line.strip().split(' ')
    cluster_path, quoted_id = fields[0], fields[2]
    return quoted_id.strip('"'), cluster_path

# Load one tree file into a DataFrame with one row per parsed line.
rows = []
with open('data/relaxmap_cluster_treefiles/wos_2_cluster_4654150.tree', 'r') as f:
    for line in f:
        # Skip '#' lines and blank lines; a blank line would otherwise make
        # parse_tree_one_line fail with an IndexError.
        if line.startswith('#') or not line.strip():
            continue
        rows.append(parse_tree_one_line(line))
df = pd.DataFrame(rows, columns=['UID', 'cl'])
# cl_bottom: the colon-separated 'cl' value with its last component removed.
df['cl_bottom'] = df['cl'].apply(lambda x: ':'.join(x.split(':')[:-1]))

# cl_top: the first component of 'cl'.
df['cl_top'] = df['cl'].apply(lambda x: x.split(':')[0])

In [6]:
# Exploratory query: fetch the pubInfo_minimal rows for one bottom-level cluster.
cl = '1:1:1'
subset = df[df.cl_bottom==cl]

# First, look up a single UID (the first row of the subset).
# NOTE(review): the df_result produced by this single-UID query is overwritten
# by the multi-UID query below, so only the second result is kept.
t_uid = subset.iloc[0]['UID']


tbl = metadata.tables['pubInfo_minimal']
sq = tbl.select(tbl.c.UID==t_uid)
df_result = pd.read_sql(sq, engine)

# Then query all UIDs of the cluster at once with an IN clause.
uids = subset['UID'].tolist()
tbl = metadata.tables['pubInfo_minimal']
sq = tbl.select(tbl.c.UID.in_(uids))
df_result = pd.read_sql(sq, engine)

In [7]:
# Report, for every column of the query result, how many rows are non-null.
for colname in df_result.columns:
    notnull = df_result[colname].count()
    print("{}: {}".format(colname, notnull))

UID: 996
abstract_text: 400
heading: 996
keyword: 144
pubyear: 996
subheading: 870
subject_extended: 996
subject_traditional: 996
title_item: 996
title_source: 996


In [8]:
# Distribution of the 'heading' column in the current query result.
df_result['heading'].value_counts()

Science & Technology                     994
Science & Technology&&Social Sciences      2
Name: heading, dtype: int64

In [9]:
# Print every bottom-level cluster together with its row count, in the order
# value_counts() returns them.
# zip(index, series) yields the same (label, count) pairs as Series.iteritems(),
# which was removed in pandas 2.0.  Distinct loop names are used so the
# notebook-level `cl` chosen in an earlier cell is not overwritten.
bottom_counts = df.cl_bottom.value_counts()
for cluster, count in zip(bottom_counts.index, bottom_counts):
    print(cluster, count)

('1:1:1', 1976)
('2:1:2', 1406)
('2:1:1', 1339)
('1:1:2', 1229)
('6:1:1', 1172)
('3:1:3', 1100)
('2:1:3', 1094)
('3:1:1', 995)
('5:1', 962)
('1:1:5', 952)
('2:2:2', 916)
('2:1:5', 896)
('1:1:7', 866)
('1:1:3', 818)
('2:1:4', 804)
('1:2:1', 771)
('5:4', 768)
('1:1:8', 758)
('3:1:2', 726)
('6:1:2', 691)
('2:2:6', 688)
('1:1:17', 669)
('2:1:6', 661)
('2:3:1', 653)
('1:1:6', 650)
('2:1:12', 640)
('5:3', 627)
('2:2:4', 608)
('1:1:12', 608)
('1:3:1', 585)
('6:1:3', 570)
('3:1:4', 569)
('2:1:13', 557)
('3:1:5', 537)
('6:1:5', 535)
('1:1:13', 533)
('5:8', 533)
('1:1:14', 523)
('3:1:6', 512)
('2:3:2', 506)
('4:1:3', 502)
('5:6', 497)
('2:2:7', 488)
('1:1:18', 482)
('2:2:3', 469)
('1:4:2', 466)
('1:1:9', 455)
('1:1:15', 453)
('2:1:15', 446)
('1:1:19', 433)
('2:2:5', 431)
('2:3:3', 431)
('4:1:6', 426)
('2:1:8', 420)
('2:3:7', 420)
('1:2:3', 410)
('1:1:21', 410)
('6:1:4', 403)
('2:1:10', 403)
('3:1:10', 402)
('3:1:8', 398)
('1:2:2', 397)
('1:3:3', 396)
('2:3:4', 394)
('2:1:7', 392)
('5:7', 389)
('

In [10]:
def get_query_result(df_tree, cl, engine, metadata):
    """Fetch the ``pubInfo_minimal`` rows for every UID in one cluster.

    Parameters
    ----------
    df_tree : DataFrame with at least the columns 'UID' and 'cl_bottom'.
    cl : value of 'cl_bottom' selecting the cluster.
    engine : connectable passed to ``pd.read_sql``.
    metadata : object whose ``tables`` mapping contains 'pubInfo_minimal'.

    Returns
    -------
    DataFrame produced by ``pd.read_sql`` for a ``UID IN (...)`` select.
    """
    in_cluster = df_tree['cl_bottom'] == cl
    cluster_uids = df_tree.loc[in_cluster, 'UID'].tolist()

    pub_table = metadata.tables['pubInfo_minimal']
    query = pub_table.select(pub_table.c.UID.in_(cluster_uids))
    return pd.read_sql(query, engine)

In [11]:
def interpret_query_result(df_result):
    """Summarise a query result as a dict of counts and value distributions.

    The returned dict holds:
      * 'counts': mapping of every column name to its number of non-null rows;
      * '<column>_value_counts' for the columns heading, pubyear, subheading,
        subject_extended and title_source: the ``value_counts()`` Series of
        that column.

    Raises KeyError if one of those five columns is absent from ``df_result``.
    """
    summary = {
        'counts': {
            colname: df_result[colname].notnull().sum()
            for colname in df_result.columns.tolist()
        }
    }
    value_count_fields = (
        ('heading_value_counts', 'heading'),
        ('pubyear_value_counts', 'pubyear'),
        ('subheading_value_counts', 'subheading'),
        ('subject_extended_value_counts', 'subject_extended'),
        ('title_source_value_counts', 'title_source'),
    )
    for key, column in value_count_fields:
        summary[key] = df_result[column].value_counts()
    return summary

In [12]:
def parse_tree_one_line(line):
    """Return ``(paper_id, cl)`` parsed from a single tree-file data line.

    After stripping, the line is split on single spaces: field 0 becomes
    ``cl`` and field 2, stripped of double quotes, becomes ``paper_id``.
    An IndexError is raised if fewer than three fields are present.
    """
    tokens = line.strip().split(' ')
    return tokens[2].strip('"'), tokens[0]

def parse_tree(fname):
    """Read a tree file into a DataFrame with one row per data line.

    Lines starting with '#' and blank (whitespace-only) lines are skipped;
    every other line is parsed with ``parse_tree_one_line``.

    Parameters
    ----------
    fname : path of the tree file to read.

    Returns
    -------
    DataFrame with the columns
      * 'UID' and 'cl' as returned by ``parse_tree_one_line``;
      * 'cl_bottom': 'cl' with its last colon-separated component removed;
      * 'cl_top': the first colon-separated component of 'cl'.
    """
    rows = []
    with open(fname, 'r') as f:
        for line in f:
            # A blank line has no fields and would make parse_tree_one_line
            # raise IndexError, so it is skipped along with '#' lines.
            if line.startswith('#') or not line.strip():
                continue
            rows.append(parse_tree_one_line(line))

    df = pd.DataFrame(rows, columns=['UID', 'cl'])

    df['cl_bottom'] = df['cl'].apply(lambda x: ':'.join(x.split(':')[:-1]))

    df['cl_top'] = df['cl'].apply(lambda x: x.split(':')[0])

    return df

In [13]:
def get_results_for_multiple_clusters(df_tree, n=10, engine=engine, metadata=metadata):
    """Collect query summaries for the biggest ``n`` clusters of one tree file.

    Clusters are the distinct values of ``df_tree['cl_bottom']``, visited in
    the order returned by ``value_counts()``.

    Parameters
    ----------
    df_tree : DataFrame with 'UID' and 'cl_bottom' columns.
    n : number of clusters to process.  The loop stops once the n-th cluster
        has been handled; a value that never equals a 1-based position
        (e.g. 0 or None) therefore processes every cluster.
    engine, metadata : passed through to ``get_query_result``.  The defaults
        are the module-level objects as they were when this function was
        defined.

    Returns
    -------
    dict mapping each cluster label to a dict containing 'total_count' plus
    every entry produced by ``interpret_query_result``.
    """
    results = dict()
    cluster_sizes = df_tree['cl_bottom'].value_counts()
    # zip(index, series) yields the same (label, count) pairs as
    # Series.iteritems(), which was removed in pandas 2.0.
    for rank, (cl, cnt) in enumerate(zip(cluster_sizes.index, cluster_sizes), 1):
        entry = {'total_count': cnt}
        df_result = get_query_result(df_tree, cl, engine, metadata)
        entry.update(interpret_query_result(df_result))
        results[cl] = entry
        if rank == n:
            break
    return results

In [14]:
def get_fname_from_number(number):
    """Return the path of the single tree file matching ``number``.

    Looks for ``data/relaxmap_cluster_treefiles/wos_<number>_cluster*.tree``.

    Raises
    ------
    RuntimeError
        If no file matches, or if more than one file matches (the message
        distinguishes the two cases).
    """
    g = glob('data/relaxmap_cluster_treefiles/wos_{}_cluster*.tree'.format(number))
    if len(g) == 1:
        return g[0]
    if not g:
        raise RuntimeError("number: {} -- file not found".format(number))
    # More than one candidate: report the ambiguity instead of claiming that
    # nothing was found.
    raise RuntimeError(
        "number: {} -- multiple files found: {}".format(number, ", ".join(sorted(g)))
    )

In [15]:

# Collect, for every available tree file, the summaries of its 10 biggest
# clusters, keyed by the tree file's base name.
d = dict()
# NOTE(review): range(2, 70) covers 2..69; confirm whether 70 should be included.
for i in range(2, 70):
    # Only the file lookup is guarded: a RuntimeError raised later while
    # parsing or querying should surface instead of being reported as a
    # missing tree file.
    try:
        fname = get_fname_from_number(i)
    except RuntimeError:
        print("relaxmap cluster {} tree file not found. skipping".format(i))
        continue
    df_tree = parse_tree(fname)
    fname_tail = os.path.split(fname)[1]
    d[fname_tail] = get_results_for_multiple_clusters(df_tree, n=10)
    print("collected data for relaxmap cluster {}".format(i))

collected data for relaxmap cluster 2
collected data for relaxmap cluster 3
collected data for relaxmap cluster 4
collected data for relaxmap cluster 5
collected data for relaxmap cluster 6
collected data for relaxmap cluster 7
relaxmap cluster 8 tree file not found. skipping
collected data for relaxmap cluster 9
collected data for relaxmap cluster 10
collected data for relaxmap cluster 11
collected data for relaxmap cluster 12
collected data for relaxmap cluster 13
collected data for relaxmap cluster 14
collected data for relaxmap cluster 15
collected data for relaxmap cluster 16
collected data for relaxmap cluster 17
collected data for relaxmap cluster 18
collected data for relaxmap cluster 19
collected data for relaxmap cluster 20
collected data for relaxmap cluster 21
collected data for relaxmap cluster 22
collected data for relaxmap cluster 23
collected data for relaxmap cluster 24
collected data for relaxmap cluster 25
collected data for relaxmap cluster 26
collected data for rel

In [16]:
def print_results1(relaxmap_cluster_data, colname):
    """Print the most frequent ``colname`` value of every cluster on one line.

    Parameters
    ----------
    relaxmap_cluster_data : dict mapping cluster label to a dict that holds a
        '<colname>_value_counts' Series.
    colname : column whose value counts are inspected.

    The first index label of each Series is taken as the top item, and the
    items are printed joined by " | ".  An empty Series raises IndexError.
    """
    fieldname = "{}_value_counts".format(colname)
    # .values() works on both Python 2 and 3 (dict.iteritems() is Python 2
    # only); the cluster label itself is not needed here.
    vals = [d[fieldname].index[0] for d in relaxmap_cluster_data.values()]
    print(" | ".join(vals))

In [17]:
def results2_get_topitem(s, threshold=.1):
    """Return the top label of ``s``, or the top two when they are close.

    Parameters
    ----------
    s : Series of counts whose first entries are the largest (callers in this
        notebook pass ``value_counts()`` output).
    threshold : relative difference ``(first - second) / first`` below which
        both labels are returned.  Defaults to 0.1 (10%).

    Returns
    -------
    ``"<first>/<second>"`` when the top two counts are within ``threshold``
    of each other, otherwise the first index label.  A Series with a single
    entry returns that entry's label; an empty Series raises IndexError.
    """
    if len(s) < 2:
        # Nothing to compare against; unpacking two values would fail here.
        return s.index[0]
    a, b = s.values[:2]
    err = float(a - b) / a
    if err < threshold:
        return "{}/{}".format(s.index[0], s.index[1])
    return s.index[0]
    
    

def print_results2(relaxmap_cluster_data, colname):
    """Print each cluster's top ``colname`` value(s), joined by " | ".

    Same layout as ``print_results1``, but the top item of every cluster is
    chosen by ``results2_get_topitem``, which may return two labels joined
    by "/".

    Parameters
    ----------
    relaxmap_cluster_data : dict mapping cluster label to a dict that holds a
        '<colname>_value_counts' Series.
    colname : column whose value counts are inspected.
    """
    fieldname = "{}_value_counts".format(colname)
    # .values() works on both Python 2 and 3 (dict.iteritems() is Python 2
    # only); the cluster label itself is not needed here.
    vals = [
        results2_get_topitem(d[fieldname])
        for d in relaxmap_cluster_data.values()
    ]
    print(" | ".join(vals))

In [18]:

# For every tree file (sorted by name), list the top subject_extended value
# of each collected cluster.
for k in sorted(d):
    v = d[k]
    print(k)
    print_results1(v, 'subject_extended')
    print("")

wos_10_cluster_808980.tree
Electrochemistry | Engineering | Engineering | Chemistry | Chemistry | Surgery | Transplantation | Surgery | Neurosciences & Neurology | Chemistry

wos_11_cluster_8808000.tree
Radiology, Nuclear Medicine & Medical Imaging | Cardiovascular System & Cardiology | Cardiovascular System & Cardiology | Radiology, Nuclear Medicine & Medical Imaging | Psychiatry | Psychiatry | Psychology | Psychology | Psychology | Polymer Science

wos_12_cluster_1889140.tree
Virology | Pharmacology & Pharmacy | Cell Biology | Cell Biology | Virology | Virology | Virology | Virology | Virology | Immunology

wos_13_cluster_7542179.tree
Chemistry | Chemistry | Physics | Chemistry | Chemistry | Engineering | Environmental Sciences & Ecology | Engineering | Chemistry | Chemistry

wos_14_cluster_2748381.tree
Neurosciences & Neurology | Chemistry | Neurosciences & Neurology | Neurosciences & Neurology | Neurosciences & Neurology | Neurosciences & Neurology | Neurosciences & Neurology | Neu

In [19]:
# For every tree file (sorted by name), list the top title_source value of
# each collected cluster.
for k in sorted(d):
    v = d[k]
    print(k)
    print_results1(v, 'title_source')
    print("")

wos_10_cluster_808980.tree
ELECTROCHIMICA ACTA | JOURNAL OF CHEMICAL AND ENGINEERING DATA | INDUSTRIAL & ENGINEERING CHEMISTRY RESEARCH | JOURNAL OF PHYSICAL CHEMISTRY B | CHEMICAL COMMUNICATIONS | AMERICAN JOURNAL OF SPORTS MEDICINE | STEM CELLS AND DEVELOPMENT | CYTOTHERAPY | BRAIN RESEARCH | JOURNAL OF POWER SOURCES

wos_11_cluster_8808000.tree
AMERICAN JOURNAL OF NEURORADIOLOGY | STROKE | STROKE | STROKE | BRITISH JOURNAL OF PSYCHIATRY | JOURNAL OF THE AMERICAN ACADEMY OF CHILD AND ADOLESCENT PSYCHIATRY | SOCIAL DEVELOPMENT | CHILD DEVELOPMENT | JOURNAL OF SCHOOL PSYCHOLOGY | JOURNAL OF POWER SOURCES

wos_12_cluster_1889140.tree
JOURNAL OF VIROLOGY | BIOORGANIC & MEDICINAL CHEMISTRY LETTERS | JOURNAL OF BIOLOGICAL CHEMISTRY | MOLECULAR AND CELLULAR BIOLOGY | JOURNAL OF VIROLOGY | JOURNAL OF VIROLOGY | JOURNAL OF VIROLOGY | AIDS RESEARCH AND HUMAN RETROVIRUSES | JOURNAL OF VIROLOGY | JOURNAL OF IMMUNOLOGY

wos_13_cluster_7542179.tree
CHEMICAL COMMUNICATIONS | JOURNAL OF PHOTOCHEMIST

In [20]:
# For every tree file (sorted by name), list the top subject_extended value
# of each collected cluster, showing two values when they are nearly tied.
for k in sorted(d):
    v = d[k]
    print(k)
    print_results2(v, 'subject_extended')
    print("")

wos_10_cluster_808980.tree
Electrochemistry | Engineering | Engineering | Chemistry | Chemistry | Surgery | Transplantation | Surgery/Cell Biology | Neurosciences & Neurology | Chemistry/Electrochemistry

wos_11_cluster_8808000.tree
Radiology, Nuclear Medicine & Medical Imaging | Cardiovascular System & Cardiology | Cardiovascular System & Cardiology | Radiology, Nuclear Medicine & Medical Imaging | Psychiatry | Psychiatry | Psychology | Psychology | Psychology | Polymer Science/Energy & Fuels

wos_12_cluster_1889140.tree
Virology | Pharmacology & Pharmacy | Cell Biology | Cell Biology | Virology | Virology | Virology | Virology | Virology | Immunology

wos_13_cluster_7542179.tree
Chemistry | Chemistry | Physics | Chemistry | Chemistry | Engineering | Environmental Sciences & Ecology/Chemistry | Engineering/Chemistry | Chemistry | Chemistry

wos_14_cluster_2748381.tree
Neurosciences & Neurology | Chemistry | Neurosciences & Neurology | Neurosciences & Neurology | Neurosciences & Neurol

In [24]:
# Persist the collected results dict `d` to disk as a pickle.
# NOTE(review): the file name says "2-70", while the loop that fills `d`
# iterates range(2, 70), i.e. 2..69 — confirm the intended range.
with open('data/relaxmapclusters_vc_2-70.pickle', 'wb') as outf:
    pickle.dump(d, outf)