In [1]:
import sys, os, time
from glob import glob

In [2]:
from sqlalchemy import create_engine, MetaData, select

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Connection URL for the `wos` MySQL database. Set the WOS_DB_URL environment
# variable to supply a different URL (and keep real credentials out of the
# notebook); the fallback is the original local development URL.
DB_URL = os.environ.get('WOS_DB_URL', 'mysql+mysqldb://jason:password@localhost/wos')
engine = create_engine(DB_URL)
# Bind the metadata to the engine and load the table definitions from the
# database so they are available via `metadata.tables`.
metadata = MetaData(engine)
metadata.reflect()

In [15]:
def get_query_result(uids, engine, metadata):
    """Select the rows of the 'pubInfo_minimal' table whose UID is in `uids`.

    uids: collection of UID values used for the SQL IN filter.
    engine: database engine/connection handed to `pd.read_sql`.
    metadata: object whose `tables` mapping contains 'pubInfo_minimal'.

    Returns whatever `pd.read_sql` returns for the select statement.
    """
    pub_info = metadata.tables['pubInfo_minimal']
    uid_filter = pub_info.c.UID.in_(uids)
    statement = pub_info.select(uid_filter)
    return pd.read_sql(statement, engine)

In [7]:
def interpret_query_result(df_result):
    """Summarise a query result DataFrame.

    Returns a dict with:
      - 'counts': for every column, the number of non-null values;
      - '<col>_value_counts' for each of the columns heading, pubyear,
        subheading, subject_extended and title_source: the result of
        `value_counts()` on that column.

    Raises KeyError if one of those five columns is missing.
    """
    summary = {
        'counts': {
            column: df_result[column].notnull().sum()
            for column in df_result.columns.tolist()
        }
    }
    counted_columns = (
        'heading',
        'pubyear',
        'subheading',
        'subject_extended',
        'title_source',
    )
    for column in counted_columns:
        summary[column + '_value_counts'] = df_result[column].value_counts()
    return summary

In [8]:
def parse_tree_one_line(line):
    """Extract (paper_id, cluster) from one space-separated tree-file line.

    The cluster is the first field; the paper id is the third field with any
    surrounding double quotes removed. Raises IndexError when the line has
    fewer than three fields.
    """
    fields = line.strip().split(' ')
    cluster, quoted_id = fields[0], fields[2]
    return quoted_id.strip('"'), cluster

def parse_tree(fname):
    """Read a cluster tree file into a DataFrame with one row per data line.

    Lines starting with '#' are treated as comments and skipped, as are blank
    or whitespace-only lines. Every other line is parsed with
    `parse_tree_one_line`.

    Returns a DataFrame with columns:
      - 'UID': the paper id from the line;
      - 'cl': the full ':'-separated cluster path;
      - 'cl_bottom': `cl` with its last ':'-separated component removed
        (empty string when `cl` contains no ':');
      - 'cl_top': the first ':'-separated component of `cl`.
    """
    rows = []
    with open(fname, 'r') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) have no
            # fields and would make the line parser raise IndexError.
            if line.startswith('#') or not line.strip():
                continue
            rows.append(parse_tree_one_line(line))

    df = pd.DataFrame(rows, columns=['UID', 'cl'])

    # Drop the final path component: '1:2:3' -> '1:2'.
    df['cl_bottom'] = df['cl'].apply(lambda x: ':'.join(x.split(':')[:-1]))

    # Keep only the first path component: '1:2:3' -> '1'.
    df['cl_top'] = df['cl'].apply(lambda x: x.split(':')[0])

    return df

In [16]:
def get_results_for_multiple_clusters(df_tree, n=10, engine=engine, metadata=metadata):
    """Query and summarise the `n` largest bottom-level clusters of one tree.

    df_tree: DataFrame with 'UID' and 'cl_bottom' columns.
    n: number of largest clusters (by row count of 'cl_bottom') to process;
       0 processes none and None processes every cluster.
    engine, metadata: passed through to `get_query_result`. The defaults are
       the module-level objects as they exist when this function is defined.

    Returns a dict keyed by cluster id. Each value is a dict holding
    'total_count' (number of rows of `df_tree` in that cluster) plus every
    key produced by `interpret_query_result` for the cluster's query result.
    """
    cluster_sizes = df_tree['cl_bottom'].value_counts()
    if n is not None:
        # value_counts() sorts by descending count, so head(n) is the n
        # largest clusters.
        cluster_sizes = cluster_sizes.head(n)

    results = dict()
    for cl, cnt in cluster_sizes.items():
        uids = df_tree.loc[df_tree['cl_bottom'] == cl, 'UID'].tolist()
        df_result = get_query_result(uids, engine, metadata)
        summary = {'total_count': cnt}
        summary.update(interpret_query_result(df_result))
        results[cl] = summary
    return results

In [17]:
def get_fname_from_number(number, tree_dir='data/relaxmap_cluster_treefiles'):
    """Return the path of the single tree file for cluster number `number`.

    number: value substituted into the pattern 'wos_<number>_cluster*.tree'.
    tree_dir: directory that is searched for the file.

    Raises RuntimeError when no file matches, and also when more than one
    file matches (the message says which of the two happened).
    """
    pattern = os.path.join(tree_dir, 'wos_{}_cluster*.tree'.format(number))
    matches = glob(pattern)
    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise RuntimeError("number: {} -- file not found".format(number))
    raise RuntimeError(
        "number: {} -- expected exactly one file, found {}: {}".format(
            number, len(matches), sorted(matches)))

In [18]:
# Load the tree file for cluster number 4 into a DataFrame.
# get_fname_from_number raises RuntimeError unless exactly one file matches.
fname = get_fname_from_number(4)
df_tree = parse_tree(fname)


In [50]:
def get_df_result_stepwise(df_tree, step=5000):
    """Query the database for all UIDs in `df_tree`, `step` UIDs at a time.

    df_tree: DataFrame with a 'UID' column.
    step: maximum number of UIDs sent per query; coerced with int().

    Uses the module-level `engine` and `metadata` for every query and prints
    the elapsed wall-clock time when finished.

    Returns the non-empty per-chunk results concatenated in order (row
    indices are kept as returned by each query). If every chunk is empty the
    last chunk is returned, and if there are no UIDs at all an empty
    DataFrame is returned.

    Raises ValueError if `step` is smaller than 1.
    """
    step = int(step)
    if step < 1:
        raise ValueError("step must be a positive integer, got {}".format(step))

    # time.time() is available on both Python 2 and 3 (time.clock is not).
    start = time.time()
    uids = df_tree['UID']

    chunks = []
    for i in range(0, len(uids), step):
        # .iloc slices by position and clips at the end of the Series.
        uids_subset = uids.iloc[i:i + step].tolist()
        chunks.append(get_query_result(uids_subset, engine, metadata))

    # Concatenate once at the end instead of growing a frame inside the loop.
    non_empty = [chunk for chunk in chunks if not chunk.empty]
    if non_empty:
        df_result = pd.concat(non_empty)
    elif chunks:
        df_result = chunks[-1]
    else:
        df_result = pd.DataFrame([])

    end = time.time()
    print("done. took {:.1f} seconds".format(end-start))
    return df_result

In [58]:
def get_results_one_cluster(fname):
    """Parse one tree file, query all of its UIDs and summarise the result.

    fname: path of the tree file, passed to `parse_tree`.

    Returns the dict produced by `interpret_query_result` for the combined
    query result.
    """
    df_tree = parse_tree(fname)
    # One twentieth of the rows, but never fewer than 5000 per query. Floor
    # division keeps the step an integer on Python 3 as well as Python 2.
    step = max(df_tree.shape[0] // 20, 5000)
    df_result = get_df_result_stepwise(df_tree, step=step)
    return interpret_query_result(df_result)

In [59]:
# Paths of all cluster tree files to process. glob() does not guarantee any
# particular ordering of the returned paths.
g = glob('data/relaxmap_cluster_treefiles/*.tree')

In [63]:
%%time
# Summarise every tree file in `g`, keyed by the file's base name. Only the
# number of rows with a UID and the subject_extended value counts are kept.
d = {}
for fname in g:
    print('getting results for {}'.format(fname))
    fname_tail = os.path.basename(fname)
    d_result = get_results_one_cluster(fname)
    d[fname_tail] = dict(
        num_results=d_result['counts']['UID'],
        subject_extended_value_counts=d_result['subject_extended_value_counts'],
    )

getting results for data/relaxmap_cluster_treefiles/wos_59_cluster_2858839.tree
done. took 1.1 seconds
getting results for data/relaxmap_cluster_treefiles/wos_60_cluster_5084895.tree
done. took 1.1 seconds
getting results for data/relaxmap_cluster_treefiles/wos_49_cluster_12269262.tree
done. took 1.2 seconds
getting results for data/relaxmap_cluster_treefiles/wos_73_cluster_7053040.tree
done. took 1.1 seconds
getting results for data/relaxmap_cluster_treefiles/wos_62_cluster_8956094.tree
done. took 1.1 seconds
getting results for data/relaxmap_cluster_treefiles/wos_77_cluster_1223978.tree
done. took 1.1 seconds
getting results for data/relaxmap_cluster_treefiles/wos_57_cluster_1769913.tree
done. took 1.1 seconds
getting results for data/relaxmap_cluster_treefiles/wos_4_cluster_5639372.tree
done. took 2.9 seconds
getting results for data/relaxmap_cluster_treefiles/wos_71_cluster_4328450.tree
done. took 1.2 seconds
getting results for data/relaxmap_cluster_treefiles/wos_64_cluster_514176

In [64]:
# cPickle only exists on Python 2; fall back to the standard pickle module
# so this cell also runs on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Persist the per-file summaries collected in `d`.
with open('relaxmap_cluster_subject_extended_data.pickle', 'wb') as outf:
    pickle.dump(d, outf)

In [68]:
# For every tree file, show its three most frequent subject_extended values
# (value_counts() is sorted by descending count) followed by a blank line.
for k, v in d.items():
    print(k)
    print(v['subject_extended_value_counts'].head(3))
    # A bare `print` is a no-op when print is a function; print("") emits the
    # separator line on both Python 2 and Python 3.
    print("")

wos_71_cluster_4328450.tree
Chemistry              4844
Cell Biology           4764
Genetics & Heredity    3617
Name: subject_extended, dtype: int64

wos_12_cluster_1889140.tree
Virology                               7876
Immunology                             4822
Science & Technology - Other Topics    2611
Name: subject_extended, dtype: int64

wos_27_cluster_2479591.tree
Materials Science    9684
Chemistry            3451
Cell Biology         3133
Name: subject_extended, dtype: int64

wos_88_cluster_7184875.tree
Neurosciences & Neurology    5177
Ophthalmology                4669
Physiology                   2111
Name: subject_extended, dtype: int64

wos_17_cluster_2864584.tree
Pediatrics    4033
Physiology    3524
Chemistry     2259
Name: subject_extended, dtype: int64

wos_60_cluster_5084895.tree
Genetics & Heredity          3741
Neurosciences & Neurology    3690
Physiology                   2018
Name: subject_extended, dtype: int64

wos_73_cluster_7053040.tree
Virology             

In [84]:
# Print the three most frequent subject_extended values for clusters 20-29.
for i in range(20, 30):
    try:
        fname = os.path.split(get_fname_from_number(i))[1]
    except RuntimeError:
        # No usable tree file for this cluster number; skip it.
        continue
    print(fname)
    x = d[fname]['subject_extended_value_counts']
    # The index of a value_counts() result holds the values themselves,
    # ordered by descending count, so the first three entries are the top 3.
    for subject in x.index[:3]:
        print(subject)
    print("")

wos_20_cluster_1029834.tree
Zoology
Transplantation
Polymer Science

wos_21_cluster_8376148.tree
Cell Biology
Oncology
Biochemistry & Molecular Biology

wos_22_cluster_3058305.tree
Astronomy & Astrophysics
Physics
Science & Technology - Other Topics

wos_23_cluster_4494585.tree
Astronomy & Astrophysics
Endocrinology & Metabolism
Geology

wos_24_cluster_10026951.tree
Physics
Materials Science
Optics

wos_25_cluster_2929392.tree
Physics
Engineering
Materials Science

wos_26_cluster_3234467.tree
Physics
Zoology
Life Sciences & Biomedicine - Other Topics

wos_27_cluster_2479591.tree
Materials Science
Chemistry
Cell Biology

wos_28_cluster_1349764.tree
Electrochemistry
Chemistry
Materials Science

wos_29_cluster_4669990.tree
Pharmacology & Pharmacy
Neurosciences & Neurology
Physiology

