In [165]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import ColumnDataSource, Range1d

# Configure Bokeh so that later show() calls render the plot inline in the notebook.
output_notebook()

In [319]:
# Load the clause-level discourse annotation table for one paper.
# pmcId is the single source of truth for which paper is analysed: it selects
# the TSV file here and is reused as the plot title further down, so the two
# can no longer drift apart.
pmcId = "PMC2173577"

# NOTE(review): machine-specific absolute path -- point this at your own copy of the data.
TSV_DIR = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2016-07-27-pathwayLogic/tsv1file'

tsv = pd.read_csv('{0}/{1}.scidp.discourse.tsv'.format(TSV_DIR, pmcId),
                  sep='\t')
tsv.head(5)

Unnamed: 0,SentenceId,Clause Text,Codes,ExperimentValues,ExperimentSpan,Paragraph,Headings,FloatingBox?,Discourse Type,friesSentenceId,friesEventsIds,friesEventsDetails,friesEventText
0,s1,Mechanisms through which,[],[],[],-,,False,none,-,-,-,-
1,s1,Sos-1 coordinates the activation of Ras and Rac,[],[],[],-,,False,none,-,-,-,-
2,s2,Signaling from receptor tyrosine kinases ( RTK...,[],[],[],p4,,False,none,-,-,-,-
3,s2,requires the sequential activation of the smal...,[],[],[],p4,,False,fact,-,-,-,-
4,s3,"Son of sevenless ( Sos-1 ) , a bifunctional gu...",[],[],[],p4,,False,fact,-,-,-,-


In [323]:
import re
from sets import Set

# Patterns used by read_codes, compiled once instead of on every call.
# ", and " / " and " are both treated as a plain list separator.
_CONJUNCTION_RE = re.compile(r",? and ", re.IGNORECASE)
# One chunk = optional supplementary prefix, figure number, then a run of
# letters/separators.  The look-ahead stops that run in front of "S<digit>" so
# a following supplementary code starts its own chunk instead of being
# swallowed as a panel letter of the previous figure.
_CODE_CHUNK_RE = re.compile(r"[Ss]?\d+[\s,]{0,2}(?:(?![Ss]\d)[A-Za-z,;\-\s])*")
_FIGURE_RE = re.compile(r"\d+")
# Panel range such as "1A-C"; both ends must be single letters so ord() is safe.
_RANGE_RE = re.compile(r"\d+[\s,]{0,2}([A-Za-z])-([A-Za-z])(?![A-Za-z])")
_TRIPLE_RE = re.compile(r"\d+[\s,]{0,2}([A-Za-z]+)[;,]\s?([A-Za-z]+)[;,]\s?([A-Za-z]+)")
_PAIR_RE = re.compile(r"\d+[\s,]{0,2}([A-Za-z]+)[;,]\s?([A-Za-z]+)")
# Figure number followed by nothing but separators, e.g. "1" or "1, ".
_BARE_RE = re.compile(r"\d+[\s,;\-]*$")
_SINGLE_RE = re.compile(r"\d+[\s,]{0,2}([A-Za-z])")


def read_codes(es):
    """Expand a free-text figure reference into a set of figure codes.

    Examples: "1A-C" -> {"1A", "1B", "1C"}; "2A, B and C" -> {"2A", "2B", "2C"};
    "[1, 2]" -> {"1", "2"}; "S1A" -> {"S1A"}.

    Parameters
    ----------
    es : str, None or NaN
        Text holding the figure references.  None and NaN (an empty cell read
        by pandas) are treated as "no codes".

    Returns
    -------
    set of str
        The distinct codes found.  Each code is the figure number (prefixed
        with "S" when the reference carried an s/S prefix) followed by a panel
        label when one was given.  A builtin ``set`` is returned, so the
        function does not need the Python-2-only ``sets`` module.

    Notes
    -----
    Per chunk the first matching form wins, in this order: letter range,
    three listed panels, two listed panels, bare figure number, single panel.
    At most three listed panels are recognised per figure number.  A range
    whose ends are not single letters falls back to the single-panel form.
    """
    # NaN is the only value that is not equal to itself.
    if es is None or es != es:
        return set()

    es = _CONJUNCTION_RE.sub(",", es)

    codes = set()
    for chunk in _CODE_CHUNK_RE.findall(es):
        # Peel off the supplementary marker so the digit-anchored patterns
        # below can match; without this "S1A" used to raise AttributeError.
        prefix = ""
        if chunk[0] in "Ss":
            prefix = "S"
            chunk = chunk[1:]

        # The chunk pattern guarantees digits at this position.
        fig = prefix + _FIGURE_RE.match(chunk).group(0)

        span = _RANGE_RE.match(chunk)
        if span is not None:
            first, last = ord(span.group(1)), ord(span.group(2))
            codes.update(fig + chr(k) for k in range(first, last + 1))
            continue

        listed = _TRIPLE_RE.match(chunk) or _PAIR_RE.match(chunk)
        if listed is not None:
            codes.update(fig + panel for panel in listed.groups())
            continue

        if _BARE_RE.match(chunk) is not None:
            # Trailing separators are tolerated so "1, 2" keeps the "1".
            codes.add(fig)
            continue

        single = _SINGLE_RE.match(chunk)
        if single is not None:
            codes.add(fig + single.group(1))

    return codes

In [358]:
# Build the two long-form tables behind the Gantt chart, using only clauses
# whose heading starts with "Result" and that are not inside a floating box:
#   gantt_df  - one row per (figure code, clause) read from 'ExperimentValues'
#   gantt2_df - one row per (figure code, clause) read from 'ExperimentSpan'
gantt_rows = []
gantt2_rows = []
gantt_columns = ['expt', 'clause_id', 'discourse_type', 'heading']

# Fill colour per discourse type.
dtypes = ["fact", "hypothesis", "problem", "goal", "method", "result", "implication"]
colors = ["Snow", "Snow", "Snow", "LightGray", "LightGray", "Thistle", "Plum"]
colors_s = pd.Series(colors, index=dtypes)

all_codes = set()          # builtin set: no dependency on the Python-2-only `sets` module
result_clause_ids = []     # index label of every clause that passed the filter

for i, row in tsv.iterrows():
    heading = row['Headings']
    if pd.isnull(heading):
        heading = ""

    if re.match('^Result', heading) is None or row['FloatingBox?']:
        continue

    result_clause_ids.append(i)
    dt = row['Discourse Type']

    for c in read_codes(row['ExperimentValues']):
        gantt_rows.append([c, i, dt, heading])
        all_codes.add(c)

    for c in read_codes(row['ExperimentSpan']):
        gantt2_rows.append([c, i, dt, heading])
        # Register span codes as well so that every span row has a slot on the
        # figure-code axis; previously a code seen only in a span had no expt_id.
        all_codes.add(c)

# Clause range shown on the x axis.  Taken from the observed ids instead of
# comparing against a fixed 1000 sentinel, which produced a wrong minimum when
# every matching clause index was >= 1000.
if result_clause_ids:
    clause_min = min(result_clause_ids)
    clause_max = max(result_clause_ids)
else:
    # Nothing matched: keep the values the previous implementation left behind.
    clause_min, clause_max = 1000, -1

# Position of each figure code on the y axis, in sorted code order.
codes_s = pd.Series(range(len(all_codes)), index=sorted(all_codes))

# DataFrame.sort(columns=...) has been removed from pandas; sort_values is its replacement.
gantt_df = (pd.DataFrame.from_records(gantt_rows, columns=gantt_columns)
            .sort_values(by=['clause_id'], ascending=True))
gantt2_df = (pd.DataFrame.from_records(gantt2_rows, columns=gantt_columns)
             .sort_values(by=['clause_id'], ascending=True))

# Series.map performs the label lookup row by row and yields NaN for a label
# that is not in the lookup table, instead of raising KeyError the way
# .loc[list_of_labels] does on current pandas.
gantt_df['expt_id'] = gantt_df['expt'].map(codes_s)
gantt2_df['expt_id'] = gantt2_df['expt'].map(codes_s)

# NOTE(review): discourse types outside `dtypes` (e.g. 'none') get a NaN colour -- decide on a fallback.
gantt2_df['color'] = gantt2_df['discourse_type'].map(colors_s)

print(gantt2_df)

    expt  clause_id discourse_type  heading  expt_id      color
0     1A         58         method  Results        0  LightGray
1     1A         59         method  Results        0  LightGray
2     1A         60         method  Results        0  LightGray
3     1A         61         method  Results        0  LightGray
4     1B         62         result  Results        1    Thistle
5     1C         63           goal  Results        2  LightGray
6     1C         64         method  Results        2  LightGray
7     1C         65         method  Results        2  LightGray
8     1C         66           fact  Results        2       Snow
9     1C         67           fact  Results        2       Snow
10    1C         68         result  Results        2    Thistle
11    1C         69         method  Results        2  LightGray
12    1C         70         result  Results        2    Thistle
13    1C         71         result  Results        2    Thistle
14    1C         72         result  Resu

In [359]:
# Gantt-style chart: clause number on x, figure code on y.
# Shaded boxes mark the clauses covered by each experiment span (gantt2_df),
# coloured by discourse type; red crosses mark the clauses where a figure code
# is explicitly listed (gantt_df).
G = figure(title=pmcId, width=800, height=600,
           x_range=Range1d(clause_min, clause_max), y_range=list(codes_s.index.values))
G.xaxis.axis_label = "Clause #"
G.yaxis.axis_label = "Figure Code"

# Box geometry: one clause wide, 0.4 tall.  'top' is now the larger y value;
# the two columns used to be swapped (top < bottom).
# NOTE(review): the +0.8/+1.2 offsets assume the categorical axis places factor
# number k at numeric position k + 1 -- confirm for the installed Bokeh version.
gantt2_df['bottom'] = gantt2_df['expt_id'] + 0.8
gantt2_df['top'] = gantt2_df['expt_id'] + 1.2
gantt2_df['left'] = gantt2_df['clause_id'] - 0.5
gantt2_df['right'] = gantt2_df['clause_id'] + 0.5

cds2 = ColumnDataSource(gantt2_df)
G.quad(left='left', right='right', bottom='bottom', top='top', source=cds2,
       line_color="gray", color='color')

cds = ColumnDataSource(gantt_df)
# "red4" is not a CSS colour name, so it is not a colour Bokeh recognises;
# use a valid named colour instead.
G.scatter('clause_id', 'expt', source=cds, marker='x', size=15,
          line_color="red", fill_color="red")

show(G)