# Exploration: Using pandas to slurp OpenSesame CoNLL files

1. index num,
2. word form, 
3. lemma (plurals, past tenses, etc. get converted to dictionary form), 

4. POS (part of speech), 
?
5. ?dependency number (for dependency parsing), 
6. Frame Name
7. 

- B: begin
- I: in
- O: out

In [1]:
from collections import OrderedDict
import pandas as pd

#conll = pd.read_csv('resiliance_excerpt.conll', header=None, sep='\t') 
conll = pd.read_csv('resiliance.conll', header=None, sep='\t') 
conll.columns = ["word_index", "word_form", "?", "lemma", "?", "pos", "sent_num", "?", "?", "?", "?", "?", "lemma_POS", "frame_name", "FE_Name_BIO"]
#columns_to_drop = [2, 4, 6, 7, 8, 9, 10, 11]
conll.drop('?', axis='columns', inplace=True)

 
print(conll)

      word_index     word_form       lemma   pos  sent_num   lemma_POS  \
0              1          then        Then    RB         0           _   
1              2             ,           ,     ,         0           _   
2              3            we          we   PRP         0           _   
3              4         model       model   VBP         0           _   
4              5       traffic     traffic    NN         0           _   
5              6      response    response    NN         0  response.n   
6              7            to          to    TO         0           _   
7              8        random      random    VB         0           _   
8              9       roadway         UNK    JJ         0           _   
9             10   disruptions  disruption   NNS         0           _   
10            11           and         and    CC         0           _   
11             1          then        Then    RB         0           _   
12             2             ,        

In [None]:
import csv


## TODO:
- split the dataframe by sentences
- group identical sentences together (also assuming that within our text, all sentences are distinct)
    - we also know that identical sentences will occur sequentially

# Approach
- classes to define:
    - Word:
        - Lexical item -- the word itself
        - POS -- one of the NLTK POS markers
        - 
    - Sentence
- Iterate through dataframe
- Keep running of list of words ==> sentence
    - this list will refresh as soon as numbering restarts
- Hash table where key is sentence and value is the labelled sentence 

## Pseudo class example:

```
Sentence 1: (JSON form, should be quite interchangeable with a class)
// currently 1-indexed... switch to 0 indexing?
    {
        words = ['then', ',', 'we', 'model', 'traffic', 'response', 'to', 'random', 'roadway', 'disruptions', 'and']
        frame_descriptions = [
                                {'Response': [
                                                {"response.n": 6},
                                                {
                                                    "S-Agent": [3],
                                                    "S-Response": [5],
                                                    "Trigger", [7,10]
                                                }
                                             ]
                                 },
                                 {'Coincidence': [
                                                     {"random.a": 8},
                                                     {
                                                         "State_of_affairs": [9, 10]
                                                     }
                                                 ]
                                 }
                                 ...
                             ]
    }
```
General algorithm as follows:



In [2]:
class FrameElement:
    def __init__(self, fe_name, index, is_target=False):
        self.fe_name = fe_name
        self.indices = [index]
        self.is_target = is_target
    def append_index(self, index):
        self.indices.append(index)
    def __repr__(self):
        return f'{self.fe_name}[{self.is_target} target]: {self.indices}\n'
        
class FrameDescription:
    def __init__(self):
        self.frame_name = ""
        self.frame_elements = {}
    def set_frame_name(self, frame_name):
        self.frame_name = frame_name
    def set_LU(self, lex_unit, lu_pos):
        self.lex_unit = lex_unit
        self.lu_pos = lu_pos
        self.frame_elements[lex_unit] = FrameElement(lex_unit, lu_pos, True)
    def add_FE(self, fe_name, index):
        if fe_name[1]=='-':
            fe_name = fe_name[2:]
        if fe_name in self.frame_elements:
            self.frame_elements[fe_name].append_index(index)
        else:
            self.frame_elements[fe_name] = FrameElement(fe_name, index)
        """
        if fe_name in self.frame_elements:
            self.frame_elements[fe_name].append(index)
        else:
            self.frame_elements[fe_name] = [index]
        """
    def __bool__(self):
        return bool(self.frame_name or self.frame_elements)
class Sentence:
    def __init__(self, sentence_id):
        self.sentence_id = sentence_id
        self.words = []
        self.pos_list = []
        self.frame_descriptions = {}
    def add_word(self, word):
        self.words.append(word)
    def add_pos(self, pos):
        self.pos_list.append(pos)
    def add_frame(self, frame_description):
        self.frame_descriptions[frame_description.frame_name] = frame_description
    def __str__(self):
        output_str = f"{self.sentence_id}: {' '.join(self.words)}\n"
        output_str += f"{' '.join(self.pos_list)}"
        for fname in self.frame_descriptions.keys():
            output_str += f'{fname}: {self.frame_descriptions[fname].lex_unit} ({self.frame_descriptions[fname].lu_pos})\n'
            output_str += f'\t{self.frame_descriptions[fname].frame_elements}\n'
            #output_str += '\n'
        #output_str += f"{[self.frame_descriptions[fname].frame_name for fname in self.frame_descriptions.keys()]}"
        return output_str
        

In [3]:
sentences = []
current_frame = FrameDescription()
current_sentence = Sentence(0)
for row in conll.itertuples(name="LU_labelled"):
    # if current word is first in its sentence
    if int(row.word_index) == 1:
        # store found frame into prev sentence
        if current_frame:
            current_sentence.add_frame(current_frame)
            current_frame = FrameDescription()
        # if last saved sentence exists and it shares same id
        if len(sentences) > 0 and sentences[-1].sentence_id == int(row.sent_num):
            current_sentence = sentences[-1]
        # else create a new sentence and save it
        else:
            current_sentence = Sentence(int(row.sent_num))
            sentences.append(current_sentence)
    # if current word hasn't been seen before
    if row.word_index > len(current_sentence.words):
        current_sentence.add_word(row.word_form)
        current_sentence.add_pos(row.pos)
    if row.frame_name != '_':
        current_frame.set_frame_name(row.frame_name)
        
    if row.lemma_POS != '_':
        current_frame.set_LU(row.lemma_POS, row.word_index)
        
    if row.FE_Name_BIO != 'O':
        current_frame.add_FE(row.FE_Name_BIO, row.word_index)
current_sentence.add_frame(current_frame)
sent_str = [str(sent) for sent in sentences]
print('\n'.join(sent_str))

0: then , we model traffic response to random roadway disruptions and
RB , PRP VBP NN NN TO VB JJ NNS CCResponse: response.n (6)
	{'Agent': Agent[False target]: [3]
, 'Response': Response[False target]: [5]
, 'response.n': response.n[True target]: [6]
, 'Trigger': Trigger[False target]: [7, 8, 9, 10]
}
Coincidence: random.a (8)
	{'random.a': random.a[True target]: [8]
, 'State_of_affairs': State_of_affairs[False target]: [9, 10]
}

1: recalculate expected delays to determine the sensitivity of each city to
NN VBN NNS TO VB DT NN IN DT NN TOExpectation: expect.v (2)
	{'expect.v': expect.v[True target]: [2]
, 'Phenomenon': Phenomenon[False target]: [3]
}
Change_event_time: delay.n (3)
	{'delay.n': delay.n[True target]: [3]
, 'Interval': Interval[False target]: [3]
}
Contingency: determine.v (5)
	{'determine.v': determine.v[True target]: [5]
, 'Outcome': Outcome[False target]: [6, 7, 8, 9, 10]
}
Political_locales: city.n (10)
	{'city.n': city.n[True target]: [10]
, 'Locale': Locale[False 

In [5]:
from jinja2 import Template
with open('conllvis/template.html.jinja2') as file_:
    template = Template(file_.read())
print(template.render(stylesheet_name="styles/template.css",
                     title="resilience",
                     sentences=sentences))

<html>
    <head>
        <link rel="stylesheet" type="text/css" href="styles/template.css">
        <meta charset="utf-8">
        <title></title>
    </head>
    <body>
        
            <p>then , we model traffic response to random roadway disruptions and</p>
            <div id="frame_display">
                <div class = "sentence sent0">
                    <div class="label">Word Form</div>
                    <div class="lex-unit">then</div>
                    <div class="lex-unit">,</div>
                    <div class="lex-unit">we</div>
                    <div class="lex-unit">model</div>
                    <div class="lex-unit">traffic</div>
                    <div class="lex-unit">response</div>
                    <div class="lex-unit">to</div>
                    <div class="lex-unit">random</div>
                    <div class="lex-unit">roadway</div>
                    <div class="lex-unit">disruptions</div>
                    <div class="lex-unit">and</div>


In [6]:
with open('conllvis/styles/template.css.jinja2') as fh2:
    template1 = Template(fh2.read())
print(template1.render(sentences=sentences))

#frame_display {
    text-align: center;
}
.sentence {
    display: inline-grid;
    column-gap: 4px;
}
.lex-unit {
    display: flex;
    justify-content: left;
    font: 2em "Open Sans", sans-serif;
}
.pos {
    display: flex;
    justify-content: center;
}
.label {
    grid-column: 1;
    border-right: 4px solid red;
}
.frame_element {
    background-color: yellow;
}
.target {
    background-color: orange;
}


.sent0 {
    grid-template-columns: repeat(12, auto);
}

.sent1 {
    grid-template-columns: repeat(12, auto);
}

.sent3 {
    grid-template-columns: repeat(7, auto);
}

.sent4 {
    grid-template-columns: repeat(9, auto);
}

.sent5 {
    grid-template-columns: repeat(8, auto);
}

.sent7 {
    grid-template-columns: repeat(13, auto);
}

.sent8 {
    grid-template-columns: repeat(13, auto);
}

.sent9 {
    grid-template-columns: repeat(13, auto);
}

.sent10 {
    grid-template-columns: repeat(15, auto);
}

.sent11 {
    grid-template-columns: repeat(12, auto);
}

.sent12 {
    

In [5]:
"""class LexUnit:
    def __init__(self, lex_unit, pos, target, frame, fe):
        self.lex_unit = lex_unit # the label / "word" itself
        self.pos = pos
        self.target = OrderedDict([(frame, target)]) if target != '_' else OrderedDict()
        self.frame = [frame] if frame != '_' else []
        self.fe = OrderedDict([(frame, fe)]) if fe != 'O' else OrderedDict()
    def sameWord(self, other):
        if isinstance(other, LexUnit):
            return self.lex_unit == other.lex_unit and self.pos == other.pos
        return False
    def mergeInto(self, other):
        self.target.update(other.target)
        self.frame.extend(other.frame)
        self.fe.update(other.fe)
    def __repr__(self):
        out_str = f'{self.lex_unit}({self.pos})[{self.frame}]\nTargets:{self.target}\nFEs:{self.fe}'
        return out_str
class Sentence:
    def __init__(self):
        self.word_strs = [] # list of word strings
        self.lex_units = [] # list of LexUnits
    def __hash__(self):
        return hash(tuple(self.words_strs))
    def sameSentence(self, obj): # contains the same words in the same order
        if isinstance(obj, Sentence):
            return self.word_strs == obj.word_strs
        return False
    def addLexUnit(self, lu):
        self.word_strs.append(lu.lex_unit)
        self.lex_units.append(lu)
        
    def mergeInto(self, other): # same sentence but describing different frames
        if self.sameSentence(other):
            for i in range(len(self.lex_units)): # iterate through
                self.lex_units[i].target.update(other.lex_units[i].target)
                self.lex_units[i].frame.extend(other.lex_units[i].frame)
                self.lex_units[i].fe.update(other.lex_units[i].fe)
        else:
            raise Exception("unidentical sentences")
    def __repr__(self):
        fstring = ''
        for lex_unit in self.lex_units:
            fstring += str(lex_unit)
            fstring += '\n\n'
        return fstring
"""

'class LexUnit:\n    def __init__(self, lex_unit, pos, target, frame, fe):\n        self.lex_unit = lex_unit # the label / "word" itself\n        self.pos = pos\n        self.target = OrderedDict([(frame, target)]) if target != \'_\' else OrderedDict()\n        self.frame = [frame] if frame != \'_\' else []\n        self.fe = OrderedDict([(frame, fe)]) if fe != \'O\' else OrderedDict()\n    def sameWord(self, other):\n        if isinstance(other, LexUnit):\n            return self.lex_unit == other.lex_unit and self.pos == other.pos\n        return False\n    def mergeInto(self, other):\n        self.target.update(other.target)\n        self.frame.extend(other.frame)\n        self.fe.update(other.fe)\n    def __repr__(self):\n        out_str = f\'{self.lex_unit}({self.pos})[{self.frame}]\nTargets:{self.target}\nFEs:{self.fe}\'\n        return out_str\nclass Sentence:\n    def __init__(self):\n        self.word_strs = [] # list of word strings\n        self.lex_units = [] # list of LexU

In [6]:
"""
sentences = OrderedDict()
current_sentence = Sentence()
current_sentence_words = []
for row in conll.itertuples(name="LU_labelled"):
    print(row)
    if row.index == 1 and current_sentence.word_strs: 
        # we're beginning a new sentence: save old one and clear out current_sentence
        if tuple(current_sentence_words) in sentences:
            sentences[tuple(current_sentence_words)].mergeInto(current_sentence)
        else:
            sentences[tuple(current_sentence_words)] = current_sentence
        current_sentence = Sentence()
        current_sentence_words = []
    current_sentence_words.append(row.LU)
    current_sentence.addLexUnit(LexUnit(row.LU, row.POS, row.Target, row.Frame, row.FEs))
"""

'\nsentences = OrderedDict()\ncurrent_sentence = Sentence()\ncurrent_sentence_words = []\nfor row in conll.itertuples(name="LU_labelled"):\n    print(row)\n    if row.index == 1 and current_sentence.word_strs: \n        # we\'re beginning a new sentence: save old one and clear out current_sentence\n        if tuple(current_sentence_words) in sentences:\n            sentences[tuple(current_sentence_words)].mergeInto(current_sentence)\n        else:\n            sentences[tuple(current_sentence_words)] = current_sentence\n        current_sentence = Sentence()\n        current_sentence_words = []\n    current_sentence_words.append(row.LU)\n    current_sentence.addLexUnit(LexUnit(row.LU, row.POS, row.Target, row.Frame, row.FEs))\n'

In [7]:
sentences

[<__main__.Sentence at 0x7f4d21cae5c0>,
 <__main__.Sentence at 0x7f4d21cae630>,
 <__main__.Sentence at 0x7f4d21caeb38>,
 <__main__.Sentence at 0x7f4d21caebe0>,
 <__main__.Sentence at 0x7f4d21caecf8>,
 <__main__.Sentence at 0x7f4d21caed68>,
 <__main__.Sentence at 0x7f4d21caeeb8>,
 <__main__.Sentence at 0x7f4d21caef60>,
 <__main__.Sentence at 0x7f4d21cb3128>,
 <__main__.Sentence at 0x7f4d21cb3208>,
 <__main__.Sentence at 0x7f4d21cb3278>,
 <__main__.Sentence at 0x7f4d21cb3320>,
 <__main__.Sentence at 0x7f4d21cb33c8>,
 <__main__.Sentence at 0x7f4d21cb3470>,
 <__main__.Sentence at 0x7f4d21cb3550>,
 <__main__.Sentence at 0x7f4d21cb35c0>,
 <__main__.Sentence at 0x7f4d21cb36a0>,
 <__main__.Sentence at 0x7f4d21cb3710>,
 <__main__.Sentence at 0x7f4d21cb3780>,
 <__main__.Sentence at 0x7f4d482bf0f0>,
 <__main__.Sentence at 0x7f4d482bf0b8>,
 <__main__.Sentence at 0x7f4d21cb3828>,
 <__main__.Sentence at 0x7f4d21cb3940>,
 <__main__.Sentence at 0x7f4d21cb39e8>,
 <__main__.Sentence at 0x7f4d21cb3a58>,


In [8]:
it = iter(sentences)
print(next(it))
print(next(it))

0: then , we model traffic response to random roadway disruptions and
Response: response.n (6)
	{'S-Agent': [3], 'S-Response': [5], 'B-Trigger': [7], 'I-Trigger': [8, 9, 10]}
Coincidence: random.a (8)
	{'B-State_of_affairs': [9], 'I-State_of_affairs': [10]}

1: recalculate expected delays to determine the sensitivity of each city to
Expectation: expect.v (2)
	{'S-Phenomenon': [3]}
Change_event_time: delay.n (3)
	{'S-Interval': [3]}
Contingency: determine.v (5)
	{'B-Outcome': [6], 'I-Outcome': [7, 8, 9, 10]}
Political_locales: city.n (10)
	{'S-Locale': [10]}



In [9]:
print(sentences[('then', ',', 'we', 'model', 'traffic', 'response', 'to', 'random', 'roadway', 'disruptions', 'and')
])

TypeError: list indices must be integers or slices, not tuple