In [15]:
from collections import defaultdict
from typing import Any, Callable, Mapping, Optional, Sequence, Tuple, Union

from estnltk import Text, Layer
from estnltk.visualisation.span_visualiser.fancy_span_visualisation import DisplaySpans

In [2]:
def value_mapper_segment_type(segment: Tuple[str, Sequence], normal = '#ffffff00', conflicting = 'red', ambiguous = 'orange') -> str:
    """
    Picks a background color for a segment from its span structure.

    :param segment: pair whose second element is the sequence of spans covering the segment
    :param normal: color returned for exactly one span with at most one annotation
    :param conflicting: color returned when the number of spans is not exactly one
    :param ambiguous: color returned for exactly one span with several annotations
    :return: one of the three colors above
    """
    spans = segment[1]
    if len(spans) == 1:
        return ambiguous if len(spans[0].annotations) > 1 else normal
    return conflicting

In [3]:
from collections import defaultdict
# Scratch comparison of a plain dict and a defaultdict.
u = {}
# NOTE: dict.setdefault('red') inserts the key 'red' with value None; it does not define a fallback value.
u.setdefault('red')
u[1]='blue'
u
# A defaultdict produces 'red' for every missing key.
v = defaultdict(lambda:'red')
v[1]= 'blue'
v[1]
# Key 2 is missing, so the factory yields 'red' (the cell output); the lookup also stores the key.
v[2]

'red'

In [4]:
class DisplayAmbiguousSpans(DisplaySpans):
    """
    Displays overlaps between spans and spans with multiple annotations.

    Default background color scheme is following:
    * normal spans are transparent
    * overlapping spans are red
    * ambiguous spans are orange
    The shade indicates the number of overlaps and annotations.

    Color scheme is controlled by two dictionary-like instance attributes
    * span_coloring[int] -- color of a segment covered by that many spans
    * annotation_coloring[int] -- color of a single span with that many annotations

    Assigning corresponding elements redefines the coloring for a particular
    number of annotations or spans, e.g. span_coloring[2] = 'blue'.

    Assigning corresponding attributes redefines the entire color scheme.
    The assignable object color_scheme must support color_scheme[i] for any int.
    """

    def __init__(self):
        super().__init__(styling="direct")

        # Two shades of red for overlaps: lighter for two spans, full red otherwise
        self.span_coloring = defaultdict(lambda: '#FF0000')
        self.span_coloring[2] = '#FF5050'

        # Transparent for one annotation, two shades of orange for ambiguous annotations
        self.annotation_coloring = defaultdict(lambda: '#F59B00')
        self.annotation_coloring[1] = '#FFA50000'
        self.annotation_coloring[2] = '#FFA500'

        self.span_decorator.bg_mapping = self.__bg_mapper

    def __bg_mapper(self, segment: Tuple[str, Sequence]) -> str:
        """
        Returns the background color of a segment.

        :param segment: pair whose second element is the sequence of spans covering the segment
        :return: span_coloring entry for the number of spans when that number is not one,
                 otherwise the annotation_coloring entry for the number of annotations
        """
        spans = segment[1]
        if len(spans) != 1:
            # The color table is keyed by the number of spans, not by the spans themselves
            return self.span_coloring[len(spans)]

        return self.annotation_coloring[len(spans[0].annotations)]


In [16]:
class DisplayPostagsSpans(DisplaySpans):
    """
    Colors the spans of a morphological analysis layer by part-of-speech tag.

    Color scheme is controlled by two dictionary-like instance attributes
    * pos_coloring[str] -- color of a span whose resolved tag is the given string;
      unknown tags are white and the tag '*' (unresolved ambiguity) is gray
      when the tagset is 'EstMorf'
    * span_coloring[int] -- color of a segment covered by that many overlapping spans

    The tag of a span is computed by ambiguity_resolver(span). The default resolver
    returns the tag when all analyses of the span agree and '*' otherwise.
    """

    def __init__(self, layer:str='morph_analysis', tagset:str='EstMorf', ambiguity_resolver:Optional[Callable[[Any], str]]=None):
        """
        :param layer: name of the layer that is displayed when a Text object is given
        :param tagset: name of the tagset; predefined colors exist only for 'EstMorf'
        :param ambiguity_resolver: callable mapping a span to a single tag;
                                   the built-in resolver is used when omitted
        """
        super().__init__(styling="direct")

        self.morph_layer = layer
        self.tagset = tagset
        self.pos_coloring = defaultdict(lambda: 'white')
        self.ambiguity_resolver = ambiguity_resolver or self.__default_resolver
        self.span_decorator.bg_mapping = self.__bg_mapper

        if self.tagset == 'EstMorf':
            self.pos_coloring.update({
                'S': 'orange',
                'H': 'orange',
                'A': 'yellow',
                'U': 'yellow',
                'C': 'yellow',
                'N': 'yellow',
                'O': 'yellow',
                'V': 'lime',
                '*': 'gray',
            })

        # Two shades of red for overlapping tokenisation
        self.span_coloring = defaultdict(lambda: '#FF0000')
        self.span_coloring[2] = '#FF5050'

    def __call__(self, object:Union[Text, Layer]) -> str:
        """
        Displays a layer, or the configured layer of a Text object.

        :param object: Text (its layer named self.morph_layer is shown) or Layer
        :raises ValueError: if the argument is neither a Text nor a Layer
        """
        if isinstance(object, Text):
            return super().__call__(object[self.morph_layer])
        elif isinstance(object, Layer):
            return super().__call__(object)
        else:
            raise ValueError('Invalid input')

    def __default_resolver(self, span) -> str:
        """Returns the single distinct tag in span.partofspeech, or '*' if there is not exactly one."""
        pos_tags = set(span.partofspeech)
        if len(pos_tags) == 1:
            return next(iter(pos_tags))
        return '*'

    def __bg_mapper(self, segment: Tuple[str, Sequence]) -> str:
        """
        Returns the background color of a segment.

        :param segment: pair whose second element is the sequence of spans covering the segment
        :return: span_coloring entry for the number of spans when that number is not one,
                 otherwise the pos_coloring entry of the resolved tag
        """
        spans = segment[1]
        if len(spans) != 1:
            # Overlapping spans are a tokenisation conflict; the color must be returned here
            return self.span_coloring[len(spans)]

        return self.pos_coloring[self.ambiguity_resolver(spans[0])]

In [8]:
# Sample text for the visualiser demos; analysed so that the morph_analysis layer is available.
t=Text("""
Silver Ükssilma lugu

Silver, kus on sinu kullamäed? 
Merepõhja vara maha jäi koos laevaga. 

Silver, kus on sinu julged teod? 
Merepõhja noorus maha jäi koos laevaga. 

Silver, kus on sinu vasak silm? 
Merepõhja silm maha jäi koos laevaga. 

Silver, kus on sinu röövlisalk? 
Merepõhja poisid maha jäid koos laevaga. 

Refr. Üksi ma veel kõrtsu laua taga, 
piigad põlvedel oma viimseid päevi magan. 

Silver, kus on sinu poisipõnn? 
Merel seilab minu poisipõnn musta lipu all. 

Silver, kus on sinu sünnipaik? 
Laevakajutis mu sünnipaik musta lipu all. 

Silver, kus on sinu õige koht? 
Laeva tekil minu õige koht musta lipu all. 

Silver, kus on sinu röövlisalk? 
Merepõhja poisid maha jäid koos laevaga. 

Refr. Üksi ma veel … 

Merel sõidab minu poisipõnn, 
ahhoi!
""")
t.analyse('morphology')

text
"Silver Ükssilma luguSilver, kus on sinu kullamäed? Merepõhja vara maha jäi koos laevaga. Silver, kus on sinu julged teod? Merepõhja noorus maha jäi koos laevaga. Silver, kus on sinu vasak silm? Merepõhja silm maha jäi koos laevaga. Silver, kus on sinu röövlisalk? Merepõhja poisid maha jäid koos laevaga. Refr. Üksi ma veel kõrtsu laua taga, piigad põlvedel oma viimseid päevi magan. Silver, kus on sinu poisipõnn? Merel seilab minu poisipõnn musta lipu all. Silver, kus on sinu sünnipaik? Laevakajutis mu sünnipaik musta lipu all. Silver, kus on sinu õige koht? Laeva tekil minu õige koht musta lipu all. Silver, kus on sinu röövlisalk? Merepõhja poisid maha jäid koos laevaga. Refr. Üksi ma veel … Merel sõidab minu poisipõnn, ahhoi!"

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,22
words,normalized_form,,,False,150
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,150


In [155]:
# Show overlapping and multiply-annotated spans of the sample text's morph_analysis layer.
display_ambigous = DisplayAmbiguousSpans()
display_ambigous(t.morph_analysis)

In [18]:
# Color the sample text by part of speech; the visualiser accepts either a Layer or a Text.
# NOTE(review): a function named display_postags is defined later in this notebook and rebinds this name.
display_postags = DisplayPostagsSpans()
display_postags(t.morph_analysis)
display_postags(t)

In [6]:
# Turn on the greedy mode of the IPython tab completer.
%config IPCompleter.greedy=True

In [6]:
def display_ambigious(layer, bg_color = 'orange'):
    """
    Colors all ambiguous spans of a layer.

    Spans with more than one annotation get bg_color. Segments that are not
    covered by exactly one span get the default conflict color of
    value_mapper_segment_type, and all remaining spans its default normal color.

    :param layer: layer to display
    :param bg_color: background color of ambiguous spans
    """
    visualiser = DisplaySpans(styling="direct")
    # Reuse the shared segment classifier so that only the ambiguity color is overridden
    visualiser.span_decorator.bg_mapping = lambda segment: value_mapper_segment_type(segment, ambiguous=bg_color)
    # Display the layer that was passed in rather than a notebook-level global
    visualiser(layer)

In [2]:
def color_postags(segment: Tuple[str, Sequence],  color_dict, ambiguous ) -> str:
    """
    Chooses a background color for a segment from its part-of-speech tags.

    :param segment: pair whose second element is the sequence of spans covering the segment
    :param color_dict: mapping from a tag to a color
    :param ambiguous: color returned when the span has more than one distinct tag
    :return: 'red' unless exactly one span covers the segment, ``ambiguous`` for
             several distinct tags, otherwise the ``color_dict`` entry of the
             first tag with 'white' as the fallback
    """
    spans = segment[1]
    if len(spans) != 1:
        return 'red'

    tags = spans[0].partofspeech
    distinct_tags = sorted(set(tags))
    if len(distinct_tags) > 1:
        # Ambiguous POS tagging
        return ambiguous
    return color_dict.get(tags[0], 'white')

In [3]:
def display_postags(text, color_dict= {'S': 'orange', 'H': 'orange', 'A': 'yellow', 'U': 'yellow', 'C': 'yellow', 'N': 'yellow', 
                       'O': 'yellow', 'V': 'lime'}, ambiguous= 'white'):
    """
    Tags a raw text and shows its morph_analysis layer colored by part of speech.

    :param text: raw text that is wrapped into a Text object and tagged
    :param color_dict: mapping from a part-of-speech tag to a background color
    :param ambiguous: background color of spans with several distinct tags
    """
    def pick_color(segment):
        # Bind the user-supplied color scheme to the segment-level mapper
        return color_postags(segment, color_dict, ambiguous)

    analysed = Text(text).tag_layer()
    visualiser = DisplaySpans(styling="direct")
    visualiser.span_decorator.bg_mapping = pick_color
    visualiser(analysed.morph_analysis)


In [4]:
# Demo: default part-of-speech color scheme.
display_postags('Samojeedid on valged karvapallid.')

In [5]:
# Demo: default color scheme on a second sentence.
display_postags('Partitsiibid on jäänud  mitmeseks.')

In [6]:
# Custom tag-to-color mapping passed to display_postags in the next cell.
color_dict = {'S': 'lightgreen', 'H': 'lightgreen', 'A': 'pink'}

In [7]:
# Demo: user-defined color mapping instead of the default one.
display_postags('Samojeedid on valged karvapallid.', color_dict = color_dict)

In [8]:
# Demo: spans with several distinct part-of-speech tags are painted red.
display_postags('Partitsiibid on jäänud  mitmeseks.', ambiguous = 'red')

In [9]:
def color_first_cases(segment: Tuple[str, Sequence], color_dict, ambiguous ) -> str:
    """
    Chooses a background color for a segment from the grammatical form of its span.

    :param segment: pair whose second element is the sequence of spans covering the segment
    :param color_dict: mapping from a form string (e.g. 'sg n') to a color
    :param ambiguous: color returned when the span has more than one distinct form
    :return: 'red' unless exactly one span covers the segment, ``ambiguous`` for
             several distinct forms, otherwise the ``color_dict`` entry of the
             form with 'white' as the fallback
    """
    spans = segment[1]
    # Tokenization conflict
    if len(spans) != 1:
        return 'red'

    # Analyses that agree on the form are not ambiguous with respect to case
    distinct_forms = set(spans[0].form)
    if not distinct_forms:
        return 'white'
    if len(distinct_forms) > 1:
        return ambiguous

    # Look the form up in the caller's color scheme instead of a hard-coded one
    return color_dict.get(next(iter(distinct_forms)), 'white')

In [10]:
def display_first_cases(text, color_dict = {'sg n': 'orange', 'pl n': 'orange', 'sg g': 'yellow', 'pl g': 'yellow', 'sg p': 'lightgreen',
                 'pl p': 'lightgreen', 'adt': 'pink'}, ambiguous = 'red'):
    """
    Tags a raw text and shows its morph_analysis layer colored by grammatical form.

    :param text: raw text that is wrapped into a Text object and tagged
    :param color_dict: mapping from a form string to a background color
    :param ambiguous: color handed to color_first_cases for ambiguous spans
    """
    def pick_color(segment):
        # Forward the caller's settings to the segment-level mapper
        return color_first_cases(segment, color_dict, ambiguous)

    analysed = Text(text).tag_layer()
    visualiser = DisplaySpans(styling="direct")
    visualiser.span_decorator.bg_mapping = pick_color
    visualiser(analysed.morph_analysis)


In [11]:
# Demo: color words by grammatical form using the default scheme.
display_first_cases('aias sadas saia ja leiva peale kukkus õun ja vette hüppasid surnud kalad')

In [12]:
def color_compounds(segment: Tuple[str, Sequence]) -> str:
    """
    Colors compound words in millennial pink ('#ffb6c1').

    :param segment: pair whose second element is the sequence of spans covering the segment
    :return: 'green' unless exactly one span covers the segment, '#ffb6c1' when
             any entry of the span's root_tokens has more than one token,
             and 'white' otherwise
    """
    spans = segment[1]
    # Tokenization conflict
    if len(spans) != 1:
        return 'green'

    # presumably one root-token sequence per analysis of the span -- verify against the layer
    if any(len(tokens) > 1 for tokens in spans[0].root_tokens):
        return '#ffb6c1'

    # Everything else is plain; no lookup is needed (and root tokens may be unhashable lists)
    return 'white'

In [13]:
def display_compounds(text):
    """
    Tags a raw text and shows its morph_analysis layer with compound words highlighted.

    :param text: raw text that is wrapped into a Text object and tagged
    """
    def pick_color(segment):
        # Delegate the per-segment color choice to color_compounds
        return color_compounds(segment)

    analysed = Text(text).tag_layer()
    visualiser = DisplaySpans(styling="direct")
    visualiser.span_decorator.bg_mapping = pick_color
    visualiser(analysed.morph_analysis)

In [14]:
# Demo: highlight compound words in a sentence that mixes compounds and simple words.
display_compounds('rukkililled ja jääkarud on tavalised liitsõnad, aga roosid ja rebased mitte')