In [1]:
import spacy
import textacy
from spacy import displacy
import re
import matplotlib.pyplot as plt
import networkx as nx

In [2]:
# Load the spaCy pipeline named "ru_core_news_sm" once at module level.
# text_triplets.find_triplet and text_triplets.displacy_show call this global
# `nlp` to parse each sentence, so this cell must run before those methods.
nlp = spacy.load("ru_core_news_sm")

In [3]:
class text_treatment:
    """Normalise a raw text and split it into sentences.

    Intended call order: construct with the raw text, call ``clean()``, then
    ``sentences()``.  After ``sentences()`` both ``self.text`` and
    ``self.clean_sent`` hold the list of sentence strings.

    Attributes:
        text: the raw text; replaced by the cleaned string after ``clean()``
            and by the list of sentences after ``sentences()``.
        clean_sent: list that ``sentences()`` appends the sentences to.
    """

    # Ordered (pattern, replacement) pairs applied by clean().  Order matters:
    # "newline + space" has to be removed before bare newlines are turned into
    # spaces.  Patterns are raw strings so that regex escapes such as \( or \S
    # are not interpreted (and warned about) as Python string escapes.
    _CLEAN_RULES = (
        # paragraph numbers; the "." is unescaped, so it matches ANY single
        # character between the digits and the tab (kept as in the original)
        (r'[0-9]+.\t', ''),
        # new line characters
        (r'\n ', ''),
        (r'\n', ' '),
        # possessive 's
        ("'s", ''),
        # hyphens and spaced en dashes become a single space
        ("-", ' '),
        (" – ", ' '),
        # double quotation marks
        ('"', ''),
        # any reference to outside text: (...) or [...], non-greedy
        (r"[\(\[].*?[\)\]]", ""),
        # urls
        (r'http\S+', ""),
        # hashtag signs (the tag word itself is kept)
        ('#', ""),
    )

    # Split on ".", "?" or "!" followed by whitespace, unless the two
    # characters right before the mark are "." plus one more character
    # (this keeps dotted abbreviations such as "e.g." in one piece).
    _SENTENCE_END = r'(?<!\..)[.?!]\s+'

    def __init__(self, text):
        self.text = text
        self.clean_sent = []

    def clean(self):
        """Apply every cleaning rule, in order, and store the result in ``self.text``.

        The text is converted with ``str()`` first, so a non-string input is
        cleaned via its string representation.  Call this before
        ``sentences()``; afterwards ``self.text`` is a list.
        """
        cleaned = str(self.text)
        for pattern, replacement in self._CLEAN_RULES:
            cleaned = re.sub(pattern, replacement, cleaned)
        self.text = cleaned

    def sentences(self):
        """Split ``self.text`` into sentences and append them to ``self.clean_sent``.

        The terminating punctuation of every sentence except the last is
        consumed by the split.  ``self.text`` is replaced by the resulting
        list.  Calling the method again once the text has already been split
        is a no-op, so re-running a notebook cell neither raises nor
        duplicates sentences.
        """
        if isinstance(self.text, list):
            # Already split by an earlier call; re.split() would raise on a list.
            return
        parts = re.split(self._SENTENCE_END, self.text)
        self.text = parts
        self.clean_sent.extend(parts)

In [4]:
class text_triplets:
    """Extract (left noun phrase, verb phrase, right noun phrase) triplets.

    Each sentence is parsed with the module-level ``nlp`` pipeline; verb and
    noun phrases are found with part-of-speech token patterns via textacy.

    Attributes:
        sentences: iterable of sentence strings given to the constructor.
        verb_patterns / noun_patterns: POS patterns passed to
            ``textacy.extract.matches.token_matches``.
        edges: one ``[left, right]`` pair per sentence (each side is a single
            phrase or a list of phrases).
        verbp: the verb phrase of each sentence (``None`` when none was found).
        edges_2: ``edges`` expanded to one ``[left, right]`` pair per
            combination of single phrases.
        verbp_2: verb phrase for each entry of ``edges_2``.
    """

    def __init__(self, sentences):
        self.sentences = sentences
        self.verb_patterns = [
            [{"POS": "VERB"}, {"POS": "ADP"}],
            [{"POS": "VERB"}],
            [{"POS": "ADV"}, {"POS": "VERB"}],
        ]
        self.noun_patterns = [
            [{"POS": "NOUN"}],
            [{"POS": "PROPN"}],
            # was "PORPN": a tag that does not exist, so the pattern never matched
            [{"POS": "PROPN"}, {"POS": "PROPN"}],
            [{"POS": "ADJ"}, {"POS": "ADJ"}, {"POS": "NOUN"}],
            [{"POS": "ADJ"}, {"POS": "NOUN"}],
            [{"POS": "NOUN"}, {"POS": "PROPN"}],
            [{"POS": "DET"}, {"POS": "PROPN"}],
            [{"POS": "DET"}, {"POS": "NOUN"}],
            [{"POS": "DET"}, {"POS": "NOUN"}, {"POS": "PROPN"}],
        ]
        self.edges = []
        self.verbp = []
        self.edges_2 = []
        self.verbp_2 = []

    def contains_root(self, verb_phrase, root):
        """Return True when token ``root`` lies inside the span ``verb_phrase``.

        ``verb_phrase`` needs ``start``/``end`` and ``root`` needs ``i``.
        The end index is treated as exclusive (spaCy ``Span.end`` convention),
        so a root located right after the phrase is not counted.  A ``None``
        root yields False.
        """
        if root is None:
            return False
        return verb_phrase.start <= root.i < verb_phrase.end

    def find_root_of_sentence(self, doc):
        """Return the last token of ``doc`` whose ``dep_`` is "ROOT", or None."""
        root_token = None
        for token in doc:
            if token.dep_ == "ROOT":
                root_token = token
        return root_token

    def get_verb_phrases(self, doc):
        """Return the verb-pattern matches of ``doc`` that contain its root token."""
        root = self.find_root_of_sentence(doc)
        matches = textacy.extract.matches.token_matches(doc, self.verb_patterns)
        return [phrase for phrase in matches if self.contains_root(phrase, root)]

    def longer_verb_phrase(self, verb_phrases):
        """Return the first longest phrase of ``verb_phrases`` (None if empty)."""
        return max(verb_phrases, key=len, default=None)

    def find_noun_phrases(self, doc):
        """Return all noun-pattern matches of ``doc`` as a list."""
        return list(textacy.extract.matches.token_matches(doc, self.noun_patterns))

    def longer_noun_phrase(self, noun_phrases):
        """Collapse runs of textually overlapping phrases to their longest member.

        Consecutive phrases are considered overlapping when the text of one is
        a substring of the text of the other.  Within such a run only the
        longest phrase (first one on ties) is kept.  Returns a list of the kept
        phrases in order; an empty input gives an empty list.
        """
        kept = []
        current = None
        for phrase in noun_phrases:
            if current is None:
                current = phrase
                continue
            current_text = str(current)
            phrase_text = str(phrase)
            if current_text in phrase_text or phrase_text in current_text:
                # Same run: keep whichever phrase is longer.
                if len(phrase) > len(current):
                    current = phrase
            else:
                # A new, unrelated phrase starts: flush the previous winner.
                kept.append(current)
                current = phrase
        if current is not None:
            kept.append(current)
        return kept

    def find_triplet(self, sentence):
        """Parse ``sentence`` and return ``(left, verb_phrase, right)``.

        ``verb_phrase`` is the longest verb phrase containing the root, or
        ``None`` when there is none (e.g. a sentence without a verb).
        When exactly two noun phrases remain, ``left`` and ``right`` are those
        two phrases.  Otherwise both are lists built by a word-based heuristic:
        noun phrases are assigned to ``left`` until a word of the verb phrase
        is met, the rest go to ``right``.
        """
        doc = nlp(sentence)
        verb_phrases = self.get_verb_phrases(doc)
        noun_phrases = self.find_noun_phrases(doc)

        # Handles zero (-> None), one and several candidates alike.
        verb_phrase = self.longer_verb_phrase(list(verb_phrases))

        if len(noun_phrases) > 2:
            noun_phrase = self.longer_noun_phrase(noun_phrases)
        else:
            noun_phrase = noun_phrases

        if len(noun_phrase) == 2:
            left_noun_phrase, right_noun_phrase = noun_phrase
            return (left_noun_phrase, verb_phrase, right_noun_phrase)

        # NOTE(review): the fallback below matches whitespace-separated words
        # by substring against the printed phrases, so punctuation glued to a
        # word or multi-word phrases can shift the split point.  It is kept as
        # is, only bounded so that it can no longer index past the list.
        verb_text = "" if verb_phrase is None else str(verb_phrase)
        noun_text = str(noun_phrase)
        left_noun_phrase = []
        right_noun_phrase = []
        verb_seen = False
        next_index = 0
        for word in str(sentence).split():
            # Short words are ignored when looking for the verb.
            if len(word) > 4 and word in verb_text:
                verb_seen = True
            if (not verb_seen and word in noun_text
                    and next_index < len(noun_phrase)):
                left_noun_phrase.append(noun_phrase[next_index])
                next_index += 1
            else:
                right_noun_phrase = noun_phrase[next_index:]
        return (left_noun_phrase, verb_phrase, right_noun_phrase)

    def triplets_array(self):
        """Fill ``edges``/``verbp`` per sentence and expand them into ``edges_2``/``verbp_2``.

        The four result lists are rebuilt from scratch on every call, so
        re-running the method does not duplicate edges.  A side that is a list
        of phrases is expanded into one edge per (left, right) combination,
        each carrying the sentence's verb phrase.
        """
        self.edges = []
        self.verbp = []
        self.edges_2 = []
        self.verbp_2 = []

        for sentence in self.sentences:
            left, verb, right = self.find_triplet(sentence)
            self.edges.append([left, right])
            self.verbp.append(verb)

        for (left, right), verb in zip(self.edges, self.verbp):
            left_items = left if isinstance(left, list) else [left]
            right_items = right if isinstance(right, list) else [right]
            for left_item in left_items:
                for right_item in right_items:
                    self.edges_2.append([left_item, right_item])
                    self.verbp_2.append(verb)

    def lemmatize(self):
        """Replace every phrase in ``edges_2`` and ``verbp_2`` by its ``lemma_``.

        Items without a ``lemma_`` attribute (already lemmatized strings, or a
        ``None`` verb phrase) are left untouched, so the method can be run
        more than once.
        """
        for edge in self.edges_2:
            for position, node in enumerate(edge):
                edge[position] = getattr(node, "lemma_", node)
        for position, verb in enumerate(self.verbp_2):
            self.verbp_2[position] = getattr(verb, "lemma_", verb)

    def displacy_show(self):
        """Render the dependency parse of every sentence with displaCy in the notebook."""
        for sentence in self.sentences:
            sentence = nlp(sentence)
            displacy.render(sentence, style='dep', jupyter=True)

In [5]:
class visualizer:
    """Draw a directed graph from edge pairs, with one text label per edge.

    Attributes:
        edges: sequence of two-item edges ``[source, target]``.
        labels: sequence giving the label of the edge at the same index.
        edge_labels: dict ``(source, target) -> label`` filled by
            ``edge_labels_create()`` and used by ``visualize()``.
    """

    def __init__(self, edges, edges_labels):
        self.edge_labels = {}
        self.labels = edges_labels
        self.edges = edges

    def edge_labels_create(self):
        """Map every edge (as a tuple) to the label stored at the same index.

        A later edge with the same endpoints overwrites the earlier label.
        """
        for index, edge in enumerate(self.edges):
            label = self.labels[index]
            self.edge_labels[tuple(edge)] = label

    def visualize(self):
        """Plot the graph with a spring layout and the prepared edge labels."""
        graph = nx.DiGraph()
        graph.add_edges_from(self.edges)
        layout = nx.spring_layout(graph, k=.6)

        plt.figure(figsize=(17, 15))

        # Every node is labelled with its own value.
        node_names = {name: name for name in graph.nodes()}
        node_style = dict(
            edge_color='black',
            width=1,
            linewidths=1,
            node_size=4300,
            node_color='pink',
            node_shape="o",
            alpha=1,
            arrows=True,
        )
        nx.draw(graph, layout, labels=node_names, **node_style)
        nx.draw_networkx_edge_labels(
            graph,
            layout,
            edge_labels=self.edge_labels,
            font_color='red',
        )

        plt.axis('off')
        plt.show()
            