In [1]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style word graph.

    Words that pass the part-of-speech / stop-word filter become graph nodes,
    words that co-occur inside a sliding window become edges, and a damped
    PageRank iteration assigns a weight to every word.  Call `analyze` first,
    then read the ranking with `get_keywords`.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # save keywords and its weight

    def set_stopwords(self, stopwords):
        """Flag spaCy's English stop words plus `stopwords` as stop words.

        The flag is written into the vocabulary of the module-level `nlp`
        pipeline, so it stays in effect for every later use of that pipeline.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of `doc`, the words that are keyword candidates.

        A token is kept when its `pos_` is in `candidate_pos` and it is not a
        stop word.  With `lower` set, the kept words are lower-cased.
        Sentences without any candidate contribute an empty list.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                if token.pos_ in candidate_pos and not token.is_stop:
                    selected_words.append(token.text.lower() if lower else token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Return the distinct ordered word pairs that co-occur in a window.

        Each word is paired with the up to `window_size - 1` words that follow
        it in the same sentence.  Pairs are returned in order of first
        occurrence, without duplicates.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list only keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                stop = min(i + window_size, len(sentence))
                for j in range(i + 1, stop):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return `a + a.T` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalised co-occurrence matrix of the word graph.

        Columns of words without any edge stay all zero.
        """
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        g = self.symmetrize(g)

        # Normalise by column.  `out` must be pre-filled: with `where=` alone
        # the entries of zero-sum columns would be left uninitialised.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=None):
        """Return `(word, weight)` pairs sorted by descending weight.

        `number` limits the result to the top `number` pairs; the default
        `None` returns every word.  `analyze` must have been called before.
        """
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        if number is not None:
            ranked = ranked[:max(number, 0)]
        return OrderedDict(ranked).items()

    def analyze(self, text,
                candidate_pos=['NOUN', 'PROPN'],
                window_size=4, lower=False, stopwords=list()):
        """Rank the candidate words of `text` and store the weights.

        Parameters
        ----------
        text : str
            Raw text, parsed with the module-level spaCy pipeline `nlp`.
        candidate_pos : list of str
            spaCy part-of-speech tags a word needs to become a candidate.
        window_size : int
            Co-occurrence window; see `get_token_pairs`.
        lower : bool
            Lower-case the candidates before building the graph.
        stopwords : iterable of str
            Extra stop words on top of spaCy's English list.

        The result is stored in `self.node_weight` as `{word: weight}`.
        The list defaults are never mutated here, so sharing them is safe.
        """
        self.set_stopwords(stopwords)

        doc = nlp(text)

        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words
        vocab = self.get_vocab(sentences)
        token_pairs = self.get_token_pairs(window_size, sentences)
        g = self.get_matrix(vocab, token_pairs)

        # Every word starts with weight 1.
        pr = np.ones(len(vocab))

        # Damped power iteration; stops early once the total weight settles.
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = sum(pr)
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [2]:
from gensim.models import Word2Vec, KeyedVectors # to load the model
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Narrower filter for gensim's UserWarnings; the blanket filter above already covers them.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

# Load word vectors stored in the binary word2vec format from the working directory.
# NOTE(review): the file name suggests 200-dimensional skip-gram vectors over lemmas - confirm.
model = KeyedVectors.load_word2vec_format('lemmas.sg.s200.w2v.bin', binary=True)

In [3]:
def precision(tp, fp):
    """Return the share of predicted items that are correct.

    tp: number of true positives; fp: number of false positives.
    Returns 0.0 when nothing was predicted (tp + fp == 0) instead of raising
    ZeroDivisionError.
    """
    predicted = tp + fp
    return tp / predicted if predicted else 0.0

def recall(tp, fn):
    """Return the share of expected items that were found.

    tp: number of true positives; fn: number of false negatives.
    Returns 0.0 when there was nothing to find (tp + fn == 0) instead of
    raising ZeroDivisionError.
    """
    expected = tp + fn
    return tp / expected if expected else 0.0

def f1(p, r):
    """Return the harmonic mean of precision `p` and recall `r`.

    Returns 0.0 when both are zero (no correct prediction at all) instead of
    raising ZeroDivisionError.
    """
    total = p + r
    return 2 * (p * r) / total if total else 0.0

In [4]:
# Hand-picked reference keywords: the top five, and the top ten that extend them.
real_5_keywords = {"elurikkus", "aasta", "omavalitsus", "inimene", "rohevõrgustik"}
real_10_keywords = real_5_keywords.union(
    {"hoidmine", "liik", "keskkond", "kultuur", "planeering"}
)

In [5]:
from estnltk import Text
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup as bs
import re
import html
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy
import networkx as nx

def is_substantive(word):
    """Return True when the first analysis of `word` carries the tag 'S'.

    `word` is indexed as word["analysis"][0]["partofspeech"]; each element of
    that value is compared with 'S' (presumably the noun tag of the tagger).
    """
    pos_tags = word["analysis"][0]["partofspeech"]
    return any(tag == 'S' for tag in pos_tags)

def is_adjective_or_substantive(word):
    """Return True when the first analysis of `word` carries the tag 'S' or 'A'.

    `word` is indexed as word["analysis"][0]["partofspeech"]; each element of
    that value is compared with 'S' and 'A' (presumably noun and adjective).
    """
    pos_tags = word["analysis"][0]["partofspeech"]
    return any(tag in ('S', 'A') for tag in pos_tags)

# Stop-word lemmas, one per line.  The `with` block closes the file handle,
# which the previous bare open() call left to the garbage collector.
with open("estonian-stopwords-lemmas.txt", "r", encoding="utf-8") as stop_words_file:
    stop_words = [line.strip() for line in stop_words_file]
# Set copy for fast membership tests.
set_stop_words = set(stop_words)

def _are_similar(word, other, threshold=0.5):
    """Return True when the embedding model scores the two words above `threshold`.

    Words missing from the model's vocabulary raise KeyError in gensim; they
    are treated as not similar.
    """
    try:
        return model.similarity(word, other) > threshold
    except KeyError:
        return False


def method_generator(text):
    """Extract keywords from `text` with three methods and print their scores.

    Methods:
      1. frequency of noun/adjective lemmas;
      2. TextRank weights of the lemmas, restricted to nouns/adjectives;
      3. the nouns of method 2 grouped by embedding similarity, keeping the
         highest-weighted word of every group.

    For both reference sets (5 and 10 keywords) every method's top keywords
    are printed together with precision, recall and F1.

    Parameters
    ----------
    text : analysed text object whose `words` expose
        word["analysis"][0]["lemma"] and word["analysis"][0]["partofspeech"].

    Returns
    -------
    list of list of str
        Six keyword lists: methods 1-3 for the 5-keyword set, then methods
        1-3 for the 10-keyword set.
    """
    # Method 1: plain lemma frequency; the lemmas also form the TextRank input.
    method1 = Counter()
    lemmas = []
    for word in text.words:
        lemma = word["analysis"][0]["lemma"]
        lemmas.append(lemma)
        if is_adjective_or_substantive(word):
            method1[lemma] += 1
    # Every lemma is followed by one space, exactly as the former
    # piecewise concatenation produced it.
    lemma_text = "".join(lemma + " " for lemma in lemmas)

    # Method 2: TextRank over the lemmatised text.
    tr4w = TextRank4Keyword()
    tr4w.analyze(lemma_text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False, stopwords=stop_words)
    method2 = Counter()
    noun_keywords = []  # method 2 keywords that are nouns, in ranking order
    for item, value in tr4w.get_keywords():
        # One morphological analysis per keyword serves both filters.
        first_word = Text(item).tag_analysis().words[0]
        if is_adjective_or_substantive(first_word):
            method2[item] = value
            if is_substantive(first_word):
                noun_keywords.append(item)

    # Method 3: put each noun into the first group holding a similar word,
    # or open a new group for it.
    groups = []
    for item in noun_keywords:
        for group in groups:
            if any(_are_similar(item, other) for other in group):
                group.append(item)
                break
        else:
            groups.append([item])

    # Represent every group by its highest-weighted member (first one on ties).
    method3 = Counter()
    for group in groups:
        best = [0, ""]
        for elem in group:
            potential = method2[elem]
            if potential > best[0]:
                best = [potential, elem]
        method3[best[1]] = best[0]

    sets = [(5, real_5_keywords), (10, real_10_keywords)]
    methods = [("Esimese", method1), ("Teise", method2), ("Kolmanda", method3)]
    results = []
    for count, answer in sets:

        print("\tkus otsib " + str(count) + " võtmesõna:\n")
        for prefix, method in methods:
            res = [keyword for keyword, _ in method.most_common(count)]
            results.append(res)
            print("\tAndis " + prefix + " lähenemine võtmesõnadeks:\n\t" + ", ".join(res))
            found = set(res)
            tp = len(answer & found)
            fp = len(found - answer)
            fn = len(answer - found)
            res_precision = precision(tp, fp)
            res_recall = recall(tp, fn)
            res_f1 = f1(res_precision, res_recall)
            print("\tMille täpsus on " + str(round(res_precision, 2)) + ", saagis on "  + str(round(res_recall, 2)), end="")
            print(", ning F1 skoor on " + str(round(res_f1, 2)), end="\n\n")

    return results

def lemma_scores(text):
    """Count, per lemma, its occurrences as a non-stop-word noun or adjective.

    Every lemma of `text.words` gets an entry.  An occurrence that is a noun
    or adjective and not in `set_stop_words` adds one to the lemma's score;
    any other occurrence sets the score of that lemma back to 0.
    """
    scores = Counter()
    for word in text.words:
        lemma = word["analysis"][0]["lemma"]
        is_content_word = is_adjective_or_substantive(word) and lemma not in set_stop_words
        scores[lemma] = scores[lemma] + 1 if is_content_word else 0
    return scores


def first_text_aggregation_method(nr_of_sentences, text):
    """Summarise `text` by keeping its `nr_of_sentences` best-scoring sentences.

    A sentence's score is the mean `lemma_scores` value over its words.  The
    selected sentences are joined with newlines, best first, and returned as
    a freshly analysed Text.
    """
    lemma_repetitions = lemma_scores(text)
    sentence_scores = Counter()
    for raw_sentence in text.sentence_texts:
        analysed = Text(raw_sentence).tag_analysis()
        words = analysed.words
        total = sum(lemma_repetitions[w["analysis"][0]["lemma"]] for w in words)
        sentence_scores[analysed.text] = total / len(words)
    best_sentences = [s for s, _ in sentence_scores.most_common(nr_of_sentences)]
    return Text("\n".join(best_sentences)).tag_analysis()

# def string_to_float(string):
#     return float("".join([str(ord(x)) for x in string]))

def cosine_similarity_between_lemmas(this, other):
    """Return how many items of `other` also occur in `this`.

    Despite the name this is an overlap count, not a cosine similarity:
    repeated items of `other` are counted each time, so the result is not
    symmetric in its arguments.
    """
    return sum(1 for lemma in other if lemma in this)

def second_text_aggregation_method(nr_of_sentences, text):
    """Summarise `text` by keeping its `nr_of_sentences` most central sentences.

    Sentences are nodes of a graph weighted by lemma overlap
    (`cosine_similarity_between_lemmas`); PageRank scores the nodes and the
    best-scoring sentences are joined with spaces, best first, and returned
    as a freshly analysed Text.  When fewer distinct sentences exist than
    requested, all of them are returned.
    """
    # Sentence text -> its lemmas.  Being a dict, repeated sentences collapse
    # into one entry.
    sentences_dict = dict()
    for sentence_text in text.sentence_texts:
        analysed = Text(sentence_text).tag_analysis()
        sentences_dict[sentence_text] = [w["analysis"][0]["lemma"] for w in analysed.words]

    lemma_lists = list(sentences_dict.values())
    matrix = numpy.array([
        numpy.array([cosine_similarity_between_lemmas(this, other) for this in lemma_lists])
        for other in lemma_lists
    ])
    sentence_similarity_graph = nx.from_numpy_array(matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Graph node i corresponds to the i-th sentence in insertion order.
    ranked_sentences = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences_dict)),
        reverse=True,
    )

    # Slicing, unlike indexing 0..nr_of_sentences-1, cannot run past the end.
    summarize_text = [sentence for _, sentence in ranked_sentences[:nr_of_sentences]]
    return Text(" ".join(summarize_text)).tag_analysis()


# Keyword lists of every experiment, in run order; consumed by the ranking cell.
arr = []
with open("artikkel_voru_linna_lehest.txt", "r", encoding="utf-8") as f:
    text = Text("\n".join(f.readlines())).tag_analysis()

# (heading to print, callable producing the text to analyse).  The callables
# defer the summarisation until after the heading has been printed.
experiments = (
    ("Täis teksti meetodiga, |",
     lambda: text),
    ("Esimese tekstikokkuvõtmise algoritmiga, kus valiti 5 lauset, |",
     lambda: first_text_aggregation_method(5, text)),
    ("Esimese tekstikokkuvõtmise algoritmiga, kus valiti 10 lauset, |",
     lambda: first_text_aggregation_method(10, text)),
    ("Teise tekstikokkuvõtmise algoritmiga, kus valiti 5 lauset, |",
     lambda: second_text_aggregation_method(5, text)),
    ("Teise tekstikokkuvõtmise algoritmiga, kus valiti 10 lauset, |",
     lambda: second_text_aggregation_method(10, text)),
)
for heading, build_text in experiments:
    print(heading)
    arr += method_generator(build_text())


Täis teksti meetodiga, |
	kus otsib 5 võtmesõna:

	Andis Esimese lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene
	Mille täpsus on 0.8, saagis on 0.8, ning F1 skoor on 0.8

	Andis Teise lähenemine võtmesõnadeks:
	elurikkus, aasta, omavalitsus, inimene, liik
	Mille täpsus on 0.8, saagis on 0.8, ning F1 skoor on 0.8

	Andis Kolmanda lähenemine võtmesõnadeks:
	elurikkus, aasta, omavalitsus, koht, kord
	Mille täpsus on 0.6, saagis on 0.6, ning F1 skoor on 0.6

	kus otsib 10 võtmesõna:

	Andis Esimese lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene, kohalik, liik, uuring, rohevõrgustik, vald
	Mille täpsus on 0.6, saagis on 0.6, ning F1 skoor on 0.6

	Andis Teise lähenemine võtmesõnadeks:
	elurikkus, aasta, omavalitsus, inimene, liik, bioloog, koht, rohevõrgustik, lind, maa
	Mille täpsus on 0.6, saagis on 0.6, ning F1 skoor on 0.6

	Andis Kolmanda lähenemine võtmesõnadeks:
	elurikkus, aasta, omavalitsus, koht, kord, väärtus, teadvus, võimalus, v

In [6]:
# Index 0 holds the statistics of the 5-keyword lists, index 1 those of the
# 10-keyword lists.
positions = [Counter(), Counter()]  # summed list positions per keyword
counts = [Counter(), Counter()]     # number of lists containing the keyword

# Lists are told apart purely by their length.
short = [keywords for keywords in arr if len(keywords) == 5]
long = [keywords for keywords in arr if len(keywords) == 10]

for position_sum, occurrences, cases in zip(positions, counts, (short, long)):
    for case in cases:
        for rank, keyword in enumerate(case):
            position_sum[keyword] += rank
            occurrences[keyword] += 1

# Score = list size minus the keyword's mean position, so earlier is better.
total = [Counter(), Counter()]
for score, position_sum, occurrences, reps in zip(total, positions, counts, (5, 10)):
    for keyword in occurrences:
        score[keyword] = reps - position_sum[keyword] / occurrences[keyword]

for reps, score in zip((5, 10), total):
    print(str(reps) + "st kõige sagedasemad võtmesõnad on: ")
    print(", ".join(keyword for keyword, _ in score.most_common(reps)))
    print()

5st kõige sagedasemad võtmesõnad on: 
elurikkus, aasta, linn, inimene, omavalitsus

10st kõige sagedasemad võtmesõnad on: 
elurikkus, aasta, inimene, omavalitsus, linn, taim, sotsiaalne, toetav, hoidmine, artikkel



Hinnang:

---

Lõppkokkuvõteks
 - Sõna "uuring" oli huvitav, et tuli esimeseks - anomaalia
 - Esimene tekstikokkuvõtte meetod andis erinevaid tulemusi võrreldes teise kokkuvõtva meetodiga
 - kõik 3 erinevat võtmesõnade eraldamise algoritmi andsid väga sarnaseid tulemusi jälle
 - Kokkuvõtvate algoritmide F1 indeksid ei olnud kõige paremad. Aga suuresti võib olla see tingitud sellest, et minu poolt valitud võtmesõnad pole just kõige paremad esindamaks antud teksti
 - "elurikkus", "aasta", "linn", "inimene" ning "omavalitsus" on läbivalt top 5 hulgas. Need 5 on kindlalt head võtmesõnad
 - Teine kokkuvõttev meetod tahab väga sõna "artikkel" panna võtmesõnaks. Isiklikult ma ei arva, et see peaks olema seal
 - Päris palju on kordi, kus võtmesõnana on kasutatud näiteks "planeering", "planeerimispraktika" või "planeerimine". Lugeja võib mõtliskleda, et kas võiks iseenesest neid üheks lugeda.


