In [1]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style word graph.

    Words that pass the POS/stop-word filter become graph nodes, words that
    co-occur inside a sliding window become undirected edges, and a damped
    power iteration assigns every node a weight. Call ``analyze`` first and
    then ``get_keywords`` to read the ranking.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # word -> weight mapping, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag the given words and spaCy's STOP_WORDS as stop words.

        This sets ``is_stop`` on lexemes of the module-level ``nlp.vocab``,
        so the effect is shared by every user of that pipeline object.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Keep only the non-stop-word tokens whose POS tag is in candidate_pos.

        Returns one list of words per sentence of ``doc`` (a sentence without
        any candidate yields an empty list). Words are lower-cased when
        ``lower`` is truthy.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the ordered list of unique (word, later_word) pairs.

        A pair is produced for every two words of the same sentence that are
        fewer than ``window_size`` positions apart.
        """
        token_pairs = []
        seen = set()  # constant-time duplicate check; the list keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, min(i + window_size, len(sentence))):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return a + a.T with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the symmetric co-occurrence matrix, normalized by column.

        Columns whose sum is zero (words without any neighbour) stay all-zero.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be supplied: with `where=`
        # alone NumPy leaves the skipped entries uninitialized.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Return the ``number`` highest-weighted (word, weight) pairs.

        The pairs are ordered by descending weight. ``number=None`` returns
        every word; an empty view is returned if ``analyze`` has not run yet.
        """
        weights = self.node_weight or {}
        ranked = sorted(weights.items(), key=lambda t: t[1], reverse=True)
        return OrderedDict(ranked[:number]).items()

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Parse ``text``, rank its candidate words and store the weights.

        Args:
            text: raw text handed to the module-level spaCy pipeline ``nlp``.
            candidate_pos: POS tags a token must carry to become a node.
            window_size: co-occurrence window used by ``get_token_pairs``.
            lower: lower-case the words when truthy.
            stopwords: extra stop words, added on top of spaCy's STOP_WORDS.

        The result is stored in ``self.node_weight``; nothing is returned.
        """
        # Set stop words
        self.set_stopwords(stopwords or ())

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or steps run out
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            current_total = pr.sum()
            if abs(previous_total - current_total) < self.min_diff:
                break
            previous_total = current_total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [2]:
from gensim.models import Word2Vec, KeyedVectors # to load the model
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

model = KeyedVectors.load_word2vec_format('lemmas.sg.s200.w2v.bin', binary=True)

In [3]:
def precision(tp, fp):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted.

    Args:
        tp: number of true positives.
        fp: number of false positives.
    """
    predicted = tp + fp
    if predicted == 0:
        # No predictions at all: report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    return tp / predicted

def recall(tp, fn):
    """Return tp / (tp + fn), or 0.0 when there are no expected positives.

    Args:
        tp: number of true positives.
        fn: number of false negatives.
    """
    expected = tp + fn
    if expected == 0:
        # Empty gold set: report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    return tp / expected

def f1(p, r):
    """Return the harmonic mean of precision ``p`` and recall ``r``.

    Returns 0.0 when both are zero (no overlap with the gold set), which
    would otherwise raise ZeroDivisionError.
    """
    denominator = p + r
    if denominator == 0:
        return 0.0
    return 2 * (p * r) / denominator

In [4]:
# Hand-picked gold keywords for the article: the five most important ones...
real_5_keywords = {"elurikkus", "aasta", "omavalitsus", "inimene", "rohevõrgustik"}
# ...and the ten-keyword gold set, which extends the five above.
real_10_keywords = real_5_keywords.union(
    {"hoidmine", "liik", "keskkond", "kultuur", "planeering"}
)

In [34]:
from estnltk import Text
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup as bs
import re
import html
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy
import networkx as nx
from gensim import corpora, models
import gensim


def is_text(word):
    """Return True when the first analysis of ``word`` carries no 'Z' tag."""
    tags = word["analysis"][0]["partofspeech"]
    return {"Z"}.isdisjoint(tags)

def is_substantive(word):
    """Return True when the first analysis of ``word`` carries the 'S' tag."""
    tags = word["analysis"][0]["partofspeech"]
    return not {"S"}.isdisjoint(tags)

def is_adjective_or_substantive(word):
    """Return True when the first analysis of ``word`` carries 'S' or 'A'."""
    tags = word["analysis"][0]["partofspeech"]
    return not {"S", "A"}.isdisjoint(tags)

# Load the stop-word lemmas, one per line. The context manager closes the
# file handle, which the bare open(...).readlines() call never did.
with open("estonian-stopwords-lemmas.txt", "r", encoding="utf-8") as stop_words_file:
    stop_words = [line.strip() for line in stop_words_file]
# Set copy of the same lemmas for constant-time membership tests.
set_stop_words = set(stop_words)

def method_generator(text):
    """Extract keywords with three methods and print their scores.

    For each gold set (5 and 10 keywords) the "TextRank", LDA and LSA
    keyword lists are printed together with precision, recall and F1
    against that gold set.

    Args:
        text: analysed text object; ``text.words`` must yield items whose
            ``["analysis"][0]`` holds ``"lemma"`` and ``"partofspeech"``.

    Returns:
        Six keyword lists in the order TextRank/LDA/LSA for 5 keywords,
        then TextRank/LDA/LSA for 10 keywords.
    """
    # Frequency of noun lemmas; this Counter is what the "TextRank" row
    # of the evaluation actually ranks (see NOTE below).
    textRank = Counter()
    lemmas = []
    un_stopped_text = []

    for word in text.words:
        lemma = word["analysis"][0]["lemma"]
        lemmas.append(lemma)
        # Set lookup instead of scanning the stop-word list for every token.
        if lemma not in set_stop_words and is_text(word):
            un_stopped_text.append(lemma)

        if is_substantive(word):
            textRank[lemma] += 1

    # Every lemma followed by one space, built in a single pass.
    lemma_text = "".join(lemma + " " for lemma in lemmas)

    # NOTE(review): the weights computed here are never read below - the
    # "TextRank" results come from the noun-frequency Counter above. The call
    # is kept because it also updates stop-word flags on the shared spaCy vocab.
    tr4w = TextRank4Keyword()
    tr4w.analyze(lemma_text, candidate_pos=['NOUN'], window_size=4, lower=False, stopwords=stop_words)

    # LDA and LSA are trained on the same single-document corpus, so the
    # dictionary and bag-of-words are built once and shared.
    documents = [un_stopped_text]
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1, id2word=dictionary, passes=20)
    lsamodel = gensim.models.LsiModel(corpus, num_topics=1, id2word=dictionary)

    def get_text_rank_words_from_model(model, nr_of_words):
        """Return the nr_of_words most common keys of the Counter ``model``."""
        # Uses the argument itself; the old body read the enclosing loop
        # variable `method` and only worked by accident.
        return [keyword for keyword, _ in model.most_common(nr_of_words)]

    def get_words_from_model(model, nr_of_words):
        """Return the quoted words from the first topic printed by ``model``."""
        topic_description = model.print_topics(num_words=nr_of_words)[0][1]
        return [term.split('"')[1] for term in topic_description.split("+")]

    sets = [(5, real_5_keywords), (10, real_10_keywords)]
    methods = [("TextRank", textRank, get_text_rank_words_from_model),
               ("LDA", ldamodel, get_words_from_model),
               ("LSA", lsamodel, get_words_from_model)]
    results = []
    for count, answer in sets:

        print("\tkus otsib " + str(count) + " võtmesõna:\n")
        for prefix, method, resolver in methods:
            res = resolver(method, count)
            results.append(res)
            print("\tAndis " + prefix + " lähenemine võtmesõnadeks:\n\t" + ", ".join(res))
            found = set(res)
            tp = len(answer & found)
            fp = len(found - answer)
            fn = len(answer - found)
            res_precision = precision(tp, fp)
            res_recall = recall(tp, fn)
            res_f1 = f1(res_precision, res_recall)
            print("\tMille täpsus on " + str(round(res_precision, 2)) + ", saagis on "  + str(round(res_recall, 2)), end="")
            print(", ning F1 skoor on " + str(round(res_f1, 2)), end="\n\n")

    return results

In [61]:
# Keyword lists returned by method_generator; stays empty if the run fails.
arr = []
with open("artikkel_voru_linna_lehest.txt", "r", encoding="utf-8") as article_file:
    # readlines() keeps each line ending, so joining with "\n" puts an extra
    # newline between lines - presumably intended, kept as is.
    article_source = "\n".join(article_file.readlines())
    text = Text(article_source).tag_analysis()
    print("Täis teksti meetodiga, |")
    arr = method_generator(text)

Täis teksti meetodiga, |
	kus otsib 5 võtmesõna:

	Andis TextRank lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene
	Mille täpsus on 0.8, saagis on 0.8, ning F1 skoor on 0.8

	Andis LDA lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene
	Mille täpsus on 0.8, saagis on 0.8, ning F1 skoor on 0.8

	Andis LSA lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene
	Mille täpsus on 0.8, saagis on 0.8, ning F1 skoor on 0.8

	kus otsib 10 võtmesõna:

	Andis TextRank lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene, liik, loodus, rohevõrgustik, vald, uuring
	Mille täpsus on 0.6, saagis on 0.6, ning F1 skoor on 0.6

	Andis LDA lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene, kohalik, sageli, liik, uuring, elama
	Mille täpsus on 0.5, saagis on 0.5, ning F1 skoor on 0.5

	Andis LSA lähenemine võtmesõnadeks:
	elurikkus, linn, aasta, omavalitsus, inimene, kohalik, sageli, liik, elama, vald
	Mil

In [62]:
# Split the results by requested size: index 0 = 5 keywords, 1 = 10 keywords.
short = list(filter(lambda keywords: len(keywords) == 5, arr))
long = list(filter(lambda keywords: len(keywords) == 10, arr))

# Per size: summed rank of every keyword and the number of lists it is in.
positions = [Counter(), Counter()]
counts = [Counter(), Counter()]
for rank_sums, occurrences, keyword_lists in zip(positions, counts, (short, long)):
    for keyword_list in keyword_lists:
        for rank, keyword in enumerate(keyword_list):
            rank_sums[keyword] += rank
            occurrences[keyword] += 1

# Score = list size minus the keyword's average rank, so keywords that are
# ranked earlier on average score higher.
total = [Counter(), Counter()]
for scores, rank_sums, occurrences, reps in zip(total, positions, counts, (5, 10)):
    for keyword in occurrences:
        scores[keyword] = reps - rank_sums[keyword] / occurrences[keyword]

for scores, reps in zip(total, (5, 10)):
    print(str(reps) + "st kõige sagedasemad võtmesõnad on: ")
    print(", ".join([keyword for keyword, _ in scores.most_common(reps)]))
    print()

5st kõige sagedasemad võtmesõnad on: 
elurikkus, linn, aasta, omavalitsus, inimene

10st kõige sagedasemad võtmesõnad on: 
elurikkus, linn, aasta, omavalitsus, inimene, kohalik, loodus, sageli, liik, rohevõrgustik



In [63]:
import pandas as pd

# Each entry holds the 5-keyword result first and the 10-keyword result second.
real_keywords = [real_5_keywords, real_10_keywords]
text_rank_keywords = [short[0], long[0]]
lda_keywords = [short[1], long[1]]
lsa_keywords = [short[2], long[2]]

combinations = [
    ("kuldvõtmesõnade", real_keywords),
    ("TextRank", text_rank_keywords),
    ("LSA", lsa_keywords),
    ("LDA", lda_keywords),
]

# One row per unordered pair of sources and per keyword-set size: the share
# of the first source's keywords that also occur in the second source.
data = []
for i, (first_name, first) in enumerate(combinations[:-1]):
    for second_name, second in combinations[i + 1:]:
        for first_xs, second_xs in zip(first, second):
            shared = sum(1 for x in first_xs if x in second_xs)
            overlap = shared / len(first_xs)
            data.append([first_name, second_name, len(first_xs), str(overlap * 100) + "%"])

pd.DataFrame(data, columns=["method 1", "method 2", "nr of keywords", "overlap"])

Unnamed: 0,method 1,method 2,nr of keywords,overlap
0,kuldvõtmesõnade,TextRank,5,80.0%
1,kuldvõtmesõnade,TextRank,10,60.0%
2,kuldvõtmesõnade,LSA,5,80.0%
3,kuldvõtmesõnade,LSA,10,50.0%
4,kuldvõtmesõnade,LDA,5,80.0%
5,kuldvõtmesõnade,LDA,10,50.0%
6,TextRank,LSA,5,100.0%
7,TextRank,LSA,10,70.0%
8,TextRank,LDA,5,100.0%
9,TextRank,LDA,10,70.0%


Hinnang:

---

Lõppkokkuvõteks
 - Kõik 3 erinevat võtmesõnade eraldamise meetodit andsid väga sarnaseid tulemusi, aga TextRank andis kõige paremaid tulemusi.
 - Kokkuvõtvate algoritmide F1 indeksid ei olnud kõige paremad. Aga suuresti võib see olla tingitud sellest, et minu poolt valitud võtmesõnad pole just kõige paremad esindamaks antud teksti.
 - "elurikkus", "aasta", "linn", "inimene" ning "omavalitsus" on läbivalt top 5 hulgas. Need 5 on kindlalt head võtmesõnad.
 - LSA ning LDA annavad väga sarnaseid tulemusi, kuid erinevaid kuldvõtmesõnadest ning TextRank võtmesõnadest.
 - Selle nädala tulemused on kindlasti palju paremad kui eelmise nädala omad. Ehk siis LDA ning LSA on head vahendid võtmesõnade eraldamiseks.

