In [1]:
import nltk

In [2]:
from nltk.collocations import *


In [3]:
# One association-measure provider per n-gram order (2, 3 and 4 words);
# the scoring functions passed to the collocation finders are read from these.
bigram_measures, trigram_measures, fourgram_measures = (
    nltk.collocations.BigramAssocMeasures(),
    nltk.collocations.TrigramAssocMeasures(),
    nltk.collocations.QuadgramAssocMeasures(),
)

In [4]:
## Read output-leicht.txt, which contains all text files labelled as beginner-level,
## into memory as one single string.
with open('output-leicht.txt', encoding='utf-8') as corpus_file:
    textoutput = corpus_file.read()

In [5]:
### Turn the raw corpus string into a token list the CollocationFinder classes accept
text = nltk.wordpunct_tokenize(textoutput)
## Keep purely alphabetic tokens only (drops punctuation marks, numbers etc.)
tokens = list(filter(str.isalpha, text))

## One collocation finder per n-gram order, all built from the same token list
finderb, findert, finderf = (
    BigramCollocationFinder.from_words(tokens),
    TrigramCollocationFinder.from_words(tokens),
    QuadgramCollocationFinder.from_words(tokens),
)

## optional: discard every n-gram that occurs fewer than 2 times
findert.apply_freq_filter(2)
finderf.apply_freq_filter(2)
finderb.apply_freq_filter(2)


In [6]:
### Named Entity Recognition (NER) preparation
import spacy

# spaCy's small German pipeline; its recognised entities are read from the
# resulting documents further below.
nlp = spacy.load("de_core_news_sm")

# A single nlp() call accepts at most 1000000 characters. The corpus is
# 1051064 characters long, so it is processed as two slices: `doc` covers the
# first 1000000 characters, `doc2` the remainder.
# NOTE(review): the cut is a plain character offset, so a word or entity that
# straddles position 1000000 is split between the two documents.
doc, doc2 = nlp(textoutput[:1000000]), nlp(textoutput[1000000:])

    

In [8]:
## Gather every named entity labelled as person ('PER') or location ('LOC') from
## both parsed halves of the corpus. Each entity is stored as a tuple of its
## tokens so it can be compared directly against n-gram tuples.
entities = [
    tuple(nltk.wordpunct_tokenize(ent.text))
    for parsed in (doc, doc2)
    for ent in parsed.ents
    if ent.label_ in {'PER', 'LOC'}
]



In [29]:
#### Functions for different association measures, all of which return lists of (n-gram, measure value, number of occurrences)

#### Attempt to filter out collocations with 'gibt' and named entities, 
## to make other collocations visible/ list shorter -> doesn't filter out all 'gibt' but some of them


### PMI: degree of association between words by comparing observed co-occurrence frequency with 
### expected co-occurrence frequency if they were independent -> less relevant
## for a specified ngram-type, return the top n n-grams concerning its PMI value
def toppmi(ngram, n):
    """Return the top-``n`` PMI-ranked n-grams, minus 'gibt' and named-entity hits.

    Args:
        ngram: Which finder to query: "bigram", "trigram" or "fourgram".
            Any other value yields an empty list.
        n: How many of the best-scored n-grams to consider *before* filtering,
            so the result may contain fewer than ``n`` entries.

    Returns:
        A list of ``(ngram_tuple, pmi_score, count)`` tuples in the finder's
        ranking order, where ``count`` is read from the finder's ``ngram_fd``.
    """
    # Pick the finder/measure pair lazily so only the requested ones are touched.
    if ngram == "bigram":
        finder, score_fn = finderb, bigram_measures.pmi
    elif ngram == "trigram":
        finder, score_fn = findert, trigram_measures.pmi
    elif ngram == "fourgram":
        finder, score_fn = finderf, fourgram_measures.pmi
    else:
        # Unknown n-gram types keep producing an empty result, as before.
        return []

    # Set membership is constant-time; `entities` is a (potentially long) list.
    excluded = set(entities)

    # score_ngrams() already yields the ranked (ngram, score) pairs, so there is
    # no need to ask nbest() for the n-grams and then re-score each one.
    # The 'gibt' test is an exact, case-sensitive token match.
    return [
        (candidate, score, finder.ngram_fd[candidate])
        for candidate, score in finder.score_ngrams(score_fn)[:n]
        if 'gibt' not in candidate and candidate not in excluded
    ]

### Likelihood ratio: statistical measure indicating the strength of association between words
## for a specified n-gram type, return the top n n-grams ranked by their likelihood ratio
## further details: https://stackoverflow.com/questions/21165702/nltk-collocations-for-specific-words
def toplikelihood(ngram, n):
    """Return the top-``n`` n-grams by likelihood ratio, minus 'gibt' and named-entity hits.

    Args:
        ngram: Which finder to query: "bigram", "trigram" or "fourgram".
            Any other value yields an empty list.
        n: How many of the best-scored n-grams to consider *before* filtering,
            so the result may contain fewer than ``n`` entries.

    Returns:
        A list of ``(ngram_tuple, likelihood_ratio, count)`` tuples in the
        finder's ranking order, where ``count`` is read from the finder's
        ``ngram_fd``.
    """
    # Pick the finder/measure pair lazily so only the requested ones are touched.
    if ngram == "bigram":
        finder, score_fn = finderb, bigram_measures.likelihood_ratio
    elif ngram == "trigram":
        finder, score_fn = findert, trigram_measures.likelihood_ratio
    elif ngram == "fourgram":
        finder, score_fn = finderf, fourgram_measures.likelihood_ratio
    else:
        # Unknown n-gram types keep producing an empty result, as before.
        return []

    # Set membership is constant-time; `entities` is a (potentially long) list.
    excluded = set(entities)

    # score_ngrams() already yields the ranked (ngram, score) pairs, so there is
    # no need to ask nbest() for the n-grams and then re-score each one.
    # The 'gibt' test is an exact, case-sensitive token match.
    return [
        (candidate, ratio, finder.ngram_fd[candidate])
        for candidate, ratio in finder.score_ngrams(score_fn)[:n]
        if 'gibt' not in candidate and candidate not in excluded
    ]

  

### Normalized frequency: calculates and normalizes the raw frequency of an ngram, thus the appearance probability, 
## sorted first by frequency and second alphabetically [(tuple, rf value, frequency), ...]
def topfreq(ngram, n):
    """Return the top-``n`` n-grams by raw frequency, minus 'gibt' and named-entity hits.

    Args:
        ngram: Which finder to query: "bigram", "trigram" or "fourgram".
            Any other value yields an empty list.
        n: How many of the best-scored n-grams to consider *before* filtering,
            so the result may contain fewer than ``n`` entries.

    Returns:
        A list of ``(ngram_tuple, raw_freq_score, count)`` tuples in the
        finder's ranking order, where ``count`` is read from the finder's
        ``ngram_fd``.
    """
    # Pick the finder/measure pair lazily so only the requested ones are touched.
    if ngram == "bigram":
        finder, score_fn = finderb, bigram_measures.raw_freq
    elif ngram == "trigram":
        finder, score_fn = findert, trigram_measures.raw_freq
    elif ngram == "fourgram":
        finder, score_fn = finderf, fourgram_measures.raw_freq
    else:
        # Unknown n-gram types keep producing an empty result, as before.
        return []

    # Set membership is constant-time; `entities` is a (potentially long) list.
    excluded = set(entities)

    # score_ngrams() yields (ngram, score) PAIRS. The filter must inspect the
    # n-gram itself: testing the pair (as the previous version did) can never
    # match 'gibt' or an entity tuple, so nothing was ever filtered out.
    # The 'gibt' test is an exact, case-sensitive token match.
    return [
        (candidate, score, finder.ngram_fd[candidate])
        for candidate, score in finder.score_ngrams(score_fn)[:n]
        if 'gibt' not in candidate and candidate not in excluded
    ]


### For given ngram: creates new file based on top n entries according to each measure value, 
### writes them down, listed beneath each other, in new text file called "top[n][ngram]s.txt"
### Important: the number of entries in the final document may be smaller than n,
### since entries containing 'gibt' and named entities count towards n but are not written out.

def topnsummary(ngram, n):
    """Write the filtered top-``n`` lists of all three measures to a text file.

    The file is created in the current working directory (overwriting any
    existing one) and is named ``top<n><ngram>s.txt``. It holds one section
    each for PMI, likelihood ratio and frequency, with one entry per line.

    Args:
        ngram: N-gram type, forwarded to toppmi, toplikelihood and topfreq.
        n: Number of top-ranked candidates requested from each measure.
    """
    report_name = f"top{n}{ngram}s.txt"
    with open(report_name, 'w', encoding='utf-8') as output_file:
        output_file.write(f"Top {n} values\n\nPMI:\n")
        output_file.writelines(str(row) + "\n" for row in toppmi(ngram, n))

        output_file.write("\n\nLikelihoodratio:\n")
        output_file.writelines(str(row) + "\n" for row in toplikelihood(ngram, n))

        output_file.write("\n\n Top values Frequency:\n")
        output_file.writelines(str(row) + "\n" for row in topfreq(ngram, n))
                          
### Exact amount, for result of running  with n = 200: view cell no 18.
def printsummary(ngram, n):
    """Print, per measure, how many of the top-``n`` entries survive filtering.

    Args:
        ngram: N-gram type, forwarded to toppmi, toplikelihood and topfreq;
            must be a string because it is spliced into the headline.
        n: Number of top-ranked candidates requested from each measure.
    """
    headline = "".join(
        ("Top ", str(n), ngram, " values gibt and named entities filtered out \n\nPMI:\n")
    )
    print(headline)
    print(f"{len(toppmi(ngram, n))}\n")

    # The remaining two measures share the same "label, then count" layout.
    for label, ranker in (("Likelihoodratio:\n", toplikelihood), ("Frequency:\n", topfreq)):
        print(label)
        print(f"{len(ranker(ngram, n))}\n")



In [13]:
### Write one summary file per n-gram order, each based on the top 200 candidates
topnsummary(ngram="bigram", n=200)
topnsummary(ngram="trigram", n=200)
topnsummary(ngram="fourgram", n=200)