In [1]:
import nltk

In [2]:
from nltk.collocations import *


In [3]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()

In [10]:
## importing output-file.txt, if it was created already with box below
with open('output-leicht.txt', encoding = 'utf-8') as text:
    textoutput = text.read()

In [8]:
## ONLY HAS TO BE DONE ONCE: import all text files saved in texts.zip, and merge them into one big file
## important: make sure that all files in texts.zip are not empty, in this case it meant deleting the following files: 
# Deutsch perfekt 12_2018, 12_2019, and all Deutsch perfekt starting from 2011, as they were all empty

# importing required modules 
from zipfile import ZipFile
import os

## Loop to open all text files in the texts.zip and merge them into one final file output-leicht.txt
with open('output-leicht.txt', 'w', encoding ='utf-8') as output_file:
    with ZipFile("texts.zip", "r") as z:
         # Iterate through each folder in the zip file
        z.extractall()
        # Iterate through each file in the zip
        for file in z.namelist(): 
            # append the file content
            with open(file, 'r', encoding ='utf-8') as f:
                ## Merge only the easy texts (A1 and A2)
                if "leicht" in f.name:
                    output_file.write(f.read())
             
##
## Import final file as textoutput
with open('output-leicht.txt', encoding = 'utf-8') as text:
    textoutput = text.read()

In [11]:
### make textoutput processable for CollocationFinder functions
text = nltk.wordpunct_tokenize(textoutput)
##Filter out punctuation marks etc
tokens = [word for word in text if word.isalpha()]

## Create finders
finderb = BigramCollocationFinder.from_words(tokens)
findert = TrigramCollocationFinder.from_words(tokens)
finderf = QuadgramCollocationFinder.from_words(tokens)

## optional: filter, so that only those n-grams remain that appear 2+ times
finderb.apply_freq_filter(2)
findert.apply_freq_filter(2)
finderf.apply_freq_filter(2)


In [13]:
### Named Entity Recognition (NER) preparation
import spacy
# Load the spaCy language model
nlp = spacy.load("de_core_news_sm")

# NLP model is applied to processable part of textoutput (maximal length is 1000000)
doc = nlp(textoutput[:1000000])
## if len(textoutput) > 1000000 the document has to be extended (potentially with doc3, doc4, etc., view next box) 
doc2 = nlp(textoutput[1000000:])

    

In [12]:
### Depending on len(doc)
'''doc3 = nlp(textoutput[2000000:3000000])
entities = entities + [tuple(nltk.wordpunct_tokenize(ent.text)) for ent in doc3.ents if ent.label_ in {'PER', 'LOC'}]
doc4 = nlp(textoutput[3000000:4000000])
entities = entities + [tuple(nltk.wordpunct_tokenize(ent.text)) for ent in doc4.ents if ent.label_ in {'PER', 'LOC'}]
doc5 = nlp(textoutput[4000000:])
entities = entities + [tuple(nltk.wordpunct_tokenize(ent.text)) for ent in doc5.ents if ent.label_ in {'PER', 'LOC'}]'''

In [14]:
## collect all named entities that are a location or person in one set
entities = [tuple(nltk.wordpunct_tokenize(ent.text)) for ent in doc.ents if ent.label_ in {'PER', 'LOC'}]
# if len(textoutput) > 1000000, this has to be adjusted/added with doc 3, doc etc. (view box above)
entities = entities + [tuple(nltk.wordpunct_tokenize(ent.text)) for ent in doc2.ents if ent.label_ in {'PER', 'LOC'}]



In [113]:
#### Functions for different association measures, all of which return lists with ngrams, measure values and amount of occurences

#### Attempt to filter out collocations with 'gibt' and named entities, 
## to make other collocations visible/ list shorter -> doesn't filter out all 'gibt' but some of them

### PMI: degree of association between words by comparing observed co-occurrence frequency with 
### expected co-occurrence frequency if they were independent -> less relevant
## for a specified ngram-type, return the top n n-grams concerning its PMI value
def toppmi(ngram,n):
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.pmi, n):
            ## entries with 'gibt' and named entities are not included
            if not('gibt' in bigram or bigram in entities):
                pmi = finderb.score_ngram(bigram_measures.pmi, bigram[0], bigram[1])
                rf = finderb.ngram_fd[bigram]
                topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.pmi, n):
            ## entries with 'gibt' and named entities are not included
            if not ('gibt' in trigram or trigram in entities):
                pmi = findert.score_ngram(trigram_measures.pmi, trigram[0], trigram[1], trigram[2])
                rf = findert.ngram_fd[trigram]
                topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.pmi, n):
            ## entries with 'gibt' and named entities are not included
            if not('gibt' in fourgram or fourgram in entities):
                pmi = finderf.score_ngram(fourgram_measures.pmi, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                rf = finderf.ngram_fd[fourgram]
                topn.append((fourgram, pmi, rf))
    return topn

### Likelihood: 10 N-grams with highest likelihood (=statistical measure indicating the strength of association between words)
## for a specified ngram-type, return the top n n-grams concerning its likelihood ratio
## further details: https://stackoverflow.com/questions/21165702/nltk-collocations-for-specific-words
def toplikelihood(ngram, n):
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.likelihood_ratio, n):
            ## entries with 'gibt' and named entities are not included
            if not('gibt' in bigram or bigram in entities):
                pmi = finderb.score_ngram(bigram_measures.likelihood_ratio, bigram[0], bigram[1])
                rf = finderb.ngram_fd[bigram]
                topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.likelihood_ratio, n):
            ## entries with 'gibt' and named entities are not included
            if not('gibt' in trigram or trigram in entities):
                pmi = findert.score_ngram(trigram_measures.likelihood_ratio, trigram[0], trigram[1], trigram[2])
                rf = findert.ngram_fd[trigram]
                topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.likelihood_ratio, n):
            ## entries with 'gibt' and named entities are not included
            if not('gibt' in fourgram or fourgram in entities):
                pmi = finderf.score_ngram(fourgram_measures.likelihood_ratio, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                rf = finderf.ngram_fd[fourgram]
                topn.append((fourgram, pmi, rf))
    return topn

  

### Normalized frequency: calculates and normalizes the raw frequency of an ngram, thus the appearance probability, 
## sorted first by frequency and second alphabetically [(tuple, rf value, frequency), ...]
def topfreq(ngram, n):
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.score_ngrams(bigram_measures.raw_freq)[:n]:
        ## Einträge mit mit 'gibt' und named entities herausfiltern
            if not('gibt' in bigram or bigram in entities):
                rf = finderb.ngram_fd[bigram[0]]
                topn.append(bigram + (rf,))
                
    if (ngram == "trigram"):
        for trigram in findert.score_ngrams(trigram_measures.raw_freq)[:n]:
            ## Sachen mit 'gibt' und named entities herausfiltern
            if not('gibt' in trigram or trigram in entities):
                rf = findert.ngram_fd[trigram[0]]
                topn.append(trigram + (rf,))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.score_ngrams(fourgram_measures.raw_freq)[:n]:
            ## Sachen mit 'gibt' und named entities herausfiltern
            if not(('gibt' in fourgram)  or (fourgram in entities)):
                rf = finderf.ngram_fd[fourgram[0]]
                topn.append(fourgram + (rf,))
                
    return topn


### For given ngram: creates new file based on top n entries according to each measure value, 
### writes them down, listed beneath each other, in new text file called "top[n][ngram]s.txt"
### Important: amount of included entries in final document != n, 
### since the entries with 'gibt' and named entities are counted but not written down.

def topnsummary(ngram, n):
    with open('top' + str(n) + str(ngram) + 's.txt', 'w', encoding ='utf-8') as output_file:
        output_file.write("Top " + str(n) + " values\n\nPMI:\n")
        for i in toppmi(ngram, n): output_file.write(str(i) + "\n")
        output_file.write("\n\nLikelihoodratio:\n") 
        for i in toplikelihood(ngram, n): output_file.write(str(i) + "\n")                  
        output_file.write("\n\n Top values Frequency:\n")
        for i in topfreq(ngram, n): output_file.write(str(i) + "\n")
                          




In [None]:
### proposal: to make len(list of ngrams) = n -> "while" loop that continues appending until len(topn) = n
### didn't work (didn't find end of loop, maybe the intial search set should have been set differently)
### but this was the idea for the function as used on the likelihood function, could be used analogically for the others:
'''def toplikelihood(ngram, n):
    topn = []
    if (ngram == "bigram"):
        ### take the top 4*n entries as set to search in for, so that you can fill up the topn even if there are a lot of ngrams with 'gibt' and named entities
         for bigram in finderb.nbest(bigram_measures.likelihood_ratio, 4 * n):
            while(len(topn) != n):
                ## entries with 'gibt' and named entities are not included
                if not(('gibt' in bigram) or (bigram in entities)):
                    pmi = finderb.score_ngram(bigram_measures.likelihood_ratio, bigram[0], bigram[1])
                    rf = finderb.ngram_fd[bigram]
                    topn.append((bigram, pmi, rf))
            return topn
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.likelihood_ratio, 4 * n):
            ### take the top 4*n entries as set to search in for, so that you can fill up the topn even if there are a lot of ngrams with 'gibt' and named entities
            while(len(topn) != n):
                ## entries with 'gibt' and named entities are not included
                if not(('gibt' in trigram) or (trigram in entities)):
                    pmi = findert.score_ngram(trigram_measures.likelihood_ratio, trigram[0], trigram[1], trigram[2])
                    rf = findert.ngram_fd[trigram]
                    topn.append((trigram, pmi, rf))
            return topn
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.likelihood_ratio, 4 * n):
            ### take the top 4*n entries as set to search in for, so that you can fill up the topn even if there are a lot of ngrams with 'gibt' and named entities
            while(len(topn) != n):
                ## entries with 'gibt' and named entities are not included
                if not (('gibt' in fourgram) or (fourgram in entities)):
                    pmi = finderf.score_ngram(fourgram_measures.likelihood_ratio, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                    rf = finderf.ngram_fd[fourgram]
                    topn.append((fourgram, pmi, rf))  
        return topn



In [114]:
## Demonstration of why it was useful to filter out some collocations with 'gibt', 
## -> many entries include 'gibt' but don't vary in terms of used grammar
# findert.nbest(trigram_measures.likelihood_ratio, 10)


In [115]:
#### Tests:
## for all relevant functions (without "gibt" and named entities), sorted by ngrams, Top 10), 
## Likelihoodratio is empty for Trigrams and Fourgrams because top entries all included 'gibt' or named entities
'''
### Bigrams
print("\n PMI:\n")
print(toppmi("bigram", 10))
print("\n Likelihoodratio:\n")
print(toplikelihood("bigram", 10))
print("\n Frequency \n")
print(topfreq("bigram", 10))

### Trigrams
print("\n PMI:\n")
print(toppmi("trigram", 10))
print("\n Likelihoodratio:\n")
print(toplikelihood("trigram", 10))
print("\n Frequency \n")
print(topfreq("trigram", 10))

### Fourgrams
print("\n PMI:\n")
print(toppmi("fourgram", 10))
print("\n Likelihoodratio:\n")
print(toplikelihood("fourgram", 10))
print("\n Frequency \n")
print(topfreq("fourgram", 10))
'''



'\n### Bigrams\nprint("\n PMI:\n")\nprint(toppmi("bigram", 10))\nprint("\n Likelihoodratio:\n")\nprint(toplikelihood("bigram", 10))\nprint("\n Frequency \n")\nprint(topfreq("bigram", 10))\n\n### Trigrams\nprint("\n PMI:\n")\nprint(toppmi("trigram", 10))\nprint("\n Likelihoodratio:\n")\nprint(toplikelihood("trigram", 10))\nprint("\n Frequency \n")\nprint(topfreq("trigram", 10))\n\n### Fourgrams\nprint("\n PMI:\n")\nprint(toppmi("fourgram", 10))\nprint("\n Likelihoodratio:\n")\nprint(toplikelihood("fourgram", 10))\nprint("\n Frequency \n")\nprint(topfreq("fourgram", 10))\n'

In [116]:
### Produce the files with the top 200 ngrams:
topnsummary("bigram", 200)
topnsummary("trigram", 200)
topnsummary("fourgram", 200)

In [117]:
### Use to view top n summary here instead of open text files:

## Top 200 Bigrams: 
print("\n Top 200 Bigrams without gibt and Named Entities\n")
topnsummary("bigram", 200)
with open('top200bigrams.txt', encoding = 'utf-8') as text:
    print(text.read())


## Top 200 Trigrams:
print("\n Top 200 Trigrams without gibt and Named Entities\n")
topnsummary("trigram", 200)
with open('top200trigrams.txt', encoding = 'utf-8') as text:
    print(text.read())
    
## Top 200 Fourgrams:
print("\n Top 200 Fourgrams without gibt and Named Entities\n") 
topnsummary("fourgram", 200)
with open('top200fourgrams.txt', encoding = 'utf-8') as text:
    print(text.read())



 Top 200 Bigrams with gibt and Named Entities


 Top 200 Bigrams without gibt and Named Entities

Top 200 values

PMI:
(('Alfred', 'Wegener'), 16.289406551091517, 2)
(('Arbeitsmarktund', 'Berufsforschung'), 16.289406551091517, 2)
(('Carnap', 'Bornheim'), 16.289406551091517, 2)
(('Championship', 'Cheese'), 16.289406551091517, 2)
(('Eisernen', 'Thron'), 16.289406551091517, 2)
(('European', 'XFEL'), 16.289406551091517, 2)
(('Farsi', 'Darsi'), 16.289406551091517, 2)
(('Further', 'Drachenstich'), 16.289406551091517, 2)
(('Nationalen', 'Waffenregister'), 16.289406551091517, 2)
(('Ostasien', 'Ensemble'), 16.289406551091517, 2)
(('Russell', 'Terrier'), 16.289406551091517, 2)
(('Saalfelder', 'Feengrotten'), 16.289406551091517, 2)
(('Schwarzwälder', 'Kirschtorte'), 16.289406551091517, 2)
(('Soap', 'Company'), 16.289406551091517, 2)
(('Trachycarpus', 'fortunei'), 16.289406551091517, 2)
(('Waldshut', 'Tiengen'), 16.289406551091517, 2)
(('allgemeine', 'Kreditsicherung'), 16.289406551091517, 2)
(('

Top 200 values

PMI:
(('Championship', 'Cheese', 'Contest'), 31.99385060146188, 2)
(('Brooklyn', 'Soap', 'Company'), 31.578813102183034, 2)
(('Dinner', 'for', 'One'), 31.40888810074072, 2)
(('Georg', 'Brauchle', 'Ring'), 31.40888810074072, 2)
(('Go', 'Sing', 'Choir'), 31.408888100740718, 3)
(('Want', 'It', 'At'), 31.408888100740718, 3)
(('Kleingärtnerverein', 'Dr', 'Schreber'), 30.99385060146188, 2)
(('It', 'At', 'All'), 30.993850601461876, 3)
(('World', 'Championship', 'Cheese'), 30.77145818012543, 2)
(('Des', 'Kaisers', 'Nachtigall'), 30.578813102183034, 3)
(('Alfred', 'Wegener', 'Instituts'), 30.40888810074072, 2)
(('Ausg', 'steckt', 'is'), 30.186495679404274, 2)
(('Ocean', 's', 'Seven'), 30.119381483545737, 2)
(('fund', 'v', 'Laute'), 30.08696000585336, 2)
(('Don', 't', 'Want'), 29.993850601461876, 3)
(('t', 'Want', 'It'), 29.993850601461876, 3)
(('Automobil', 'Clubs', 'ADAC'), 29.93495691240831, 2)
(('Herzog', 'Eberhard', 'Ludwig'), 29.671922506574514, 2)
(('Web', 'Dokumentation',

Top 200 values

PMI:
(('Want', 'It', 'At', 'All'), 46.698294651832235, 3)
(('World', 'Championship', 'Cheese', 'Contest'), 46.47590223049579, 2)
(('Don', 't', 'Want', 'It'), 45.698294651832235, 3)
(('t', 'Want', 'It', 'At'), 45.698294651832235, 3)
(('can', 't', 'get', 'enough'), 44.86821965327455, 4)
(('just', 'can', 't', 'get'), 44.86821965327455, 4)
(('Oh', 'death', 'by', 'sex'), 43.639400962778666, 2)
(('Allgemeinen', 'Deutschen', 'Automobil', 'Clubs'), 41.50189743902874, 2)
(('Spectrum', 'of', 'the', 'Seas'), 41.305977229053475, 6)
(('Deutschen', 'Automobil', 'Clubs', 'ADAC'), 40.17996934414137, 2)
(('Ulrich', 'Eichstädt', 'vom', 'Verband'), 39.77199865705113, 2)
(('Schutzgemeinschaft', 'für', 'allgemeine', 'Kreditsicherung'), 39.58513130025055, 2)
(('Web', 'Dokumentation', 'Worldwide', 'Berlin'), 39.46734021699237, 3)
(('Claus', 'von', 'Carnap', 'Bornheim'), 39.42683162392787, 2)
(('hohe', 'Dosis', 'Medikamente', 'gegeben'), 38.87386621641569, 2)
(('Post', 'ins', 'Blankeneser', 'T

In [64]:
### for looking at specific Named Entities, e.g. all locations

'''for e in set(doc.ents): 
    if (e.label_ == 'LOC'): 
        print(e.text) '''
    


Düsseldorf
Feuerwehrpieper
Spectre
Dorfkäserei Fritzenhaus bei Bern
Stadt
Berg am Laim
Guarda
Schwarzwald
Wildspitze
Deutsche Museum
Ostsee
Kilometeran
der Schweiz
la hinzugefügt
Schlösser Hohenschwangau
Klar
Bio-Lebensmittel
Clara-Zetkin-Park
Deutschland
Deutschland
Lemgo
Verbraucherzentralen
großen Maximilianstraße
Deutschland
Olpe
Ostalpen
bayerischen Alpen
Speck
Bayern
Zahl
Japaner
Teneriffa
Dänemark
Heidelberg
Dein Deutsch
Zermatt
Deutschland
YouTube-Tutorials
Ostsee
Kanal
Bremen
Österreich
Bio-Lebensmittel
Berlin
Optiker
Mecklenburg-Vorpommern
Sachsen
Niedersachsen
Berlin
Hessen
Kulturschock
Wissen
Mecklenburg
Montagvormittag
Stück
Tostedt
Altpapiertonne
Molitor
Attendorf
Königssee
Bächle
Marzipan
Bode-Museum
Scanner
Gscheid
Deutschland
Passau
Sachsen-Anhalt
Sankt Petersburg
Frankreich
Polen
Herrschaftsgebieten
Pazifik
Weinbauer
Grünanlagen der Stadt
Silvretta
Brasilien
Einsiedeln
Schneeberg
Piep
US-Amerikaner
Ova Lavirun
Regen
Österreich
Berlin
Stadt
Musikmachen
Siut
Polen
Schwe