In [4]:
### Required modules
from zipfile import ZipFile
import os
%run CollocationsPaket.ipynb

In [14]:
## import all text files saved in texts.zip, and merge them into one output file output-leicht.txt

## important: make sure that all files in texts.zip are not empty, in our case it meant deleting the following files: 
# Deutsch perfekt 12_2018, 12_2019, and all Deutsch perfekt starting from 2011, as they were all empty



## Loop to open all text files in the texts.zip and merge them into one final file output-leicht.txt
with open('output-leicht.txt', 'w', encoding ='utf-8') as output_file:
    with ZipFile("texts.zip", "r") as z:
         # Iterate through each folder in the zip file
        z.extractall()
        # Iterate through each file in the zip
        for file in z.namelist(): 
            # append the file content
            with open(file, 'r', encoding ='utf-8') as f:
                ## Merge only the easy texts (A1 and A2)
                if "leicht" in f.name:
                    output_file.write(f.read())
             
##
## Import final file as textoutput
with open('output-leicht.txt', encoding = 'utf-8') as text:
    textoutput = text.read()

In [15]:

def toppminotfiltered(ngram,n):
    """
    toppminotfiltered collects top ngrams when measured by pointwise mutual information (PMI), 
    with their respective PMI values and amount of occurences
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.pmi, n):
            pmi = finderb.score_ngram(bigram_measures.pmi, bigram[0], bigram[1])
            rf = finderb.ngram_fd[bigram]
            topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.pmi, n):
            pmi = findert.score_ngram(trigram_measures.pmi, trigram[0], trigram[1], trigram[2])
            rf = findert.ngram_fd[trigram]
            topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.pmi, n):
            pmi = finderf.score_ngram(fourgram_measures.pmi, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
            rf = finderf.ngram_fd[fourgram]
            topn.append((fourgram, pmi, rf))
    return topn


def toplikelihoodnotfiltered(ngram, n):
    """
    toplikelihoodnotfiltered collects top ngrams when measured by pointwise mutual information (PMI), 
    with their respective Likelihood Ratio values and amount of occurences
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.likelihood_ratio, n):
            pmi = finderb.score_ngram(bigram_measures.likelihood_ratio, bigram[0], bigram[1])
            rf = finderb.ngram_fd[bigram]
            topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.likelihood_ratio, n):
            pmi = findert.score_ngram(trigram_measures.likelihood_ratio, trigram[0], trigram[1], trigram[2])
            rf = findert.ngram_fd[trigram]
            topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.likelihood_ratio, n):
            pmi = finderf.score_ngram(fourgram_measures.likelihood_ratio, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
            rf = finderf.ngram_fd[fourgram]
            topn.append((fourgram, pmi, rf))
    return topn

  


def topfreqnotfiltered(ngram, n):
    """
    topfreqnotfiltered collects top ngrams when measured by pointwise mutual information (PMI), 
    with their respective Frequency values and amount of occurences
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.score_ngrams(bigram_measures.raw_freq)[:n]:
            rf = finderb.ngram_fd[bigram[0]]
            topn.append(bigram + (rf,))
                
    if (ngram == "trigram"):
        for trigram in findert.score_ngrams(trigram_measures.raw_freq)[:n]:
            rf = findert.ngram_fd[trigram[0]]
            topn.append(trigram + (rf,))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.score_ngrams(fourgram_measures.raw_freq)[:n]:
            rf = finderf.ngram_fd[fourgram[0]]
            topn.append(fourgram + (rf,))
                
    return topn


### Print amount of ngrams

def topnsummarynotfiltered(ngram, n):
    """
    topfreqnotfiltered counts length of output list of functions toppminotfiltered, toplikelihoodnotfiltered
    and topfreqnotfiltered applied on same parameters
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: print of output length 
    """
    
    print("Top " + str(n) + ngram + " values not filtered\n\nPMI:\n")
    print(str(len(toppminotfiltered(ngram, n))) + "\n")
    print("\n\nLikelihoodratio:\n") 
    print(str(len(toplikelihoodnotfiltered(ngram, n))) + "\n")           
    print("\n\n Frequency:\n")         
    print(str(len(topfreqnotfiltered(ngram, n))) + "\n")


In [16]:
### FILTER OUT 'GIBT'

### FOR THE SAKE OF ESTIMATING THE CODE'S LIMITATIONS: filter out entries with 'gibt' and count output length

#### Functions that return top specified ngrams for three different association measures  
#### with their respective measure values and amount of occurences
#### and attempts to filter out entries with 'gibt'

### The three association measures are Pointwise Mutual Information, Likelihood Ratio and Frequency

### PMI: degree of association between words by comparing observed co-occurrence frequency with 
### expected co-occurrence frequency if they were independent 

### Likelihood Ratio: statistical measure indicating the strength of association between words
## further details: https://stackoverflow.com/questions/21165702/nltk-collocations-for-specific-words

### Normalized frequency: calculates and normalizes the raw frequency of an ngram, thus the appearance probability

def toppminogibt(ngram,n):
    """
    toppminogibt collects top ngrams when measured by pointwise mutual information (PMI), 
    with their respective PMI values and amount of occurences and then filters out entries with 'gibt'
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.pmi, n):
            ## entries with 'gibt' are not included
            if not('gibt' in bigram):
                pmi = finderb.score_ngram(bigram_measures.pmi, bigram[0], bigram[1])
                rf = finderb.ngram_fd[bigram]
                topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.pmi, n):
            ## entries with 'gibt' are not included
            if not ('gibt' in trigram):
                pmi = findert.score_ngram(trigram_measures.pmi, trigram[0], trigram[1], trigram[2])
                rf = findert.ngram_fd[trigram]
                topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.pmi, n):
            ## entries with 'gibt' are not included
            if not('gibt' in fourgram):
                pmi = finderf.score_ngram(fourgram_measures.pmi, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                rf = finderf.ngram_fd[fourgram]
                topn.append((fourgram, pmi, rf))
    return topn


def toplikelihoodnogibt(ngram, n):
    """
    toplikelihoodnogibt collects top ngrams when measured by Likelihood Ratio, 
    with their respective Likelihood Ratio values and amount of occurences and then filters out entries with 'gibt'
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.likelihood_ratio, n):
            ## entries with 'gibt' are not included
            if not('gibt' in bigram):
                pmi = finderb.score_ngram(bigram_measures.likelihood_ratio, bigram[0], bigram[1])
                rf = finderb.ngram_fd[bigram]
                topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.likelihood_ratio, n):
            ## entries with 'gibt' are not included
            if not('gibt' in trigram):
                pmi = findert.score_ngram(trigram_measures.likelihood_ratio, trigram[0], trigram[1], trigram[2])
                rf = findert.ngram_fd[trigram]
                topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.likelihood_ratio, n):
            ## entries with 'gibt' are not included
            if not('gibt' in fourgram):
                pmi = finderf.score_ngram(fourgram_measures.likelihood_ratio, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                rf = finderf.ngram_fd[fourgram]
                topn.append((fourgram, pmi, rf))
    return topn

  


def topfreqnogibt(ngram, n):
    """
    topfreqnogibt collects top ngrams when measured by Frequency, 
    with their respective Frequency values and amount of occurences and then filters out entries with 'gibt'
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.score_ngrams(bigram_measures.raw_freq)[:n]:
            ## entries with 'gibt' are not included
            if not('gibt' in bigram):
                rf = finderb.ngram_fd[bigram[0]]
                topn.append(bigram + (rf,))
                
    if (ngram == "trigram"):
        for trigram in findert.score_ngrams(trigram_measures.raw_freq)[:n]:
            ## entries with 'gibt' are not included
            if not('gibt' in trigram):
                rf = findert.ngram_fd[trigram[0]]
                topn.append(trigram + (rf,))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.score_ngrams(fourgram_measures.raw_freq)[:n]:
            ## entries with 'gibt' are not included
            if not('gibt' in fourgram):
                rf = finderf.ngram_fd[fourgram[0]]
                topn.append(fourgram + (rf,))
                
    return topn


### Print amount of ngrams

def topnsummarynogibt(ngram, n):
    """
    topnsummarynogibt counts length of output list of functions toppminogibt, toplikelihoodnogibt
    and topfreqnogibt applied on same parameters
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: print of output length 
    """
    print("Top " + str(n) + ngram + " values no 'gibt'\n\nPMI:\n")
    print(str(len(toppminogibt(ngram, n))) + "\n")
    print("Likelihoodratio:\n") 
    print(str(len(toplikelihoodnogibt(ngram, n))) + "\n")           
    print("Frequency:\n")         
    print(str(len(topfreqnogibt(ngram, n))) + "\n") 
                          


In [17]:
### FILTER OUT NAMED ENTITIES

### FOR THE SAKE OF ESTIMATING THE CODE'S LIMITATIONS: filter out entries with Named Entities and count output length


#### Functions that return top specified ngrams for three different association measures  
#### with their respective measure values and amount of occurences
#### and attempts to filter out entries with Named Person/Location entities

### The three association measures are Pointwise Mutual Information, Likelihood Ratio and Frequency

### PMI: degree of association between words by comparing observed co-occurrence frequency with 
### expected co-occurrence frequency if they were independent 

### Likelihood Ratio: statistical measure indicating the strength of association between words
## further details: https://stackoverflow.com/questions/21165702/nltk-collocations-for-specific-words

### Normalized frequency: calculates and normalizes the raw frequency of an ngram, thus the appearance probability

def toppminone(ngram,n):
    """
    toppminone collects top ngrams when measured by pointwise mutual information (PMI), 
    with their respective PMI values and amount of occurences and then filters out entries with Named Entities
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.pmi, n):
            ## entries with named entities are not included
            if not(bigram in entities):
                pmi = finderb.score_ngram(bigram_measures.pmi, bigram[0], bigram[1])
                rf = finderb.ngram_fd[bigram]
                topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.pmi, n):
            ## entries with named entities are not included
            if not (trigram in entities):
                pmi = findert.score_ngram(trigram_measures.pmi, trigram[0], trigram[1], trigram[2])
                rf = findert.ngram_fd[trigram]
                topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.pmi, n):
            ## entries with named entities are not included
            if not(fourgram in entities):
                pmi = finderf.score_ngram(fourgram_measures.pmi, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                rf = finderf.ngram_fd[fourgram]
                topn.append((fourgram, pmi, rf))
    return topn

def toplikelihoodnone(ngram, n):
    """
    toplikelihoodnone collects top ngrams when measured by Likelihood Ratio, 
    with their respective Likelihood Ratio values and amount of occurences and then filters out entries with Named Entities
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.nbest(bigram_measures.likelihood_ratio, n):
            ## entries with named entities are not included
            if not(bigram in entities):
                pmi = finderb.score_ngram(bigram_measures.likelihood_ratio, bigram[0], bigram[1])
                rf = finderb.ngram_fd[bigram]
                topn.append((bigram, pmi, rf))
                
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.likelihood_ratio, n):
            ## entries with named entities are not included
            if not(trigram in entities):
                pmi = findert.score_ngram(trigram_measures.likelihood_ratio, trigram[0], trigram[1], trigram[2])
                rf = findert.ngram_fd[trigram]
                topn.append((trigram, pmi, rf))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.likelihood_ratio, n):
            ## entries with named entities are not included
            if not(fourgram in entities):
                pmi = finderf.score_ngram(fourgram_measures.likelihood_ratio, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                rf = finderf.ngram_fd[fourgram]
                topn.append((fourgram, pmi, rf))
    return topn

  

def topfreqnone(ngram, n):
    """
    topfreqnone collects top ngrams when measured by Frequency, 
    with their respective Frequency values and amount of occurences and then filters out entries with Named Entities
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: list of three-component-tuples 
    """
    
    topn = []
    if (ngram == "bigram"):
        for bigram in finderb.score_ngrams(bigram_measures.raw_freq)[:n]:
        ## entries with named entities are not included
            if not(bigram in entities):
                rf = finderb.ngram_fd[bigram[0]]
                topn.append(bigram + (rf,))
                
    if (ngram == "trigram"):
        for trigram in findert.score_ngrams(trigram_measures.raw_freq)[:n]:
            ## entries with named entities are not included
            if not(trigram in entities):
                rf = findert.ngram_fd[trigram[0]]
                topn.append(trigram + (rf,))
                
    if (ngram == "fourgram"):
        for fourgram in finderf.score_ngrams(fourgram_measures.raw_freq)[:n]:
            ## entries with named entities are not included
            if not(fourgram in entities):
                rf = finderf.ngram_fd[fourgram[0]]
                topn.append(fourgram + (rf,))
                
    return topn


### Print amount of ngrams

def topnsummarynone(ngram, n):
    """
    topnsummarynone counts length of output list of functions toppminogibt, toplikelihoodnogibt
    and topfreqnogibt applied on same parameters
    
    :ngram: string specifying whether to return top bigrams, trigrams or fourgrams
    :n: amount of top entries function initially looks at 
    :return: print of output length 
    """
    
    print("Top " + str(n) + ngram + "values no named entities \n\nPMI:\n")
    print(str(len(toppminone(ngram, n))) + "\n")
    print("Likelihoodratio:\n") 
    print(str(len(toplikelihoodnone(ngram, n))) + "\n")           
    print("Frequency:\n")         
    print(str(len(topfreqnone(ngram, n))) + "\n") 



In [18]:
### RESULT OF COMPARISON: produce all the different versions 
# (nothing filtered out, gibt filtered out, named entities filtered out, both filtered out)
### and compare respective amounts to view how well the recognition and filtering works

### Amount of ngrams, nothing filtered:
topnsummarynotfiltered("bigram", 200)
topnsummarynotfiltered("trigram", 200)
topnsummarynotfiltered("fourgram", 200)

### Amount of ngrams, 'gibt' filtered out:
topnsummarynogibt("bigram", 200)
topnsummarynogibt("trigram", 200)
topnsummarynogibt("fourgram", 200)

### Amount of ngrams, named entities filtered out:
topnsummarynone("bigram", 200)
topnsummarynone("trigram", 200)
topnsummarynone("fourgram", 200)

### Amount of ngrams, both 'gibt' and named entities filtered out, imported from other notebook
printsummary("bigram", 200)
printsummary("trigram", 200)
printsummary("fourgram", 200)


### SUMMARY:
## For 'gibt', the code filtered out: 
# 2 bigrams in Likelihood
# 119 trigrams in Likelihood
# 3 fourgrams in PMI 
# 79 fourgrams in likelihood

# For Named Entities, the code filtered out:
# 58 bigrams in PMI
# 7 bigrams in Likelihood
# 11 trigrams in PMI
# 2 fourgrams in PMI

## In topnsummary, the code filtered out (-> corresponds to [filtered out 'gibt'] + [filtered out NE]?):
# 58 bigrams in PMI -> corresponds to 58 + 0!
# 9 bigrams in Likelihood  -> corresponds to 2 + 7!
# 11 trigrams in PMI -> corresponds to 0 + 11!
# 119 trigrams in Likelihood -> corresponds to 119 + 0!
# 5 fourgrams in PMI -> corresponds to 3 + 2!
# 79 fourgrams in Likelihood -> corresponds to 0 + 79!


## Thus, the code filtered out 67 bigrams, 130 trigrams, and 84 fourgrams

Top 200bigram values not filtered

PMI:

200



Likelihoodratio:

200



 Frequency:

200

Top 200trigram values not filtered

PMI:

200



Likelihoodratio:

200



 Frequency:

200

Top 200fourgram values not filtered

PMI:

200



Likelihoodratio:

200



 Frequency:

200

Top 200bigram values no 'gibt'

PMI:

200

Likelihoodratio:

198

Frequency:

200

Top 200trigram values no 'gibt'

PMI:

200

Likelihoodratio:

81

Frequency:

200

Top 200fourgram values no 'gibt'

PMI:

197

Likelihoodratio:

121

Frequency:

200

Top 200bigramvalues no named entities 

PMI:

142

Likelihoodratio:

193

Frequency:

200

Top 200trigramvalues no named entities 

PMI:

189

Likelihoodratio:

200

Frequency:

200

Top 200fourgramvalues no named entities 

PMI:

198

Likelihoodratio:

200

Frequency:

200

Top 200bigram values gibt and named entities filtered out 

PMI:

142

Likelihoodratio:

191

Frequency:

200

Top 200trigram values gibt and named entities filtered out 

PMI:

189

Likelihoodrati

In [None]:
### PROPOSAL: NOT YET WORKING
### to make len(list of ngrams) = n: create "while" loop that continues appending until len(topn) = n

### didn't find end of loop, maybe the intial search set should have been set differently
### proposal here is applied on the likelihood function, could be used analogically for the others:
def toplikelihood(ngram, n):
    topn = []
    if (ngram == "bigram"):
        ### take the top 4*n entries as set to search in for, so that you can fill up the topn even if there are a lot of ngrams with 'gibt' and named entities
         for bigram in finderb.nbest(bigram_measures.likelihood_ratio, 4 * n):
            while(len(topn) != n):
                ## entries with 'gibt' and named entities are not included
                if not(('gibt' in bigram) or (bigram in entities)):
                    pmi = finderb.score_ngram(bigram_measures.likelihood_ratio, bigram[0], bigram[1])
                    rf = finderb.ngram_fd[bigram]
                    topn.append((bigram, pmi, rf))
            return topn
    if (ngram == "trigram"):
        for trigram in findert.nbest(trigram_measures.likelihood_ratio, 4 * n):
            ### take the top 4*n entries as set to search in for, so that you can fill up the topn even if there are a lot of ngrams with 'gibt' and named entities
            while(len(topn) != n):
                ## entries with 'gibt' and named entities are not included
                if not(('gibt' in trigram) or (trigram in entities)):
                    pmi = findert.score_ngram(trigram_measures.likelihood_ratio, trigram[0], trigram[1], trigram[2])
                    rf = findert.ngram_fd[trigram]
                    topn.append((trigram, pmi, rf))
            return topn
    if (ngram == "fourgram"):
        for fourgram in finderf.nbest(fourgram_measures.likelihood_ratio, 4 * n):
            ### take the top 4*n entries as set to search in for, so that you can fill up the topn even if there are a lot of ngrams with 'gibt' and named entities
            while(len(topn) != n):
                ## entries with 'gibt' and named entities are not included
                if not (('gibt' in fourgram) or (fourgram in entities)):
                    pmi = finderf.score_ngram(fourgram_measures.likelihood_ratio, fourgram[0], fourgram[1], fourgram[2], fourgram[3])
                    rf = finderf.ngram_fd[fourgram]
                    topn.append((fourgram, pmi, rf))  
        return topn



In [19]:
## Demonstration of why it was useful to filter out some collocations with 'gibt', 
## -> many entries include 'gibt' but don't vary in terms of used grammar
findert.nbest(trigram_measures.likelihood_ratio, 10)


[('So', 'gibt', 'es'),
 ('gibt', 'es', 'auch'),
 ('gibt', 'es', 'dort'),
 ('Deshalb', 'gibt', 'es'),
 ('Außerdem', 'gibt', 'es'),
 ('gibt', 'es', 'in'),
 ('gibt', 'es', 'viele'),
 ('Dort', 'gibt', 'es'),
 ('gibt', 'es', 'noch'),
 ('gibt', 'es', 'nicht')]

In [20]:
#### Tests:
## for all relevant functions (without "gibt" and named entities), sorted by ngrams, Top 10, 
## Likelihoodratio is empty for Trigrams and Fourgrams because top entries all included 'gibt' or named entities

### Bigrams
print("\n PMI:\n")
print(toppmi("bigram", 10))
print("\n Likelihoodratio:\n")
print(toplikelihood("bigram", 10))
print("\n Frequency \n")
print(topfreq("bigram", 10))

### Trigrams
print("\n PMI:\n")
print(toppmi("trigram", 10))
print("\n Likelihoodratio:\n")
print(toplikelihood("trigram", 10))
print("\n Frequency \n")
print(topfreq("trigram", 10))

### Fourgrams
print("\n PMI:\n")
print(toppmi("fourgram", 10))
print("\n Likelihoodratio:\n")
print(toplikelihood("fourgram", 10))
print("\n Frequency \n")
print(topfreq("fourgram", 10))



 PMI:

[(('Alfred', 'Wegener'), 16.289406551091517, 2), (('Arbeitsmarktund', 'Berufsforschung'), 16.289406551091517, 2), (('Carnap', 'Bornheim'), 16.289406551091517, 2), (('Championship', 'Cheese'), 16.289406551091517, 2), (('Eisernen', 'Thron'), 16.289406551091517, 2), (('European', 'XFEL'), 16.289406551091517, 2), (('Farsi', 'Darsi'), 16.289406551091517, 2)]

 Likelihoodratio:

[(('zum', 'Beispiel'), 3220.186103516634, 296), (('in', 'Deutschland'), 1654.9763181443818, 276), (('habe', 'ich'), 1645.2500606576757, 211), (('in', 'der'), 1163.5406630672592, 448), (('mehr', 'als'), 1091.8136815725675, 157), (('Ich', 'habe'), 1042.1775676944062, 130), (('ein', 'bisschen'), 956.5199926305581, 110), (('in', 'den'), 898.5558082782275, 274)]

 Frequency 

[(('gibt', 'es'), 0.0033398257047968635, 535), (('in', 'der'), 0.0027967138612130745, 448), (('zum', 'Beispiel'), 0.0018478288011586387, 296), (('in', 'Deutschland'), 0.0017229755037830548, 276), (('in', 'den'), 0.0017104901740454966, 274), (

In [21]:
### Use to view top n summary here instead of having to open the text files:

## Top 200 Bigrams: 
print("\n Top 200 Bigrams without gibt and Named Entities\n")
topnsummary("bigram", 200)
with open('top200bigrams.txt', encoding = 'utf-8') as text:
    print(text.read())


## Top 200 Trigrams:
print("\n Top 200 Trigrams without gibt and Named Entities\n")
topnsummary("trigram", 200)
with open('top200trigrams.txt', encoding = 'utf-8') as text:
    print(text.read())
    
## Top 200 Fourgrams:
print("\n Top 200 Fourgrams without gibt and Named Entities\n") 
topnsummary("fourgram", 200)
with open('top200fourgrams.txt', encoding = 'utf-8') as text:
    print(text.read())



 Top 200 Bigrams without gibt and Named Entities

Top 200 values

PMI:
(('Alfred', 'Wegener'), 16.289406551091517, 2)
(('Arbeitsmarktund', 'Berufsforschung'), 16.289406551091517, 2)
(('Carnap', 'Bornheim'), 16.289406551091517, 2)
(('Championship', 'Cheese'), 16.289406551091517, 2)
(('Eisernen', 'Thron'), 16.289406551091517, 2)
(('European', 'XFEL'), 16.289406551091517, 2)
(('Farsi', 'Darsi'), 16.289406551091517, 2)
(('Further', 'Drachenstich'), 16.289406551091517, 2)
(('Nationalen', 'Waffenregister'), 16.289406551091517, 2)
(('Ostasien', 'Ensemble'), 16.289406551091517, 2)
(('Russell', 'Terrier'), 16.289406551091517, 2)
(('Saalfelder', 'Feengrotten'), 16.289406551091517, 2)
(('Schwarzwälder', 'Kirschtorte'), 16.289406551091517, 2)
(('Soap', 'Company'), 16.289406551091517, 2)
(('Trachycarpus', 'fortunei'), 16.289406551091517, 2)
(('Waldshut', 'Tiengen'), 16.289406551091517, 2)
(('allgemeine', 'Kreditsicherung'), 16.289406551091517, 2)
(('Bayerische', 'Vertretung'), 15.70444405037036, 2

Top 200 values

PMI:
(('Championship', 'Cheese', 'Contest'), 31.99385060146188, 2)
(('Brooklyn', 'Soap', 'Company'), 31.578813102183034, 2)
(('Dinner', 'for', 'One'), 31.40888810074072, 2)
(('Georg', 'Brauchle', 'Ring'), 31.40888810074072, 2)
(('Go', 'Sing', 'Choir'), 31.408888100740718, 3)
(('Want', 'It', 'At'), 31.408888100740718, 3)
(('Kleingärtnerverein', 'Dr', 'Schreber'), 30.99385060146188, 2)
(('It', 'At', 'All'), 30.993850601461876, 3)
(('World', 'Championship', 'Cheese'), 30.77145818012543, 2)
(('Des', 'Kaisers', 'Nachtigall'), 30.578813102183034, 3)
(('Alfred', 'Wegener', 'Instituts'), 30.40888810074072, 2)
(('Ausg', 'steckt', 'is'), 30.186495679404274, 2)
(('Ocean', 's', 'Seven'), 30.119381483545737, 2)
(('fund', 'v', 'Laute'), 30.08696000585336, 2)
(('Don', 't', 'Want'), 29.993850601461876, 3)
(('t', 'Want', 'It'), 29.993850601461876, 3)
(('Automobil', 'Clubs', 'ADAC'), 29.93495691240831, 2)
(('Herzog', 'Eberhard', 'Ludwig'), 29.671922506574514, 2)
(('Web', 'Dokumentation',

Top 200 values

PMI:
(('Want', 'It', 'At', 'All'), 46.698294651832235, 3)
(('World', 'Championship', 'Cheese', 'Contest'), 46.47590223049579, 2)
(('Don', 't', 'Want', 'It'), 45.698294651832235, 3)
(('t', 'Want', 'It', 'At'), 45.698294651832235, 3)
(('can', 't', 'get', 'enough'), 44.86821965327455, 4)
(('just', 'can', 't', 'get'), 44.86821965327455, 4)
(('Oh', 'death', 'by', 'sex'), 43.639400962778666, 2)
(('Allgemeinen', 'Deutschen', 'Automobil', 'Clubs'), 41.50189743902874, 2)
(('Spectrum', 'of', 'the', 'Seas'), 41.305977229053475, 6)
(('Deutschen', 'Automobil', 'Clubs', 'ADAC'), 40.17996934414137, 2)
(('Ulrich', 'Eichstädt', 'vom', 'Verband'), 39.77199865705113, 2)
(('Schutzgemeinschaft', 'für', 'allgemeine', 'Kreditsicherung'), 39.58513130025055, 2)
(('Web', 'Dokumentation', 'Worldwide', 'Berlin'), 39.46734021699237, 3)
(('Claus', 'von', 'Carnap', 'Bornheim'), 39.42683162392787, 2)
(('hohe', 'Dosis', 'Medikamente', 'gegeben'), 38.87386621641569, 2)
(('Post', 'ins', 'Blankeneser', 'T

In [22]:
### for looking at specific Named Entities, e.g. all locations

for e in set(doc.ents): 
    if (e.label_ == 'LOC'): 
        print(e.text) 
    


Krankenhäusern
Wakeboarden
Sachsen
Portugiese
Spanier
Buschenschänke
Deutschland
Hessen
Südafrika
Schnee
Dresden
Deutschland
Kleinen Alster
Russen
Bayern
Lebensmitteln
Zahl
Hamburg
Halligen
Bewohner
Hamburg
Chiemsee
Spanien
Deutschland
Iran
Wiesau
Adrenalin-Junkies
Großküchen
dritten Platz
Bodensee
Giesing
Deutschland
Kleine Gärten und Einfamilienhäuser
Hessen
Land
Siena
China
Deutschland
Wasserpflanzen-Sammlung
Schengen-Visum
Gadheim
Marzipan
Steiermark
München
Nürnberger
Mittagspause
Berlin
B1-Niveau
Europa
Deutschland
Gasthofs
Teleclinic
Nordsee
Hause
München
Deutschland
Indio auf dem Coachella
Konsequenzund
Büro
Frauenparkplätze
Portugal
Deutschland
Müritz
Deutschland
Niedersachsens
Harmonie
Wremer Heimatkreis
Straße
Deutschland
Bremen
Deutschland
Rock im Park
Berlin
Potsdamer Platz
England
Lina
Brandenburger Tor
Asien
Sorge
Ostsee
Millionen
Füssen
Deutschland
Aescher
Mars
Thyssen-Krupp-Stahlwerk Schwelgern
Rom
Alte Utting
Stadt
Watt
Basel
Frankfurt Teil
Berlin
Roten Rathaus
Hagen
