In [None]:
import re # Regular expressions
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm # Shows a smart progress meter - wrap iterable with tqdm(iterable)!
import collections 
from collections import Counter, defaultdict

import nltk
from nltk import bigrams, trigrams, ngrams
from nltk.tokenize import word_tokenize 
# requirement: nltk.download("punkt")
import gensim
from gensim.summarization.textcleaner import split_sentences
from gensim.utils import tokenize
from gensim.utils import simple_preprocess 
import itertools
from itertools import chain

import codecs
from IPython.core.display import HTML
plt.style.reload_library()


In [None]:
# nltk.download("stopwords")
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('german'))


In [None]:
# Load the raw data-set from a single CSV file located in the working directory.
# TODO: load all data from a folder and combine it into one data-frame.
df_raw = pd.read_csv('2020_06_23_CE-BVerwG_DE_Datensatz.csv') 
df_raw

## Import Data-Set

Import the data which should be used for training and evaluation.

Split the data-set into 90/10 (or other wanted ratio), shuffle indices and return random distribution of documents.


In [None]:
np.random.seed(42) 
def split_dataset(input_text, perc_use, perc_predict): 
    """Shuffle the rows of a data-frame and split them into two subsets.

    input_text: the data-frame to split; it must provide the columns
        "doc_id", "Gericht", "Entscheidungsart", "Verfahrensart" and "text"
    perc_use: fraction (0..1) of the documents used for training; taken from
        the front of the shuffled order
    perc_predict: fraction (0..1) of the documents used for prediction
        analysis and metrics; taken from the end of the shuffled order

    returns: (df_use, df_predict). Both have a fresh 0..n-1 row index; the
        original row index is kept in the additional column "index".

    The two subsets overlap if perc_use + perc_predict > 1. The shuffle uses
    NumPy's global random state.
    """
    # Use the frame that was passed in; the previous version silently read the
    # global `df_raw` and ignored this argument.
    num_doc = len(input_text) # number of documents
    indices_lst = list(range(num_doc)) # ordered positions of the documents
    np.random.shuffle(indices_lst) # shuffle the positions in place

    use_positions = indices_lst[:round(perc_use * num_doc)]
    predict_positions = indices_lst[round((1 - perc_predict) * num_doc):]

    # define the data-sets for training and prediction
    df_use = input_text.iloc[use_positions][["doc_id", "Gericht", "Entscheidungsart", "Verfahrensart", "text"]]
    df_predict = input_text.iloc[predict_positions][["Gericht", "doc_id", "text"]]

    # print length of data-frames for validation
    print(len(df_use), len(df_predict))

    # reset row index after shuffling, the original index becomes the column "index"
    df_use = df_use.reset_index()
    df_predict = df_predict.reset_index()

    return df_use, df_predict

# Run code to create data-sets
# With perc_use = 1. and perc_predict = 0.00 every document lands in df_raw_use
# and df_raw_predict stays empty.
df_raw_use, df_raw_predict = split_dataset(df_raw, 1., 0.00) # Later use full data-set 
#df_raw_use, df_raw_predict = split_dataset(df_raw, 0.9, 0.1) # <- this is the complete data-set

## If necessary, only use specific documents

In [None]:
# "Entscheidungsart": B, U (Beschluss, Urteil)
# ""Verfahrensart": B, A, C, P, PB, WNB, PKH, WB, WD, WDS-VR, VR, BN, KSt, AV, F, 


# df_raw_use.drop(df_raw_use.loc[df_raw_use["Entscheidungsart"] != "B"].index, inplace=True)
# df_raw_use

## Counting occurrences (helper function)
Count occurrences of a token of interest (e.g. “VwGO”) in all decisions, either explicitly (exact, case-sensitive comparison) or implicitly (ignoring case):

In [None]:
def search_occurrence(input_text, word_string, explicit = True): # helper function
    """Count the matches of a search pattern in a collection of texts.

    input_text: iterable of strings to search in
    word_string: the search term; it is converted with str() and used as a
        regular expression, so regex metacharacters keep their special meaning
    explicit: True (default) for a case-sensitive search, anything else for a
        search that ignores upper / lower case

    returns: (count, lst_found) - the total number of matches and one list of
        matched strings per input text
    """
    search = str(word_string)
    # Count and collected matches must use the same flags; previously the list
    # was always collected case-sensitively, even when the count ignored case.
    flags = 0 if explicit == True else re.IGNORECASE

    count = 0
    lst_found = []
    for decision in tqdm(input_text):
        matches = re.findall(search, decision, flags)
        count += len(matches)
        lst_found.append(matches)
    return count, lst_found

## Define Tokenizer
Use a tokenizer for preprocessing of the given data-set.

Use `case_important = True` for an exact, case-sensitive comparison.

In [None]:
# Tokenize texts in place
def tokenizer(text_input, min_length = 2, case_important = False, lan = "german"):
    """Replace every entry of text_input by its list of tokens (in place).

    text_input: indexable collection of strings; each entry is overwritten
        with a list of tokens, nothing is returned
    min_length: minimum length of the tokens which are kept
    case_important: False -> use gensim's simple_preprocess (much faster);
        otherwise split into sentences and words with nltk, which keeps the
        original upper / lower case
    lan: language passed to the nltk sentence splitter
    """

    def _keep(token):
        # Tokens shorter than min_length survive only if they are "§", "." or
        # contain at least one digit.
        if len(token) >= min_length:
            return True
        return token == "§" or token == "." or any(ch.isdigit() for ch in token)

    for position in range(len(text_input)):
        if case_important == False:
            text_input[position] = simple_preprocess(text_input[position], min_len = min_length) # use Gensim
            continue

        # case-sensitive path: sentence split first, then word split
        tokens = []
        for sent in nltk.sent_tokenize(text_input[position], language=lan):
            tokens.extend(nltk.tokenize.word_tokenize(sent))

        text_input[position] = [tok for tok in tokens if _keep(tok)] # assign tokens back


## Create data-set only considering sentences which include key-tokens and tokenize

Input: one column or list of texts

Output: column of tokenized sentences including the key-token

In [None]:
# Keep only the sentences which contain the search term (e.g. "§")
def sentence_reduction_one_it(text_input, searchterm, lang = "german", case_i = True, min_len = 2):
    """Reduce every text to the sentences that contain the search term.

    text_input: indexable collection of texts (indexed 0 .. len-1)
    searchterm: the key-token which must occur within the sentence; a single
        character such as "§" or a longer string. An empty search term matches
        no sentence.
    lang = "german": language passed to the nltk sentence splitter
    case_i = True: currently unused, kept for interface compatibility
    min_len = 2: currently unused, kept for interface compatibility

    return: a list with one entry per text; each entry is the list of matching
        sentences in order of first appearance, identical sentences only once.
        The sentences are returned as plain strings (they are not tokenized).
    """
    text_out = []

    for instance in tqdm(range(len(text_input))):
        sentences = nltk.sent_tokenize(text_input[instance], language=lang)
        # Substring test instead of comparing character by character: one hit
        # per sentence, and search terms longer than one character work too.
        sentences_filtered = [sentence for sentence in sentences
                              if searchterm and searchterm in sentence]
        # drop repeated sentences but keep their order
        text_out.append(list(dict.fromkeys(sentences_filtered)))

    return text_out

### Co-occurance probability (of single Words)

In [None]:
# https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d

# Turn lists of words into lists of n-grams (in place)
def get_ngrams(text_input, order, Keep_duplicates = True):
    """Replace every token list in text_input by its space-joined n-grams.

    text_input: indexable collection of token lists; modified in place,
        nothing is returned
    order: the order of the desired n-gram (2 = bi-gram, 3 = tri-gram, ...)
    Keep_duplicates: True keeps every n-gram; otherwise only every
        order-th n-gram is kept
    """
    step = 1 if Keep_duplicates == True else order
    for position in tqdm(range(len(text_input))):
        joined = [' '.join(gram) for gram in ngrams(text_input[position], order)]
        text_input[position] = joined[::step]
            
# function to map counts of n-grams following each other (co-occurrence)
def n_gram_model_dictionary(text_input, input_ngram_order = 1, mode = "tri", Keep_duplic=False):
    """Count which token follows which context in the given token lists.

    text_input: iterable of token lists; if input_ngram_order != 1 it is first
        rewritten in place by get_ngrams
    input_ngram_order: order of the n-grams that are used as tokens
        (1 = use the tokens as they are)
    mode: "tri" -> the context is the pair of the two preceding tokens and the
        keys are tuples (token1, token2); any other value -> the context is the
        single preceding token
    Keep_duplic: forwarded to get_ngrams as Keep_duplicates

    returns: nested defaultdict, model[context][next_token] = count. The
        sequences are padded on both sides by nltk (pad_left / pad_right).
    """
    # Create a placeholder for model; missing entries count as 0
    model = defaultdict(lambda: defaultdict(lambda: 0))

    if (input_ngram_order != 1):
        get_ngrams(text_input, order=input_ngram_order, Keep_duplicates=Keep_duplic)

    if mode == "tri":
        # Count frequency of co-occurrence, 2 words given, output one
        for sentence in tqdm(text_input):
            for word1, word2, word3 in trigrams(sentence, pad_right=True, pad_left=True):
                model[(word1, word2)][word3] += 1
    else:
        # Count frequency of co-occurrence, 1 word given, output one
        for sentence in tqdm(text_input):
            for word1, word2 in bigrams(sentence, pad_right=True, pad_left=True):
                model[word1][word2] += 1

    # The print statement that used to follow the return could never run and
    # was removed; use len(model) on the result for the number of entries.
    return model
     
# Combine dictionaries; the counts of keys which occur in several models are added up
def accumulate_models(list_of_models): 
    """Merge several count models into one.

    list_of_models: sequence of mappings model[context][next_token] = count

    returns: a new plain dict of plain dicts. For a context that occurs in
        more than one model the counts per next_token are summed. (Previously
        one model's entry for such a context was discarded entirely.) The
        input models are not modified.
    """
    model_complete = dict()
    for model in list_of_models:
        for context, followers in model.items():
            merged = model_complete.setdefault(context, {})
            for word, count in followers.items():
                merged[word] = merged.get(word, 0) + count
    return model_complete
        
def accumulate_models_prob(model_complete):
    """Turn the counts of a model into relative frequencies (in place).

    model_complete: mapping model[context][next_token] = count
    returns: the same object; every count is divided by the sum of all counts
        that share its context
    """
    for context in tqdm(model_complete):
        followers = model_complete[context]
        norm = float(sum(followers.values()))
        for next_token in followers:
            followers[next_token] /= norm
    return model_complete


# calculate and accumulate all models 
def model_multi(data_set_col, num_ng_min, num_ng_max, mode_tri = True, mode_bi = True, Keep_duplic=False, col_name = "text"):
    """Build one combined model over several n-gram orders and modes.

    data_set_col: data-set to process; it is copied for every run and the
        column col_name is handed to n_gram_model_dictionary
    num_ng_min: minimum order of n-gram
    num_ng_max: maximum order of n-gram (inclusive)
    mode_tri = True: 2 words given, one out? (default = True)
    mode_bi = True: one word given, one out? (default = True)
    Keep_duplic=False: keep duplicates in the n-gram model? The value is now
        forwarded; previously it was ignored and False was always used.
    col_name = "text": name of the column holding the token lists

    returns: the accumulated model after accumulate_models_prob has converted
        its counts to relative frequencies
    """
    enabled_modes = []
    if mode_tri == True:
        enabled_modes.append("tri")
    if mode_bi == True:
        enabled_modes.append("bi")

    model_comp = {}
    for mode in enabled_modes:
        for order in tqdm(range(num_ng_min, num_ng_max + 1)):
            # fresh copy per run because the n-gram step rewrites its input in place
            df_preproc_ngram = data_set_col.copy()
            model = n_gram_model_dictionary(df_preproc_ngram[col_name], input_ngram_order=order, mode=mode, Keep_duplic=Keep_duplic)
            del df_preproc_ngram

            model_comp = accumulate_models([model_comp, model])
            del model

    accumulate_models_prob(model_comp)
    return model_comp


# function to output the most likely followers of one or two given words
def n_gram_get_suggestion(input_dictionary, suggestion1, suggestion2 = None, num_sugg = 3 ):
    """Look up the best-scored followers of a context in a model.

    input_dictionary: dictionary in which the result is looked up
    suggestion1: first word
    suggestion2: second word; if None the entry of suggestion1 alone is used
        (bi-gram model), otherwise the entry of the pair (suggestion1, suggestion2)
    num_sugg = 3: number of output pairs (default = 3, 0 = all possible pairs)

    returns: list of [word, value] pairs ordered by descending value
    """
    lookup_key = suggestion1 if suggestion2 == None else (suggestion1, suggestion2)
    ranked = sorted(input_dictionary[lookup_key].items(), key = lambda pair: pair[1], reverse=True)

    if num_sugg != 0:
        ranked = ranked[:num_sugg]

    return [[word, value] for word, value in ranked]


### Search for every Sentence with a key-token (§) and cut the sentence

In [None]:
# Work on a copy of the training data-frame and display it.
df_regex = df_raw_use.copy() # copy the corresponding raw row to it 
df_regex

In [None]:
# Planned layout of the reference table (not all columns exist yet):
#   Doc_id:     where to find the reference or string occurrence
#   Court:      which court the case was assigned to
#   Book:       which book of legislation (None if not found)
#   Section:    which section within the book (0 if non-existent)
#   Paragraph:  --
#   Doc_index:  index of the clause, a tuple (start, end)
#   Sent_comp:  complete processed sentence
#   Sent_split: sentence split at the search term
#   Clausel:    the clause of length n which was found to be of interest

# Start from a copy of the raw rows and switch to the column names used from here on
df_regex = df_raw_use.copy().rename(
    columns={"index":"Index", "Gericht":"Court", "doc_id":"Doc_id", "text":"Sent"}
)

# compute reduction of sentences and add to data-frame
df_regex["Sent_comp"] = sentence_reduction_one_it(df_regex["Sent"], '§', lang = "german",  case_i = True, min_len = 2)

### Identify reference (court, book,)

In [None]:
def str_int(string: str) -> "int | None":
    """Convert a captured digit string to int; the empty string becomes None."""
    if string == '':
        return None
    return int(string)


# Mapping of every sentence which includes a §
def find_references(decision_text, book: str) -> list:
    """Find the references to one law book in pre-split sentences.

    decision_text: indexable collection (0 .. len-1) with one list of sentence
        strings per document
    book: abbreviation of the law book, e.g. "VwGO"; it is matched literally

    returns: one list per document. It holds, for every sentence with at least
        one hit, a list of dicts {'book', 'section', 'paragraph'}; 'paragraph'
        is None if the reference names no paragraph.

    Recognised shape: "§ <section> [Abs. <paragraph>] [S. <sentence>] <book>".
    Sections with a letter suffix directly before the book (e.g. "§ 80b VwGO")
    are not matched.
    """
    # "Abs." and "S." are optional literal markers. The old pattern used the
    # character classes [Abs.] and [S.], which also swallowed the leading
    # letter of other abbreviations, so "§ 5 SGG" counted as a "GG" reference.
    # The look-ahead stops a book from matching as a prefix of a longer word.
    pattern = re.compile(
        r'§ (\d+)\W*(?:Abs\b\.?)?\W*(\d+)?\W*(?:S\b\.?)?\W*(\d+)?\W*'
        + re.escape(book)
        + r'(?!\w)'
    )

    row_lsit = []
    for instance in tqdm(range(len(decision_text))): # for each document
        per_sentence = []
        for sentence in decision_text[instance]:
            found = [{'book': book, 'section': str_int(m[0]), 'paragraph': str_int(m[1])}
                     for m in pattern.findall(sentence)]
            if found: # sentences without a hit are skipped
                per_sentence.append(found)
        row_lsit.append(per_sentence)
    return row_lsit

### Create list of regex search patterns to find all references

In [None]:
# Abbreviations of the law books whose references are searched for.
ref_booklst = ["AdVermiG", "AFG", "AltPflG", "AO", "AuslG", "BAFöG", "BBiG", "BDSG", "BErzGG", "BetrVG", "BGB", "BGG", "BGleiG", "BKGG", "BRRG", "BSeuchG", "BSHG", "BtBG", "BtG", "BtmG", "BVG", "EStG", "FEVG", "FGG", "FÖJG", "FSJG", "GeschlKrG", "GewO", "GG", "GlG", "HeimBSG", "HeimG", "JArbSchG", "JGG", "JÖSchG", "KHG", "KJHG", "KrPflG", "KSchG", "LGBG", "MuSchG", "PersAuswG", "PersVG", "PQsG", "PsychThG", "SchKG", "SchwbG", "SGB", "SGG", "StBauFG", "StGB", "StPO", "StVollzG", "USG", "UVG", "VwGO", "VwVfG", "WoGG", "ZDG", "ZPO"]

In [None]:
# Collect, per document, the references to all listed law books as one flat list of dicts.
len_n = len(df_regex["Sent_comp"])
sent_comp = df_regex["Sent_comp"].replace('\n', '')

# "VwGO" is always searched, and every book exactly once. Previously "VwGO" was
# searched up front and again as a member of ref_booklst, so its references
# were counted twice.
books = list(dict.fromkeys(["VwGO"] + [str(item) for item in ref_booklst]))

references = [[] for _ in range(len_n)]
for item in tqdm(books):
    ref = find_references(sent_comp, item)
    for i in range(len_n): # for each document
        for sentence_refs in ref[i]: # one list of dicts per sentence with hits
            # Flatten structurally. The old length check (len(...) == 3) could
            # not tell a reference dict from a sentence with exactly three
            # references, which were then lost.
            references[i].extend(sentence_refs)

# Every element of references[i] is now a reference dict, so no further
# filtering of partly unpacked lists is needed.
            

In [None]:
# NOTE(review): this binds a second name to the same list object, it does not copy it.
# In-place changes made to `references` afterwards are visible through `refer_zwisch` as well.
refer_zwisch = references

## Create co-occurence matrix




In [None]:
# Translate every reference dict into a string (modifies `references` in place).
# NOTE(review): not idempotent - after one run the entries are strings and have no .items().

for i in range(len(references)):
    for j in range(len(references[i])):
        # dict -> list of (key, value) pairs -> flat list [key, value, key, value, ...]
        references[i][j] = [(k,v) for k,v in references[i][j].items()]
        references[i][j] = list(itertools.chain(*references[i][j]))
        # Drop falsy items. For dicts built by find_references this removes a None
        # paragraph, leaving 5 instead of 6 items (a value of 0 would be removed too).
        references[i][j] = [x for x in references[i][j] if x] # delete all empty lists
    # entries that became empty (e.g. from an empty dict) are removed
    references[i] = [x for x in references[i] if x] # delete all empty lists
    
# all_ref: one single-element list ["<book> <section>"] per reference
all_ref = []
for i in range(len(references)):
    for j in range(len(references[i])):
        # positions 1, 3 and 5 of the flat list are the values that follow the keys
        # (book, section, paragraph for dicts built by find_references)
        all_ref.append([str(references[i][j][1]) + str(" ") + str(references[i][j][3])])
        if len(references[i][j]) <= 5:
            # no paragraph value left: "<book> <section>"
            references[i][j] = (str(references[i][j][1]) + str(" ") + str(references[i][j][3])) #+ str(references[i][j][5]))
        else: 
            # with paragraph: "<book> <section> <paragraph>"
            references[i][j] = (str(references[i][j][1]) + str(" ") + str(references[i][j][3]) + str(" ") + str(references[i][j][5]))

        

In [None]:
# Distinct reference strings over all documents (used as row / column names)
name_set = set().union(*references)

# Distinct entries of all_ref
name_set_all = set().union(*all_ref)

len(name_set),len(name_set_all)

In [None]:
from collections import OrderedDict

document = references # all data
names = name_set

# Square table occurrences[name][other_name] = count, initialised with zeros
occurrences = OrderedDict()
for row_name in names:
    occurrences[row_name] = OrderedDict.fromkeys(names, 0)

# Find the co-occurrences: every reference of a document is paired with every
# other entry of the same document (repeated entries are counted each time)
for doc_refs in document:
    for position, current in enumerate(doc_refs):
        for other in doc_refs[:position] + doc_refs[position + 1:]:
            occurrences[current][other] += 1


In [None]:
# Write the co-occurrence matrix to a CSV file in the working directory
df_co_occ_complete = pd.DataFrame(occurrences)  
df_co_occ_complete.to_csv('co_occurrence_matrix.csv') 

### Create df with the 5 most frequent co-occurrences per reference

In [None]:
# One row per distinct reference: its five highest co-occurrence counts
co_occ_lst_name = list(name_set)
co_occ_lst = [
    n_gram_get_suggestion(occurrences, suggestion1 = name, suggestion2 = None, num_sugg = 5 )
    for name in co_occ_lst_name
]

In [None]:
# Write the top co-occurrences per reference to a CSV file.
# Each cell holds one [reference, count] pair as returned by n_gram_get_suggestion;
# the row labels are the references the pairs belong to.
df_co_occ = pd.DataFrame(co_occ_lst, columns = ["First","Second","Third","Fourth","Fifth"])  
df_co_occ.index = co_occ_lst_name
df_co_occ.to_csv('5_most_occurrences_per_ref.csv') 

## Which references are the most frequent (in general)?

In [None]:
# Count how often every distinct reference occurs over all documents.
all_references = list(itertools.chain(*references))
# Flatten the single-element lists of all_ref. Entries that are already strings
# are kept as they are, so running this cell a second time does no harm.
all_ref = [ref_str for entry in all_ref for ref_str in ([entry] if isinstance(entry, str) else entry)]

# Exact counts per reference string. The previous regex substring search also
# counted longer references that merely contain the name (e.g. "VwGO 8" inside
# "VwGO 80") and scanned all references once per name.
exact_counts = Counter(all_references)
exact_counts_all = Counter(all_ref)

reference_count = [[name, exact_counts[name]] for name in name_set]
all_ref_count = [[name, exact_counts_all[name]] for name in name_set_all]

# most frequent first
reference_count.sort(key = lambda x: x[1]) 
reference_count.reverse()

all_ref_count.sort(key = lambda x: x[1]) 
all_ref_count.reverse()

# Plotting

In [None]:

# Bar chart of the first max_length_of_Chart entries of reference_count
# (sorted by descending count), saved as "references_all.jpeg".
# NOTE(review): the other chart cells of this notebook repeat this code almost verbatim;
# a shared plotting helper would remove the duplication.
max_length_of_Chart = 50

labels = [item[0] for item in reference_count[:max_length_of_Chart]]
counts = [item[1] for item in reference_count[:max_length_of_Chart]]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(20,10))
rects1 = ax.bar(x, counts, width, label='References')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.xticks(rotation='vertical')
ax.set_ylabel('times of mentioning')
ax.set_title('reference')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


# Uses the cell-level `ax` defined above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)

fig.tight_layout()
plt.savefig("references_all.jpeg")
plt.show()



In [None]:
# Bar chart of the first max_length_of_Chart entries of all_ref_count
# (references reduced to "<book> <section>", sorted by descending count),
# saved as "references_all_reduced.jpeg".
max_length_of_Chart = 50

labels = [item[0] for item in all_ref_count[:max_length_of_Chart]]
counts = [item[1] for item in all_ref_count[:max_length_of_Chart]]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(20,10))
rects1 = ax.bar(x, counts, width, label='References')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.xticks(rotation='vertical')
ax.set_ylabel('times of mentioning')
ax.set_title('reference')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


# Uses the cell-level `ax` defined above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)

fig.tight_layout()
plt.savefig("references_all_reduced.jpeg")
plt.show()

### Book occurrences (not obvious ones)
Aus den 60 Gesetzbüchern: Welche treten neben der VwGO noch häufig auf (die VwGO ist offensichtlich), und welche könnten noch von Bedeutung sein?


In [None]:
# Count the references per law book: only the book part of every
# "<book> <section>" string in all_ref is compared, not the whole string.

# Exact comparison of the book name. The previous regex substring search also
# counted books whose abbreviation merely contains the name (e.g. "GG" inside
# "SGG 5" or "BGG 3").
book_counts = Counter(ref_str.rsplit(" ", 1)[0] for ref_str in all_ref)

all_book_count = [[book, book_counts[book]] for book in ref_booklst]

# most frequent first
all_book_count.sort(key = lambda x: x[1]) 
all_book_count.reverse()


In [None]:
# Bar chart of the first max_length_of_Chart entries of all_book_count
# (sorted by descending count), saved as "books_all.jpeg".
max_length_of_Chart = 50

labels = [item[0] for item in all_book_count[:max_length_of_Chart]]
counts = [item[1] for item in all_book_count[:max_length_of_Chart]]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(20,10))
rects1 = ax.bar(x, counts, width, label='occurrences of books')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.xticks(rotation='vertical')
ax.set_ylabel('times of mentioning')
ax.set_title('reference')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


# Uses the cell-level `ax` defined above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)

fig.tight_layout()
plt.savefig("books_all.jpeg")
plt.show()



In [None]:
# Same chart as for all books, but the slice starts at 1 and therefore skips the
# first entry of all_book_count (the book with the highest count);
# saved as "books_all_reduced.jpeg".
max_length_of_Chart = 50

labels = [item[0] for item in all_book_count[1:max_length_of_Chart]]
counts = [item[1] for item in all_book_count[1:max_length_of_Chart]]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(20,10))
rects1 = ax.bar(x, counts, width, label='occurrences of books')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.xticks(rotation='vertical')
ax.set_ylabel('times of mentioning')
ax.set_title('reference')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


# Uses the cell-level `ax` defined above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)

fig.tight_layout()
plt.savefig("books_all_reduced.jpeg")
plt.show()



## mean, variance Reference frequency (all, urteil, Beschluss)

In [None]:
# Shallow copy of the per-document reference lists (the inner lists are shared).
refer_zwischen = refer_zwisch.copy()
# Row labels of the documents whose "Entscheidungsart" is "B" / "U"
ind_list_Beschluss = df_raw_use[df_raw_use["Entscheidungsart"] == "B"].index
ind_list_Urteil = df_raw_use[df_raw_use["Entscheidungsart"] == "U"].index

In [None]:
# Pick the reference lists of the "B" and "U" documents.
# The row labels are used as list positions - assumes df_raw_use has a 0..n-1 index (it was reset in split_dataset).
refer_zwischen_Beschluss = [refer_zwischen[item] for item in ind_list_Beschluss]
refer_zwischen_Urteil = [refer_zwischen[item] for item in ind_list_Urteil]

# number of documents per group
len_Besch = len(refer_zwischen_Beschluss)
len_Urteil = len(refer_zwischen_Urteil)

In [None]:
# Number of references per document, separately for each group
refer_zwischen_Beschluss_num = [len(doc_refs) for doc_refs in refer_zwischen_Beschluss]
refer_zwischen_Urteil_num = [len(doc_refs) for doc_refs in refer_zwischen_Urteil]
    

In [None]:
# Mean and variance of the number of references per document, in the order
# [both groups together, "B" documents, "U" documents].
# NOTE(review): the combined list holds only the "B" and "U" documents; documents with
# any other "Entscheidungsart" are not part of it.
refer_zwischen_xxx_num = refer_zwischen_Beschluss_num + refer_zwischen_Urteil_num
refer_means = [np.mean(refer_zwischen_xxx_num),np.mean(refer_zwischen_Beschluss_num), np.mean(refer_zwischen_Urteil_num)]
refer_vars = [np.var(refer_zwischen_xxx_num),np.var(refer_zwischen_Beschluss_num), np.var(refer_zwischen_Urteil_num)]

In [None]:
# Bar chart of the mean number of references per document (values of refer_means,
# rounded to two digits), saved as "mean_ref_occurrence_per_doc.jpeg".
labels = ["complete set", "Beschluss", "Urteil"]
counts = [ round(elem, ndigits=2) for elem in refer_means ]

x = np.arange(len(labels))  # the label locations
width = 0.5  # the width of the bars

fig, ax = plt.subplots(figsize=(10,6))
rects1 = ax.bar(x, counts, width, label='mean reference occurrence')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.xticks(rotation='vertical')
ax.set_ylabel('mean occurrence')
ax.set_title('mean reference occurrence per document')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


# Uses the cell-level `ax` defined above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)

fig.tight_layout()
plt.savefig("mean_ref_occurrence_per_doc.jpeg")
plt.show()



In [None]:
# Bar chart of the variance of the number of references per document (values of
# refer_vars, rounded to two digits), saved as "mean_ref_occurrence_per_doc_variance.jpeg".
labels = ["complete set", "Beschluss", "Urteil"]
counts = [ round(elem, ndigits=2) for elem in refer_vars ]

x = np.arange(len(labels))  # the label locations
width = 0.5  # the width of the bars

fig, ax = plt.subplots(figsize=(10,6))
rects1 = ax.bar(x, counts, width, label='variance of reference occurrences')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.xticks(rotation='vertical')
ax.set_ylabel('variance of occurrence')
ax.set_title('variance of reference occurrence per document')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


# Uses the cell-level `ax` defined above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)

fig.tight_layout()
plt.savefig("mean_ref_occurrence_per_doc_variance.jpeg")
plt.show()

