# Preprocessing and Importing Word Documents (Reports)
 
## Literatur
- https://textmining.wp.hs-hannover.de/Preprocessing.html
- https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

## Kommentare
- `.doc`-Dateien direkt zu lesen ist schwierig. Darum wird ein Converter verwendet (http://www.multidoc-converter.com/en/index.html), der sie in `.docx` konvertiert.



In [None]:
# %pip install --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --trusted-host pypi.org docx2python --user

In [None]:
# conda install -c conda-forge pypdf2
# conda install nltk 
# pip install HanTa

import re, nltk, os, glob, docx
import pandas as pd
import wehs_helpers as wh
from HanTa import HanoverTagger as ht
from docx2python import docx2python
import win32com.client
from bs4 import BeautifulSoup as bs
import functools
import operator
# import stopwords
# nltk.download('stopwords')  
os.name  # nt means windows

In [None]:
# Load the list with the paths to all files in the folder and subfolders.
# The CSV is read as semicolon-separated and Latin-1 encoded.
liste_docs2docx2 = pd.read_csv(
    'liste_docs2docx2.csv',
    sep=';',
    encoding="latin-1",
)
# Last expression of the cell: shows the table in the notebook output.
liste_docs2docx2


In [None]:


import aspose.words as aw

def doc2docx(quelle, ziel):
    """
    Converts a DOC file to DOCX format using the Aspose.Words library.

    The conversion is best-effort: a failing document does not raise, so a
    batch loop over many files keeps running. The failure is handed back to
    the caller instead of being silently discarded.

    Parameters:
    - quelle (str): The path of the input DOC file.
    - ziel (str): The path where the converted DOCX file will be saved.

    Returns:
    - None: If the conversion is successful.
    - RuntimeError: The caught exception, if loading or saving fails.
    """
    try:
        # Load DOC file
        doc = aw.Document(quelle)
        # Save DOC as DOCX
        # NOTE(review): the output format is presumably derived from the
        # file extension of `ziel` - confirm against the Aspose.Words docs.
        doc.save(ziel)
    except RuntimeError as err:
        # Return (not raise) the error so callers can tell failed
        # conversions apart from successful ones.
        return err
    return None


# Convert each listed document from .doc to .docx format, pairing every
# source path with its target path row by row.
for quelle, ziel in zip(liste_docs2docx2["quelle_mit_pfad"],
                        liste_docs2docx2["ziele_mit_pfad"]):
    doc2docx(quelle, ziel)


In [None]:

import os
import pandas as pd

def get_meta(region):
    """
    Load the metadata table of one region and attach the document locations.

    The semicolon-separated file ``meta_infos_rbs_<region>.csv`` is read and
    extended by three columns:

    - ``region``:  the region passed in,
    - ``docpath``: path of the readable document, or the string ``'error'``
      when no usable file could be determined,
    - ``doctype``: the text after the last dot of ``docpath`` (which is
      ``'error'`` for unresolved rows).

    Args:
        region (str): Name of the region, used in the folder and file names.

    Returns:
        pandas.DataFrame: The metadata rows of the region with the three
        additional columns.
    """
    folder = f'c:\\temp\\rechenschaftsberichte\\rbs_{region}'
    meta_path = f'c:/temp/rechenschaftsberichte/meta_infos_rbs_{region}.csv'

    def resolve(filename):
        """
        Map a file name from the metadata to the path of the file to read.

        Args:
            filename (str): File name as listed in column ``RB_DATEI_NAME``.

        Returns:
            str: The path inside the region folder, or ``'error'``.
        """
        # Only names of the form "<name>.<ext>" (exactly one dot) are accepted.
        parts = filename.split('.')
        if len(parts) != 2:
            return 'error'
        name, ext = parts

        if ext == 'DOCM':
            # DOCM files are read directly from the region folder.
            candidate = os.path.join(folder, filename)
        elif ext == 'DOC':
            # DOC files are expected as converted copies in the 'docx' subfolder.
            candidate = os.path.join(folder, 'docx', name + '.docx')
        else:
            return 'error'

        return candidate if os.path.exists(candidate) else 'error'

    df = pd.read_csv(meta_path, delimiter=';')
    df['region'] = region
    df['docpath'] = df['RB_DATEI_NAME'].map(resolve)
    df['doctype'] = df['docpath'].map(lambda p: p.rsplit('.', 1)[-1])
    return df

# Build one metadata table across all regions.
regions = ['norden', 'osten', 'sueden', 'westen']
df = pd.concat(list(map(get_meta, regions)))

# Rows whose document could not be resolved carry docpath == 'error';
# report how many there are before dropping them.
errors = df.loc[df['docpath'] == 'error', 'DOKUMENTNAME'].values
print('Number of errors =', len(errors))

# Keep only the resolvable documents and renumber the rows from 0.
df = df.query("docpath!='error'").reset_index(drop=True)
df

In [None]:
# Check the number of documents per region and doctype.
# NOTE(review): wh.two_count comes from the project module wehs_helpers;
# presumably it cross-tabulates the two given columns - confirm there.
wh.two_count(df,'doctype','region')

In [None]:
# Extract the text from the documents and save it as CSV files in chunks of 1000 documents each
import docx
import functools
import operator

def get_text(docpath, docout=False):
    """
    Read a Word document and return its text.

    The format is taken from the text after the last dot of the path
    (case-insensitive). ``docx`` files are read with python-docx, ``docm``
    files with docx2python.

    Parameters:
    - docpath (str): The path of the document file.
    - docout (bool): Optional. If True, the loaded document object is
      returned instead of the text.

    Returns:
    - str or document object: The extracted text (paragraphs joined by
      newlines, restricted to characters with code point below 256), the
      document object if docout is True, or the string 'error' for any
      other file extension.
    """
    suffix = docpath.split('.')[-1].lower()
    if suffix not in ('docx', 'docm'):
        # Unsupported document format
        return 'error'

    if suffix == 'docx':
        doc = docx.Document(docpath)
        # One line per paragraph; empty paragraphs are skipped
        text = '\n'.join(par.text for par in doc.paragraphs if par.text)
    else:
        doc = docx2python(docpath)
        nested = doc.body
        # Flatten three nesting levels of the body.
        # NOTE(review): assumes doc.body is nested four levels deep so that
        # plain strings remain afterwards - confirm for the installed
        # docx2python version.
        for _ in range(3):
            nested = functools.reduce(operator.iconcat, nested, [])
        text = '\n'.join(nested)

    if docout:
        return doc

    # Keep only characters with code point < 256 (the Latin-1 range, which
    # still contains the German umlauts); everything else is dropped.
    return ''.join(filter(lambda ch: ord(ch) < 256, text))
    
@functools.lru_cache(maxsize=None)
def _hanover_tagger(model='morphmodel_ger.pgz'):
    """Load the HanTa model once and hand out the same tagger afterwards."""
    return ht.HanoverTagger(model)


def get_tags(text, tokenizer = nltk.tokenize.RegexpTokenizer(r'[a-zA-ZäöüÄÖÜß]+')):
    """
    Tokenize a German text and tag the tokens with the Hanover Tagger.

    Parameters:
    - text (str): The text to analyse.
    - tokenizer: Optional. Object with a ``tokenize(text)`` method. The
      default keeps runs of ASCII letters, umlauts and 'ß'; 'ß' is part of
      the character class so that words such as "Straße" are not split
      into two tokens.

    Returns:
    - The result of ``HanoverTagger.tag_sent(tokens, taglevel=1)`` for the
      lower-cased tokens. get_lemmas unpacks its entries as
      (token, lemma, pos) triples.
    """
    tokens = [token.lower() for token in tokenizer.tokenize(text)]
    # The model file is loaded only on the first call; re-reading it for
    # every document would dominate the runtime when tagging many texts.
    tagger = _hanover_tagger()
    return tagger.tag_sent(tokens, taglevel=1)


def get_lemmas(tags, pattern):
    """
    Collect the lemmas whose part-of-speech tag matches a pattern.

    Parameters:
    - tags (iterable): (token, lemma, pos) triples.
    - pattern (str): Regular expression tested with re.match, i.e. it only
      has to match at the beginning of the tag.
      Examples: r'NN' or r'NN|VV.*|ADJA'.

    Returns:
    - list: The lemmas of all matching entries, in their original order.
    """
    lemmas = []
    for _token, lemma, pos in tags:
        if re.match(pattern, pos):
            lemmas.append(lemma)
    return lemmas
    
Nchunk = 1000
Ndoc = df.shape[0]
pathpattern = 'data/chunk_{:02d}.csv'

# Create the output folder up front, so a missing folder does not abort the
# run only after the first (slow) chunk has been extracted.
os.makedirs(os.path.dirname(pathpattern), exist_ok=True)

# Step through the start offsets directly. This yields exactly
# ceil(Ndoc / Nchunk) chunks and never writes an empty trailing file when
# Ndoc is a multiple of Nchunk.
for ii, ind1 in enumerate(range(0, Ndoc, Nchunk)):
    ind2 = min(Ndoc, ind1 + Nchunk)
    # Work on an independent copy, so adding columns neither triggers
    # pandas' SettingWithCopy warning nor depends on view/copy semantics.
    sdf = df.iloc[ind1:ind2].copy()
    sdf['text'] = sdf['docpath'].apply(get_text)

    # Optional lemma columns (currently disabled):
    # tags = sdf.text.apply(get_tags)
    # sdf['nn_lemmas'] = tags.apply(lambda t: get_lemmas(t,'NN'))
    # sdf['vv_lemmas'] = tags.apply(lambda t: get_lemmas(t,'VV.*'))
    # sdf['adja_lemmas'] = tags.apply(lambda t: get_lemmas(t,'ADJA'))

    sdf.to_csv(pathpattern.format(ii))
    print('chunk',ii,'done ...')