# Extracting Text Sections That Contain Violence Word Stems
 
## Literatur
- https://textmining.wp.hs-hannover.de/Preprocessing.html
- https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
- https://neptune.ai/blog/pyldavis-topic-modelling-exploration-tool-that-every-nlp-data-scientist-should-know

## Ideen
-  To determine if there is a reference to violence in the text, only certain parts of the text are relevant. Typically, the reference to violence can be identified by keywords such as violence, hitting, assault, aggressive, women's shelter(?), protective measure, police surveillance, fighting, psychological pressure, ...
- Nur Teile des Textes sind relevant um zu bestimmen, ob es einen Bezug zu Gewalt gibt
- Typischerweise zeigt sich der Bezug zu Gewalt an Stichworten: Gewalt, schlagen, übergriff, aggressiv, Frauenhaus(?), Schutzmassnahme, Polizeiüberwachung, aufeinander losgehen, psychischer Druck, ... 
- 



In [1]:
# conda install -c conda-forge pypdf2
# conda install nltk 
# pip install HanTa

import re, nltk, os, glob, docx, json, gensim
import pandas as pd
import numpy as np
import seaborn as sns
import wehs_helpers as wh
# from HanTa import HanoverTagger as ht
from docx2python import docx2python
import win32com.client
from bs4 import BeautifulSoup as bs
import gensim.corpora as corpora

# import stopwords
# nltk.download('stopwords')  
os.name  # 'nt' means the notebook is running on Windows
# Let pandas print up to 1000 characters per cell so long text fields are readable.
pd.set_option('display.max_colwidth', 1000)

## Analysis Metadata

In [4]:
# Load the per-region metadata CSVs of the reports, tag every row with its
# region, and stack them into a single frame.
meta_template = '../../../Daten_pcm_export/meta_infos_rbs_{}.csv'
mdf = pd.concat(
    [
        pd.read_csv(meta_template.format(region), delimiter=';').assign(region=region)
        for region in ('norden', 'osten', 'sueden', 'westen')
    ],
    axis=0,
)

# Keep only the first row per document id and renumber the index from 0.
mdf = mdf.drop_duplicates('ID_DOKUMENT').reset_index(drop=True)

In [3]:
# mdf['RBS_REIHENFOLGE'][:30]

In [5]:
# Show the number of reports per client case: tabulate how many metadata rows
# carry each RBS_REIHENFOLGE value.
order_col = 'RBS_REIHENFOLGE'
wh.one_count(mdf, order_col)  # project helper; presumably prints a frequency summary -- TODO confirm
pd.crosstab(index=mdf[order_col], columns='count')

col_0,count
RBS_REIHENFOLGE,Unnamed: 1_level_1
1,15057
2,8148
3,4302
4,2360
5,1300
6,697
7,368
8,179
9,63
10,34


## Load data

In [8]:
# Load every CSV in data/ (first column is the index) and stack them.
# glob.glob returns paths in arbitrary order; sorting makes the row order --
# and therefore which duplicate a later keep-first drop_duplicates retains --
# reproducible between runs and machines.
csv_paths = sorted(glob.glob('data/*.csv'))
if not csv_paths:
    # pd.concat would fail with an opaque "No objects to concatenate" error.
    raise FileNotFoundError("no CSV files found matching 'data/*.csv'")
df = pd.concat([pd.read_csv(path, index_col=0) for path in csv_paths])
# Missing report texts become empty strings so len() below works on every row.
df = df.fillna({'text': ''})
df['textlen'] = df['text'].apply(len)

df["text"][:1]
# df['nn_lemmas'][:1]
# Parse a stringified list: quotes and brackets are removed, then the string
# is split into the individual lemmas.
def parse_list(s):
    """Turn a string such as "['a', 'b']" into the list ['a', 'b'].

    All single quotes and square brackets are stripped anywhere in the
    string; the remainder is split on ', '. An empty list string "[]"
    therefore yields [''].
    """
    without_markup = s.translate(str.maketrans('', '', "'[]"))
    return without_markup.split(', ')

# df['nn_lemmas'] = df['nn_lemmas'].apply(parse_list)
# df['vv_lemmas'] = df['vv_lemmas'].apply(parse_list)
# df['adja_lemmas'] = df['adja_lemmas'].apply(parse_list)
# df['nn_lemmas'][:1]
# Keep only the first row for every document id and renumber the index.
id_col = 'ID_DOKUMENT'
df = df.drop_duplicates(id_col).reset_index(drop=True)
# Align the metadata rows to df's document order, then copy over every
# metadata column that df does not have yet.
tmp = mdf.set_index(id_col).reindex(df[id_col])
missing_cols = [col for col in tmp.columns if col not in df.columns]
for col in missing_cols:
    df[col] = tmp[col].values

## Identifying sentences with violence

False positives:
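- Hitting (stem *schlag*): Vorschlag (suggestion), Vorschläge (suggestions), schlug vor (suggested), schlage ich vor (I suggest), Schlägereien (fights), angeschlagen (affected), Teilungsvorschlag (division proposal)
- Pressure (stem *druck*): auf Druck hin (under pressure), Eindruck (impression), Ausdruck (expression)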

Others:
- losgegangen (went for / attacked someone, cf. "aufeinander losgehen")
- psychological pressure

Difficult cases:
- State of mind, wishes, will (Which solutions or approach does the child or adolescent suggest)
## Sätze mit Gewalt identifizieren

Falsch positive:
- Schlagen: Vorschlag, Vorschläge, schlug vor, schlage ich vor, Schlägereien, angeschlagen, Teilungsvorschlag
- Druck: Auf Druck hin, Eindruck, Ausdruck

Weitere:
- losgegangen
- psychischer Druck

Schwierige Fälle:
- Befindlichkeit, Wünsche, Wille (Welche Lösungen bzw. welches Vorgehen schlägt das Kind bzw. der/die Jugendliche vor)

In [13]:
# df.loc[:5,["text", "VORNAME", "NAME", "VORNAME_MUTTER", "NAME_MUTTER", "VORNAME_VATER", "NAME_VATER"]]

In [14]:
# Draw a random sample of at most 100 rows. DataFrame.sample raises a
# ValueError when n exceeds the number of rows (sampling without
# replacement), so n is capped at the frame length.
sample_df = df.sample(n=min(100, len(df)))

In [15]:
# Search the text for violence stems and extract each matching sentence together with the two sentences before and after it
class ViolenceFinder:
    """Split a text into sentence-like segments and flag those mentioning violence.

    A segment is flagged when it contains a word matching one of
    RE_POS_STEMS that does not also contain one of FALSE_POS_STEMS.

    Attributes set by the constructor:
        text:      the original input string.
        sentences: result of splitting ``text`` with SPLITPATTERN. Because the
                   pattern is wrapped in a capturing group, the separators are
                   kept, so the list alternates text segment (even index) and
                   separator (odd index).
        pos_words: per element of ``sentences``, the list of matching words.
        labels:    per element of ``sentences``, True if any word matched.
    """

    # Regex fragments (matched against lower-cased text) that indicate violence.
    RE_POS_STEMS = ['gewalt', r'schl\wg', 'übergriff']
    # A matched word is discarded when it contains one of these substrings.
    FALSE_POS_STEMS = ['schlagmüller','ausschlag','einschlag','eingeschlag','niederschlag','niedergeschlag',
                      'schicksalsschlag','angeschlag','vorschlag','vorgeschlag','vorschläge','voranschlag',
                      'ratschlag','ratschläge','schlagzeug','schleg']
    # Boilerplate removed (case-insensitively) before searching for stems.
    CUT_PHRASES = ['(Welche Lösungen bzw. welches Vorgehen schlägt das Kind bzw. der/die Jugendliche vor)']
    # Separator appended after each segment in the context strings / marked text.
    SPLITCHAR = '. | '
    # Terminator appended to a single extracted sentence.
    SPLITCHARFULLSTOP = '.'
    # Section headings that also act as split points.
    SPLITPHRASES = ['Ausgangslage, Auftrag und Ziele\n','Situation, Entwicklung\n','Auftrag und Ziele\n',
                   'Vorgeschichte\n']
    # Split on: a full stop not preceded by one of the three-character
    # expressions in the look-behind (abbreviations, single letters, digits),
    # ';', ':', enumeration markers like "\na)", '--', blank lines, dates
    # followed by a newline, and the section headings above.
    # The headings are inserted via re.escape without a leading backslash:
    # previously the pattern ended in "\{}", which turned the first heading
    # into "\Ausgangslage..." where "\A" is the start-of-string anchor.
    SPLITPATTERN = (
        r'((?<!(z\.B|.\s\w|.\W\w|..\d|Art|bzw|geb|Abs|Stv|etc))\.'
        r'|;|:|\n\w\)|--|\n\n|\d*\.\d*\.\d\d\d\d\n|{})'
    ).format('|'.join(map(re.escape, SPLITPHRASES)))

    def __init__(self, text):
        """Split ``text`` and label every resulting element."""
        self.text = text
        # re.split also returns the capture groups; the group inside the
        # negative look-behind never participates and shows up as None.
        self.sentences = [s for s in re.split(self.SPLITPATTERN, text) if s is not None]
        self.pos_words = [self._find_pos_words(s) for s in self.sentences]
        self.labels = [len(words) > 0 for words in self.pos_words]

    def _find_pos_words(self, text):
        """Return the lower-cased words in ``text`` that indicate violence."""
        lowered = text.lower()
        for phrase in self.CUT_PHRASES:
            lowered = lowered.replace(phrase.lower(), '')
        # One alternation matching any whole word that contains a positive stem.
        alternatives = '|'.join(r'\b\w*{}\w*\b'.format(stem) for stem in self.RE_POS_STEMS)
        candidates = re.findall('(' + alternatives + ')', lowered)
        return [
            word for word in candidates
            if not any(fp_stem in word for fp_stem in self.FALSE_POS_STEMS)
        ]

    def label(self):
        """Return whether any element of the text was flagged."""
        return np.any(self.labels)

    def _clean(self, index):
        """Return element ``index`` of ``sentences`` without newlines and outer whitespace."""
        return self.sentences[index].replace('\n', '').strip()

    def _context(self, index, suffix):
        """Return the cleaned element at ``index`` plus ``suffix``, or SPLITCHAR if out of range.

        The explicit range check matters: a negative index would silently
        wrap around to the end of the list instead of raising IndexError.
        """
        if 0 <= index < len(self.sentences):
            return self._clean(index) + suffix
        return self.SPLITCHAR

    def pos_sentences(self):
        """Return the flagged sentences, alone and with surrounding context.

        Returns:
            A tuple ``(out, out3, out5)`` of three equally long lists:
            ``out`` holds each flagged sentence followed by SPLITCHARFULLSTOP,
            ``out3`` the sentence with one neighbour on each side, and
            ``out5`` the sentence with two neighbours on each side. Segments
            are joined with SPLITCHAR; a neighbour that does not exist is
            represented by SPLITCHAR alone.
        """
        out, out3, out5 = [], [], []
        spaced = self.SPLITCHAR + ' '
        for ind, flagged in enumerate(self.labels):
            if not flagged:
                continue
            out.append(self._clean(ind) + self.SPLITCHARFULLSTOP)
            # Neighbouring text segments are two positions apart because the
            # separators sit in between.
            before2 = self._context(ind - 4, spaced)
            before1 = self._context(ind - 2, spaced)
            current = self._clean(ind) + spaced
            after1 = self._context(ind + 2, self.SPLITCHAR)
            after2 = self._context(ind + 4, self.SPLITCHAR)
            out3.append(before1 + current + after1)
            out5.append(before2 + before1 + current + after1 + after2)
        return out, out3, out5

    def marked_text(self, ansi='\033[44;33m', nul = "\033[0m"):
        """Return all elements joined by SPLITCHAR, flagged ones wrapped in ``ansi``/``nul``."""
        parts = []
        for flagged, segment in zip(self.labels, self.sentences):
            if flagged:
                segment = ansi + segment + nul
            parts.append(segment + self.SPLITCHAR)
        return ''.join(parts)

    def pprint(self):
        """Print the whole text with the flagged elements highlighted."""
        print(self.marked_text())

    def ppprint(self):
        """Print every flagged sentence together with one neighbour on each side.

        The previous implementation read attributes that are never set and
        therefore always raised AttributeError.
        """
        _, with_context, _ = self.pos_sentences()
        for snippet in with_context:
            print(snippet)
    
# Try the finder on the report text stored under index label 2.
vf = ViolenceFinder(df['text'].loc[2])
# vf.pprint()
# vf.ppprint()


In [None]:
# Collect every flagged sentence -- alone and with its 3- and 5-sentence
# context -- together with the identifiers of the report it came from.
id_keys = ['ID_DOKUMENT', 'ID_K', 'RBS_REIHENFOLGE']
id_dict = {key: [] for key in id_keys}
sentences, sentences3, sentences5 = [], [], []
for _, row in df.iterrows():
    pos_sentences, pos_sentences3, pos_sentences5 = ViolenceFinder(row['text']).pos_sentences()
    sentences.extend(pos_sentences)
    sentences3.extend(pos_sentences3)
    sentences5.extend(pos_sentences5)

    # Repeat the report identifiers once per flagged sentence so all lists stay aligned.
    n_hits = len(pos_sentences)
    for key in id_keys:
        id_dict[key].extend([row[key]] * n_hits)

# One row per flagged sentence.
sent_df = pd.DataFrame(id_dict)
# Number of flagged sentences sharing the same ID_K, broadcast back to every row.
per_case = sent_df.groupby(['ID_K'])[['ID_K']].count()['ID_K']
sent_df['Nsatz'] = per_case.reindex(sent_df['ID_K']).values
for column, values in (
    ('Gewalt', 1),
    ('Gegen Kind', np.nan),
    ('Satz', sentences),
    ('Satz3', sentences3),
    ('Satz5', sentences5),
):
    sent_df[column] = values

print('Anzahl Sätze:', len(sent_df))
print('Anzahl Dokumente:', len(sent_df['ID_DOKUMENT'].unique()))
sent_df.head()
sent_df.ID_DOKUMENT.is_unique
df

In [19]:
# Add the case metadata columns listed below to every sentence row, matched
# on the document id.
meta_cols = ['ID_COMBO_ANREDE', 'VORNAME', 'NAME', 'VORNAME_MUTTER', 'NAME_MUTTER',
             'VORNAME_VATER', 'NAME_VATER']
sent2_df = sent_df.merge(df[['ID_DOKUMENT'] + meta_cols], on="ID_DOKUMENT", how="left")
pd.options.display.max_colwidth = 100000

# Drop duplicates: keep the first row for every distinct sentence text.
# NOTE: an earlier, disabled variant de-duplicated on ID_DOKUMENT combined
# with the first 100 characters of 'Satz' instead.
sent2_df = sent2_df.drop_duplicates(subset='Satz', keep="first")
# Clean up the dataframe: keep the export columns in their final order.
sent2_df = sent2_df[['Satz', 'ID_DOKUMENT', 'ID_K', 'Satz3', 'Satz5'] + meta_cols]

# Save the text sections, sorted by sentence in descending order, as an
# Excel file and as a CSV file.
export_df = sent2_df.sort_values(by=['Satz'], ascending=False)
export_df.to_excel('GewaltSaetze5_20230213.xlsx', index=False)
export_df.to_csv('GewaltSaetze5_20230213.csv', index=False)

# End