In [1]:
import spacy
from spacy import displacy
import re
import de_dep_news_trf

# Load the German transformer pipeline exactly once.
# The cell previously called spacy.load("de_dep_news_trf") first and then
# immediately overwrote `nlp` with de_dep_news_trf.load(), so the model was
# loaded twice. Only the second assignment ever took effect, so it is the
# one that is kept.
nlp = de_dep_news_trf.load()

In [2]:
import pandas as pd

# Location of the Excel workbook that is loaded into the DataFrame `df`
excel_path = r'C:\Users\matte\OneDrive\Masterstudium\MA gehören\new data\code\gehören_posTI45k.xlsx'
df = pd.read_excel(excel_path)

In [3]:
# Number the rows consecutively, starting at 1, in a new 'sentence_id' column
df['sentence_id'] = pd.RangeIndex(start=1, stop=len(df) + 1)

In [4]:
# Split 'Meta' at its first underscore: the part before it becomes 'Zeitung',
# everything after it is parked in the scratch column 'Rest'
meta_parts = df['Meta'].str.split('_', n=1, expand=True)
df[['Zeitung', 'Rest']] = meta_parts

# Keep only the first run of four digits from the remainder as 'Datum'
four_digits = df['Rest'].str.extract(r'(\d{4})')
df['Datum'] = four_digits

# The scratch column is no longer needed
df = df.drop(columns='Rest')

In [5]:
# Copy of `df` without the metadata columns
metadata_columns = ['Meta', 'Zeitung', 'Datum', 'Ressort', 'Mediatype', 'Region']
df_ohnemeta = df.drop(columns=metadata_columns)

In [6]:
# Text normalisation helper
def normalize_text(text):
    """Delete every run of digits, then collapse each whitespace run to one space.

    Leading and trailing whitespace is collapsed like any other run, so a
    single space may remain at either end of the returned string.
    """
    without_digits = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', without_digits)

In [7]:
def _join_row_values(row):
    """Concatenate the string form of every column value of one row, space-separated."""
    return ' '.join(str(row[name]) for name in df_ohnemeta.columns)

# Build one string per row from all current columns, then normalise it
joined_rows = df_ohnemeta.apply(_join_row_values, axis=1)
df_ohnemeta['text'] = joined_rows
df_ohnemeta['text'] = df_ohnemeta['text'].apply(normalize_text)


In [8]:
# Cut each text at the '<s>' markers; every row gets a list of pieces
sentence_lists = df_ohnemeta['text'].str.split(r'<s>')
df_ohnemeta['sentences'] = sentence_lists

# One row per piece (the other columns are repeated for each piece)
df_ohnemeta = df_ohnemeta.explode('sentences')


In [9]:
# Cut each sentence at every character that is neither a word character
# nor whitespace (i.e. punctuation marks and other symbols)
punctuation_pieces = df_ohnemeta['sentences'].str.split(r'[^\w\s]')
df_ohnemeta['punctuation'] = punctuation_pieces

# One row per resulting piece
df_ohnemeta = df_ohnemeta.explode('punctuation')

In [10]:
# Cut each piece at the standalone word "und" (word boundaries on both sides)
und_pieces = df_ohnemeta['punctuation'].str.split(r'\bund\b')
df_ohnemeta['splitund'] = und_pieces

# One row per resulting fragment
df_ohnemeta = df_ohnemeta.explode('splitund')

In [11]:
# Run every fragment through the spaCy pipeline; the column then holds the
# pipeline's return value for each fragment instead of the raw string
parsed_fragments = df_ohnemeta['splitund'].apply(nlp)
df_ohnemeta['splitund'] = parsed_fragments

In [12]:
# Keep only the rows whose parsed fragment contains a token with the lemma 'gehören'
target_word = 'gehören'


def _has_target_lemma(doc):
    """Return True if any token of `doc` has `target_word` as its lemma."""
    return any(token.lemma_ == target_word for token in doc)


contains_target = df_ohnemeta['splitund'].apply(_has_target_lemma)
df_filtered = df_ohnemeta[contains_target]

In [13]:
# Keep only the first matching fragment for each sentence_id
df_unique = df_filtered.drop_duplicates(subset=['sentence_id'], keep='first')

In [14]:
# Re-attach the metadata columns to the filtered fragments via sentence_id.
# how='right' keeps exactly the rows present in df_unique.
meta_columns = ['sentence_id', 'Zeitung', 'Datum', 'Ressort', 'Mediatype', 'Region']
hit_columns = ['sentence_id', 'text', 'splitund']
df_merged = df[meta_columns].merge(df_unique[hit_columns], on='sentence_id', how='right')

In [15]:
# Function to extract the forms of "gehören"
def extract_gehören(text):
    """Return the surface forms of all tokens whose lemma is 'gehören'.

    Parameters
    ----------
    text : str or already-parsed document
        A raw string is parsed with the module-level ``nlp`` pipeline.
        Any other value is treated as an already-parsed document (an iterable
        of tokens exposing ``.text`` and ``.lemma_``) and is used as-is.
        The 'splitund' column this function is applied to already holds the
        output of ``nlp``, so parsing it again would only repeat the work of
        the pipeline for every row.

    Returns
    -------
    str
        The matching token texts in document order, joined by ', '.
        An empty string if no token matches.
    """
    # Only parse raw strings; reuse annotations that already exist.
    doc = nlp(text) if isinstance(text, str) else text
    return ', '.join(token.text for token in doc if token.lemma_ == 'gehören')

# Store the comma-separated gehören forms of each fragment in a new column
gehoeren_forms = df_merged['splitund'].apply(extract_gehören)
df_merged['gehören'] = gehoeren_forms

In [16]:
# Function to extract VVPP-tagged verbs
def extract_vvpp_verbs(text):
    """Return the surface forms of all tokens tagged 'VVPP', except forms of 'gehören'.

    Parameters
    ----------
    text : str or already-parsed document
        A raw string is parsed with the module-level ``nlp`` pipeline.
        Any other value is treated as an already-parsed document (an iterable
        of tokens exposing ``.text``, ``.tag_`` and ``.lemma_``) and is used
        as-is. The 'splitund' column this function is applied to already holds
        the output of ``nlp``, so parsing it again would only repeat the work
        of the pipeline for every row.

    Returns
    -------
    str
        The matching token texts in document order, joined by ', '.
        An empty string if no token matches.
    """
    # Only parse raw strings; reuse annotations that already exist.
    doc = nlp(text) if isinstance(text, str) else text
    return ', '.join(
        token.text
        for token in doc
        if token.tag_ == 'VVPP' and token.lemma_ != 'gehören'
    )

# Store the comma-separated VVPP verbs of each fragment in a new column
vvpp_forms = df_merged['splitund'].apply(extract_vvpp_verbs)
df_merged['VVPP'] = vvpp_forms

In [17]:
import openpyxl
# Print pandas' plain-text view of the merged frame, then write it to an
# Excel workbook without the index column
print(df_merged)
output_path = 'results_posTI45kleicht.xlsx'
df_merged.to_excel(output_path, index=False)

       sentence_id Zeitung Datum    Ressort Mediatype      Region   
0                1     APA  2010     inland   agentur     agesamt  \
1                2     APA  2010     inland   agentur     agesamt   
2                3     APA  2010      sport   agentur     agesamt   
3                5     APA  2010     inland   agentur     agesamt   
4                6     APA  2010    ausland   agentur     agesamt   
...            ...     ...   ...        ...       ...         ...   
43201        45870   WOMAN  2022  allgemein     print  spezifisch   
43202        45871   WOMAN  2022  allgemein     print  spezifisch   
43203        45872   WOMAN  2022  allgemein     print  spezifisch   
43204        45873   WOMAN  2022      lokal     print  spezifisch   
43205        45874   WOMAN  2022  allgemein     print  spezifisch   

                                         text   
0                    gehört ganz abgeschafft   \
1                        gehörte unterbunden    
2                       