In [1]:
import json
import re
import string
from functools import lru_cache

import pandas as pd
from bs4 import BeautifulSoup
from cleantext import clean
from nltk.corpus import stopwords

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [None]:
def read_reviews(file_path, encoding='utf-8'):
    """Read a text file and return its lines as a list of strings.

    Every element keeps its trailing newline exactly as ``readlines()``
    yields it, so an empty line in the file comes back as a lone newline
    entry rather than being dropped.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            non-ASCII characters (e.g. German umlauts) decode the same way on
            every platform instead of depending on the locale default.

    Returns:
        list[str]: One entry per line of the file, in file order.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.readlines()

In [6]:
# ML-based reviews come from a single text file; every line becomes one row.
file_path = '../raw/ml-based.txt'
ml_based = read_reviews(file_path)

# Core reviews are spread over a train and a test file; train lines come first.
core = read_reviews('../raw/core/train_reviews.txt') + read_reviews('../raw/core/test_reviews.txt')

# Label each line with its group, then stack both groups under a fresh 0..n-1 index.
ml_df = pd.DataFrame({'Group': 'ML-based', 'review_raw': ml_based})
core_df = pd.DataFrame({'Group': 'Core', 'review_raw': core})
reviews_raw = pd.concat([ml_df, core_df], ignore_index=True)
reviews_raw


Unnamed: 0,Group,review_raw
0,ML-based,<p>persona: </p><ul><li>die grundzüge personas...
1,ML-based,<p>stärken lösung:</p><ul><li>die personas gut...
2,ML-based,<p>stärken lösung:</p><ul><li>die personas gut...
3,ML-based,"<p>hello there, hoffe, feedback weiterentwickl..."
4,ML-based,"<p>hello there, voll idee begeistert! ich hoff..."
...,...,...
24162,Core,"Hallo, Hier findest du meine Reventuelliew für..."
24163,Core,\n
24164,Core,"Ich finde es eine gute Idee, den klassischen O..."
24165,Core,\n


### Text-processing functions

In [7]:
def female_pronouns(sentences):
    """Remove capitalised ``Sie`` / ``Ihr*`` pronouns that directly follow a word.

    Every ``<word><whitespace><pronoun>`` occurrence, where the pronoun is one
    of ``Sie, Ihr, Ihrem, Ihren, Ihrer, Ihres`` (case-sensitive), is replaced
    by ``<word>`` alone. A pronoun that is not preceded by a word and a single
    whitespace character (e.g. at the very start of the text or right after a
    full stop) is left untouched.

    NOTE(review): presumably this keeps the capitalised formal address from
    being confused with the lower-case feminine pronouns once the text is
    lower-cased later on; confirm the intent with the author.

    Args:
        sentences: Text to process; may contain several sentences.

    Returns:
        str: The text with the matched pronouns removed.
    """
    pattern = r"\b(\w+)\s(?:Sie|Ihr|Ihrem|Ihren|Ihrer|Ihres)\b"
    # Substitute each match with ITS OWN captured word. The previous version
    # reused the word captured by the first match for every later match in the
    # same sentence, which overwrote unrelated words. A match can never contain
    # a '.', so a single pass over the whole text is equivalent to going
    # sentence by sentence.
    return re.sub(pattern, r"\1", sentences)

In [8]:
def clean_text(review):
    """Normalise one raw review (plain text or HTML) into cleaned text.

    Steps, in order: strip HTML tags, drop URLs and ``.pdf`` file names,
    replace list indices such as ``1.`` or ``2)`` with a space, spell out
    ``e3``/``E3`` as ``E-three``, drop ``v1``/``v2`` style version markers,
    flatten newlines, remove capitalised formal pronouns via
    ``female_pronouns``, run ``cleantext.clean`` with the options below, and
    finally collapse runs of whitespace.

    Args:
        review: Raw review string.

    Returns:
        str: The cleaned review without leading/trailing whitespace.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    sentences = BeautifulSoup(review, "html.parser").get_text(" ")
    # Remove URLs and .pdf file names FIRST, while their dots and digits are
    # still intact; stripping list indices beforehand would split something
    # like "page1.html" or "report1.pdf" and leave fragments behind.
    sentences = re.sub(r'http\S+', '', sentences)
    # The dot is escaped so only real "<name>.pdf" tokens are removed.
    sentences = re.sub(r'\w*\.pdf', '', sentences)
    # replace indexing with a space
    sentences = re.sub(r"\d+[).]+", ' ', sentences)
    # keep e3 values: spelled with letters before digits are stripped below
    sentences = re.sub(r'[eE]3', 'E-three', sentences)
    # remove version numbers
    sentences = re.sub(r'[vV][12]', '', sentences)
    # replace new line with space
    sentences = sentences.replace('\n', ' ')

    sentences = female_pronouns(sentences)
    # use package
    sentences = clean(sentences, no_emoji=True, lower=True,
        no_urls=True, no_emails=True, no_phone_numbers=True, no_numbers=True,
        no_digits=True, no_currency_symbols=True, no_punct=True,
        replace_with_url='', replace_with_email='', replace_with_phone_number='',
        replace_with_number='', replace_with_digit='', replace_with_currency_symbol='', lang='de')

    # substitute multiple whitespace with single whitespace
    # Also, removes leading and trailing whitespaces
    text_no_doublespace = re.sub(r'\s+', ' ', sentences).strip()
    return text_no_doublespace

In [9]:
# Abbreviations removed in modes 1 and "other". Entries containing a space
# ('d h', 'z b') can never equal a whitespace-split token; they are kept only
# so the list stays identical to the original.
_ABBREVIATIONS = frozenset([
    'ggf.', 'ggf', 'vlt.', 'vlt', 'dh.', 'd.h.', 'd.h', 'd h', 'dh',
    'zb.', 'z.b.', 'z.b', 'z b', 'zb', 'bsp.', 'bsp', 'bspw.', 'bspw',
    'oä.', 'o.ä.', 'oä', 'ev.', 'ev', 'evtl.', 'evtl',
])


@lru_cache(maxsize=None)
def _load_word_set(path):
    """Read one stripped word per line from ``path``; cached per path."""
    with open(path, 'r', encoding='utf-8') as file:
        return frozenset(line.strip() for line in file)


@lru_cache(maxsize=None)
def _german_stop_words():
    """Return NLTK's German stop-word list as a frozenset; loaded once."""
    return frozenset(stopwords.words('german'))


def stop_word_removal(x, remove_stop_words):
    """Filter whitespace-separated tokens of ``x`` according to a mode.

    Args:
        x: Text to filter; it is split on whitespace.
        remove_stop_words: Filtering mode.
            ``1``: drop German stop words, except words listed in
            ``../scripts/male.txt`` / ``../scripts/female.txt``, and drop the
            abbreviations.
            ``2``: drop all German stop words (abbreviations are kept).
            anything else: drop the abbreviations only.

    Returns:
        str: Remaining tokens joined by single spaces, with every ``.``
        replaced by a newline.

    Raises:
        FileNotFoundError: In mode ``1`` if a gender word file is missing.

    The word files and the NLTK list are loaded lazily and cached, so calling
    this once per row no longer re-reads them each time, and modes that do not
    need them do not touch them at all. Edits to the word files made after the
    first call are not picked up until the kernel is restarted.
    """
    tokens = x.split()
    if remove_stop_words == 1:
        gender_words = (_load_word_set('../scripts/male.txt')
                        | _load_word_set('../scripts/female.txt'))
        blocked = (_german_stop_words() - gender_words) | _ABBREVIATIONS
    elif remove_stop_words == 2:
        blocked = _german_stop_words()
    else:
        blocked = _ABBREVIATIONS
    # One set lookup per token instead of rebuilding a list/set for each one.
    cleaned = ' '.join(w for w in tokens if w not in blocked)
    return cleaned.replace('.', '\n')

In [10]:
def clean_full(review, remove_stop_words):
    """Run the complete cleaning pipeline on a single review.

    The review is first normalised with ``clean_text`` and then filtered with
    ``stop_word_removal`` using the given mode. Any punctuation or whitespace
    left at the very beginning of the result is trimmed off.

    Args:
        review: Raw review string.
        remove_stop_words: Mode forwarded unchanged to ``stop_word_removal``.

    Returns:
        str: The cleaned and filtered review.
    """
    normalised = clean_text(review)
    filtered = stop_word_removal(normalised, remove_stop_words=remove_stop_words)
    # Characters that may not appear at the start of the final text.
    leading_junk = string.punctuation + string.whitespace
    return filtered.lstrip(leading_junk)

In [11]:
# Clean every raw review. Mode 0 strips only the abbreviation list, so regular
# stop words are still present despite the column being called "review_nostop".
reviews_raw['review_nostop'] = reviews_raw['review_raw'].apply(clean_full, remove_stop_words=0)



In [13]:
# Write the frame to CSV; index=False keeps the row index out of the file.
reviews_raw.to_csv('../outputs/reviews_raw_noAI.csv', index=False)