In [1]:
import pandas as pd 
import numpy as np
import os
import re

In [2]:
# Load the raw English/German translation data from the Excel workbook.
df = pd.read_excel("data/translation-data.xlsx")

In [3]:
# Display the loaded frame to inspect the raw text before cleaning.
df

Unnamed: 0,English,German
0,"""The mask with a grinning man's face appears c...",Die Maske mit grinsendem Männergesicht wirkt d...
1,The WSWS posted this comment on the slanderous...,Die WSWS hatte den folgenden Kommentar zu dies...
2,The Haitian manner of spelling 'vodou' was int...,"""Für diese Ausstellung wurde bewusst die haiti..."
3,The database also records new manufacturers in...,Die Datenbank nimmt bei jeder neuen Herausgabe...
4,Medially this situation is hushed up with just...,"Medial wird diese Situation, wenige Ausnahmen ..."
...,...,...
95,You will then have some free time before the d...,Es wird auch einige Zeit zur freien Verfügung ...
96,Our grandchildren Saphia and Avia are visiting...,Unsere Enkelkinder Saphia und Avia sind bei de...
97,Fashion and advertising have ensured a lasting...,Mode und Werbung sorgen für ein dauerhaftes re...
98,Their caps are brown <<tag:ends />> and streaked.,Der Oberkopf ist braun und gestreift.


In [4]:
def remove_html_tags(text):
    """Strip markup tags from a string.

    Double-bracket tags (``<<...>>``) are removed first, then ordinary
    single-bracket tags (``<...>``). Both matches are non-greedy.
    """
    # Order matters: the double-bracket form must go before the single one.
    for tag_pattern in ('<<.*?>>', '<.*?>'):
        text = re.compile(tag_pattern).sub('', text)
    return text

In [5]:
def dequote(s):
    """
    Strip one pair of matching outer quotes (single or double) from a string.

    The string is returned unchanged when it is shorter than two characters,
    when its first and last characters differ, or when the first character
    is not a quote.
    """
    if len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening != closing:
        return s
    if opening in ("'", '"'):
        return s[1:-1]
    return s

In [6]:
def replace_multiple_dots_spaces(text):
    """Replace every run of dots with a space, then collapse repeated spaces.

    A single dot also counts as a run, so all full stops are removed
    (``"a. b"`` becomes ``"a b"``); dots are not collapsed to a single dot.

    Args:
        text: String to normalise.

    Returns:
        The string with each dot run replaced by one space and consecutive
        spaces squeezed to one. Leading/trailing spaces are not trimmed.
    """
    # Raw strings keep the regex escape from being parsed as an (invalid)
    # Python string escape sequence.
    text = re.sub(r'\.+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text

def replace_multiple_back_slash(text):
    """Return the string with every backslash character removed."""
    # Splitting on the backslash and re-joining drops all occurrences.
    return "".join(text.split("\\"))

def replace_special_characters(text):
    """Replace each run of the low opening quote mark („) with one space.

    Args:
        text: String to clean.

    Returns:
        The string with every run of one or more „ characters replaced by
        a single space.
    """
    # „ has no special meaning in a regex, so it needs no backslash; the
    # original '\„' was an invalid Python string escape.
    text = re.sub(r'„+', ' ', text)
    return text

In [7]:
# Spot-check the backslash removal on row 70 of the English column.
replace_multiple_back_slash(df.loc[70, "English"])

'The text is from Et suk igennem verden går (A sigh throughout the world doth go), no.'

In [8]:
# Namespace class bundling the text-cleaning helpers.
class CleanText:
    """Collection of stateless text-cleaning helpers.

    Every helper is a ``staticmethod``, so it can be called as
    ``CleanText.<name>(text)`` (or on an instance) and ``clean_functions``
    resolves its helpers through the class rather than depending on
    same-named module-level functions having been defined beforehand.
    """

    @staticmethod
    def remove_html_tags(text):
        """Remove ``<<...>>`` tags first, then ``<...>`` tags, from a string."""
        clean1 = re.compile('<<.*?>>')
        clean2 = re.compile('<.*?>')
        text = re.sub(clean1, '', text)
        text = re.sub(clean2, '', text)
        return text

    @staticmethod
    def dequote(text):
        """
        If a string has single or double quotes around it, remove them.
        Make sure the pair of quotes match.
        If a matching pair of quotes is not found,
        or there are less than 2 characters, return the string unchanged.
        """
        if (len(text) >= 2 and text[0] == text[-1]) and text.startswith(("'", '"')):
            return text[1:-1]
        return text

    @staticmethod
    def replace_multiple_dots_spaces(text):
        """Replace every run of dots with a space, then collapse repeated spaces.

        A single dot also counts as a run, so all full stops are removed;
        dots are not collapsed to a single dot.
        """
        # Raw strings avoid the invalid '\.' Python string escape.
        text = re.sub(r'\.+', ' ', text)
        text = re.sub(r' +', ' ', text)
        return text

    @staticmethod
    def replace_multiple_back_slash(text):
        """Remove every backslash character from a string."""
        text = text.replace("\\", "")
        return text

    @staticmethod
    def replace_special_characters(text):
        """Replace each run of the low opening quote mark („) with one space."""
        # „ is not a regex metacharacter, so no escaping is required.
        text = re.sub(r'„+', ' ', text)
        return text

    @staticmethod
    def clean_functions(text):
        """Run the full cleaning pipeline on one string.

        Steps, in order: remove tags, strip a matching pair of outer quotes,
        remove backslashes, replace „ runs with a space, replace dot runs
        with a space and collapse spaces, then trim surrounding whitespace.

        Args:
            text: String to clean.

        Returns:
            The cleaned, stripped string.
        """
        # Call the helpers through the class so the pipeline is
        # self-contained and does not rely on module-level duplicates.
        text = CleanText.remove_html_tags(text)
        text = CleanText.dequote(text)
        text = CleanText.replace_multiple_back_slash(text)
        text = CleanText.replace_special_characters(text)
        text = CleanText.replace_multiple_dots_spaces(text)
        return text.strip()

In [9]:
# Show the raw German text of row 70 for comparison.
df.loc[70, "German"]

'Der Text stammt aus Nr. 139 des Dänischen Gesangbuchs (1953), „Ein Seufzer durch die Welt geht".'

In [10]:
# Run the full cleaning pipeline on a single English sample (row 89).
text = df.loc[89, "English"]
CleanText.clean_functions(text)

'Prior says the latent potential in Thailand is enormous'

In [11]:
# Clean both language columns with the same pipeline. The function is passed
# directly to ``apply``; wrapping it in a lambda adds nothing.
df['Clean_English'] = df['English'].apply(CleanText.clean_functions)
df['Clean_German'] = df['German'].apply(CleanText.clean_functions)

In [12]:
# Swap the raw text columns for their cleaned versions. A new frame is built
# instead of mutating in place.
df = (
    df.drop(columns=["English", "German"])
      .rename(columns={"Clean_English": "English", "Clean_German": "German"})
)
# Write into the data/ folder: the notebook reads the cleaned file back from
# "data/CleanData.xlsx", so writing to the working directory would leave that
# read pointing at a missing or stale file.
df.to_excel("data/CleanData.xlsx", index=False)

In [13]:
# Read the cleaned workbook back from disk into a separate frame.
new_df = pd.read_excel("data/CleanData.xlsx")

In [14]:
# Display the reloaded, cleaned frame.
new_df

Unnamed: 0,English,German
0,The mask with a grinning man's face appears ca...,Die Maske mit grinsendem Männergesicht wirkt d...
1,The WSWS posted this comment on the slanderous...,Die WSWS hatte den folgenden Kommentar zu dies...
2,The Haitian manner of spelling 'vodou' was int...,Für diese Ausstellung wurde bewusst die haitia...
3,The database also records new manufacturers in...,Die Datenbank nimmt bei jeder neuen Herausgabe...
4,Medially this situation is hushed up with just...,"Medial wird diese Situation, wenige Ausnahmen ..."
...,...,...
95,You will then have some free time before the d...,Es wird auch einige Zeit zur freien Verfügung ...
96,Our grandchildren Saphia and Avia are visiting...,Unsere Enkelkinder Saphia und Avia sind bei de...
97,Fashion and advertising have ensured a lasting...,Mode und Werbung sorgen für ein dauerhaftes re...
98,Their caps are brown and streaked,Der Oberkopf ist braun und gestreift
