In [None]:
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy

# Shared Porter stemmer instance; preprocess() uses it as the default
# value of its `stemmer` parameter.
stemmer = PorterStemmer()

In [None]:
def load_tools(language, stopwords=stopwords):
    """
    Return the stop word list and the spaCy pipeline for a language.

    Parameters
    ----------
    language : str
        Language of the data; either "german" or "english".
    stopwords : object, optional
        Stop word source exposing a ``words(name)`` method. Defaults to the
        ``stopwords`` reader imported from ``nltk.corpus``.

    Returns
    -------
    tuple
        ``(stop_words, lemmatizer)`` where ``stop_words`` is the result of
        ``stopwords.words(language)`` and ``lemmatizer`` is the object
        returned by ``spacy.load`` for the matching model.

    Raises
    ------
    ValueError
        If ``language`` is neither "german" nor "english".
    """
    # spaCy model to load for each supported language.
    spacy_models = {
        "german": "de_core_news_sm",
        "english": "en_core_web_sm",
    }
    if language not in spacy_models:
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # the real problem; raise a proper exception carrying the message.
        raise ValueError(f"{language} language not supported.")
    # Same order as before: stop words first, then the spaCy model.
    stop_words = stopwords.words(language)
    lemmatizer = spacy.load(spacy_models[language])
    return stop_words, lemmatizer

def preprocess(text, lemmatizer, stopwords, stemmer=stemmer):
    """
    Normalise one description into a space-separated string of tokens.

    Steps, in order: tokenize with gensim's ``simple_preprocess`` (accents
    stripped, token length 1-50), drop tokens found in ``stopwords``, stem
    each remaining token, run the stemmed text through ``lemmatizer`` and
    collect each token's ``lemma_``, lowercase the lemmas, and join them
    with single spaces.

    Parameters
    ----------
    text : str or float
        Raw text. Any float is treated as "no text".
        NOTE(review): presumably this catches NaN from pandas for missing
        cells - confirm.
    lemmatizer : callable
        Called with the stemmed text; must return an iterable of objects
        exposing ``lemma_``.
    stopwords : container of str
        Tokens to discard (membership is tested with ``in``).
    stemmer : object, optional
        Object exposing ``stem(token)``; defaults to the module-level stemmer.

    Returns
    -------
    str
        The processed tokens joined by spaces, or "" for float input.
    """
    if isinstance(text, float):
        return ""

    # Tokenize, filter stop words and stem in one pass.
    stemmed_text = " ".join(
        stemmer.stem(word)
        for word in simple_preprocess(text, deacc=True, min_len=1, max_len=50)
        if word not in stopwords
    )

    # Lowercase once more after lemmatizing: the original author observed
    # capitalised output at this stage and attributed it to stemming.
    # NOTE(review): it may come from the lemmatizer instead - confirm.
    return " ".join(token.lemma_.lower() for token in lemmatizer(stemmed_text))

In [None]:
# Bind the stop word list to its own name so the imported nltk `stopwords`
# reader is not overwritten. load_tools() captures that reader as a default
# argument, so re-running its definition after a rebind would break it.
stop_words, lemmatizer = load_tools("english")
filenames = [
    "open_tasks_EN",
    "augmented_BT_EN",
    "augmented_RD_EN",
    "augmented_RI_EN",
    "augmented_RS_EN",
    "augmented_SR_EN",
]

for filename in filenames:
    df = pd.read_csv("data/" + filename + ".csv")
    # Replace each description by its preprocessed form, then count the
    # whitespace-separated tokens that remain.
    df["description"] = df["description"].apply(
        lambda text: preprocess(text, lemmatizer, stop_words)
    )
    df["word_count"] = df["description"].apply(lambda s: len(s.split()))
    # index_label=False: the index column is written without a header field.
    df.to_csv("data/preprocessed_" + filename + ".csv", index_label=False)

In [None]:
# Bind the stop word list to its own name so the imported nltk `stopwords`
# reader is not overwritten. load_tools() captures that reader as a default
# argument, so re-running its definition after a rebind would break it.
stop_words, lemmatizer = load_tools("german")
filenames = [
    "open_tasks_DE",
    "augmented_BT_DE",
    "augmented_RD_DE",
    "augmented_RI_DE",
    "augmented_RS_DE",
    "augmented_SR_DE",
]

for filename in filenames:
    df = pd.read_csv("data/" + filename + ".csv")
    # Replace each description by its preprocessed form, then count the
    # whitespace-separated tokens that remain.
    df["description"] = df["description"].apply(
        lambda text: preprocess(text, lemmatizer, stop_words)
    )
    df["word_count"] = df["description"].apply(lambda s: len(s.split()))
    # index_label=False: the index column is written without a header field.
    df.to_csv("data/preprocessed_" + filename + ".csv", index_label=False)