In [136]:
from subprocess import check_output, run
from nltk.tokenize import sent_tokenize, word_tokenize
import os
import pandas as pd
import re
from pandarallel import pandarallel

In [141]:
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Run the make command so that the RFTagger is built

In [2]:
run(["make"], cwd="RFTagger/src")

strip ./rft-train ./rft-print ./rft-annotate


CompletedProcess(args=['make'], returncode=0)

In [151]:
def run_batches(df,starting_index, max_len):
    
    if starting_index > max_len:
        print("max_len must not be less than starting_index")
        return
    
    if max_len < 2000:
        print("Please specify a number larger than 2000")
        return
    
    # the run should be done in 1000 piece batches
    chosen_index = list(range(starting_index,starting_index+1000))
    
    # the 1000 piece batches are stored in separate .csv files with the naming convention
    # batch_<starting_index>_<ending_index>.csv (batch_0_999.csv)
    file_name = get_filename(chosen_index)
    
    while chosen_index[-1] <= (max_len):
        df_current = df.iloc[chosen_index]
        df_current["tagged"] = df_current.parallel_apply(
            lambda row: tag("test_{}".format(row.name),row[1]),axis=1)
        
        df_current.to_csv(file_name,index=False)
        new_index = [x+1000 for x in chosen_index]
        
        chosen_index = new_index
        file_name = get_filename(chosen_index)
    

def get_filename(index):
    return "batch_{start}_{end}.csv".format(start=index[0],end=index[-1])

def tag(filename, text):
    file = open("RFTagger/{}".format(filename),"w")
    file.write("\n\n".join("\n".join(word_tokenize(sentence, language='german')) for sentence in sent_tokenize(text, language='german')))
    file.close()
    
    res = check_output(["src/rft-annotate", "lib/german.par", filename], cwd="RFTagger").decode("utf-8").split("\n")

    os.system("rm RFTagger/{}".format(filename))
    
    return ' '.join(res)

def contains(text,tag):
    regexp = re.compile(r'{}'.format(tag))
    return bool(regexp.search(text))

In [19]:
df = pd.read_csv("../../data/protocols/all_parsed.csv")

In [157]:
run_batches(df, 60000, 63000)

reading parameter file...reading parameter file...finished
0finished
2
4
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
0finished
1
2
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
5
finished
4
reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
1
finished
4
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
8
reading parameter file...reading parameter file...finished
1
finished
5reading parameter file...
reading parameter file...finished
2
r

reading parameter file...finished
4
reading parameter file...finished
5
reading parameter file...finished
4
reading parameter file...finished
5
reading parameter file...finished
2
reading parameter file...finished
5
reading parameter file...finished
1
reading parameter file...finished
1
reading parameter file...finished
5
reading parameter file...finished
2
reading parameter file...finished
3
reading parameter file...finished
3
reading parameter file...finished
5
reading parameter file...finished
2
reading parameter file...finished
5
reading parameter file...finished
6
reading parameter file...finished
2
reading parameter file...finished
5
reading parameter file...finished
3
reading parameter file...finished
3
reading parameter file...finished
2
reading parameter file...finished
5
reading parameter file...finished
2
reading parameter file...finished
1
reading parameter file...finished
3
reading parameter file...finished
7
reading parameter file...finished
3
reading parameter file...fin

reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
8
finished
2
reading parameter file...reading parameter file...finished
5
finished
2
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
5
finished
2
reading parameter file...reading parameter file...finished
5
fin

reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
7
finished
2
reading parameter file...reading parameter file...finished
4
finished
4
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
3
finished
5
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
3
finished
2
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
4
finished
8
reading parameter file...reading parameter file...finished
6
finished
7
reading parameter file...reading parameter file...finished
7
fin

reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
1
finished
9
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
6
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
3
finished
2
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
6
finished
5
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
3
fin

reading parameter file...reading parameter file...finished
5
finished
4
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
6
finished
2
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
6
finished
1
reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
6
finished
2
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
6
finished
6
reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
3
finished
6
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
1
fin

reading parameter file...reading parameter file...finished
0finished
1
1
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
0finished
0
1
reading parameter file...reading parameter file...finished
0finished
1
1
reading parameter file...reading parameter file...finished
1
finished
7
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
1
finished
5
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
1
finished
4
reading parameter file...reading parameter file...finished
14
finished
5
reading parameter file...reading parameter file...finished
1
finished
7
reading parameter file...reading parameter file...finished
2

reading parameter file...reading parameter file...finished
5
finished
1
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
5
finished
1
reading parameter file...reading parameter file...finished
5
finished
1
reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
15
finished
2
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
3finished
4
4
reading parameter file...reading parameter file...finished
0finished
4
3
reading parameter file...reading parameter file...finished
0finished
2
4
reading parameter file...reading parameter file...finished
1

reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
6
finished
3
reading parameter file...reading parameter file...finished
1
finished
6
reading parameter file...reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
1
finished
6
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
2
finished
9
reading parameter file...reading parameter file...finished
2
fin

reading parameter file...reading parameter file...finished
2finished
4
4
reading parameter file...reading parameter file...finished
3
finished
4
reading parameter file...reading parameter file...finished
3
finished
6
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
7finished
10
5
reading parameter file...reading parameter file...finished
2finished
1
4
reading parameter file...reading parameter file...finished
finished
3
2
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
4finished
5
3
reading parameter file...reading parameter file...finished
0finished
7
2
reading parameter file...reading parameter file...finished
1finished
2
3
reading parameter file...reading parameter file...finished
5
finished
7
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finishe

reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
4
finished
4
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
6
finished
2
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
6
finished
4
reading parameter file...reading parameter file...finished
5
finished
4
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
5
finished
6
reading parameter file...reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
6
finished
6
reading parameter file...reading parameter file...finished
2
finished
5
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
4
fin

reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
1
finished
6
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
1
finished
4
reading parameter file...reading parameter file...finished
6
finished
6
reading parameter file...reading parameter file...finished
3
finished
5
reading parameter file...reading parameter file...finished
1
finished
8
reading parameter file...reading parameter file...finished
3
finished
6
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
4
finished
6
reading parameter file...reading parameter file...finished
7
finished
8
reading parameter file...reading parameter file...finished
1finished
4
4
reading parameter file...reading parameter file...finished
1finished
4
7
reading parameter file...reading parameter file...finished
0fi

reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
1
finished
5
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
2
finished
1
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
2
finished
5
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
1
fin

reading parameter file...reading parameter file...finished
1
finished
5
reading parameter file...reading parameter file...finished
3finished
4
3
reading parameter file...reading parameter file...finished
1
finished
6
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
5
finished
4
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
5
finished
5
reading parameter file...reading parameter file...finished
3
finished
2
reading parameter file...reading parameter file...finished
5
finished
1
reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
1
finished
6
reading parameter file...reading parameter file...finished
0finished
3
4
reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
2fi

In [160]:
df_short = df.tail(909)

In [161]:
df_short["tagged"] = df_short.parallel_apply(lambda row: tag("test_{}".format(row.name),row[1]),axis=1)

reading parameter file...reading parameter file...finished
finished
6
10
reading parameter file...reading parameter file...finished
0finished
1
5
reading parameter file...reading parameter file...finished
1finished
3
2
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
4
finished
7
reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
3
finished
2
reading parameter file...reading parameter file...finished
1
finished
7
reading parameter file...reading parameter file...finished
5
finished
9
reading parameter file...reading parameter file...finished
3


reading parameter file...finished
5
reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
2
finished
6reading parameter file...
reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
3
finished
6
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
7
finished
2
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
7
finished
4
reading parameter file...reading parameter file...finished
2
finished
6
reading parameter file...reading parameter file...finished
3
finished
4
reading parameter file...reading parameter file...finished
6
finished
3
reading parameter file...reading parameter file...fin

reading parameter file...reading parameter file...finished
1
finished
5
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
5
finished
3
reading parameter file...reading parameter file...finished
1
finished
3
reading parameter file...reading parameter file...finished
8
finished
1
reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
5
finished
1
reading parameter file...reading parameter file...finished
4
finished
1
reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
1
finished
1
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
4
finished
2
reading parameter file...reading parameter file...finished
1
finished
2
reading parameter file...reading parameter file...finished
1
fin

finished
2
reading parameter file...reading parameter file...finished
5
finished
4
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
4
finished
5
reading parameter file...reading parameter file...finished
8
finished
5
reading parameter file...reading parameter file...finished
6
finished
3
reading parameter file...reading parameter file...finished
3
finished
1
reading parameter file...reading parameter file...finished
2
finished
2
reading parameter file...reading parameter file...finished
4
finished
3
reading parameter file...reading parameter file...finished
2
finished
3
reading parameter file...reading parameter file...finished
2
finished
4
reading parameter file...reading parameter file...finished
5
finished
4
reading parameter file...reading parameter file...finished
3
finished
3
reading parameter file...reading parameter file...finished
1
finished
4
reading parameter file...reading parameter file...fin

reading parameter file...finished
3
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short["tagged"] = df_short.parallel_apply(lambda row: tag("test_{}".format(row.name),row[1]),axis=1)


In [163]:
df_short.to_csv("batch_63000_63909.csv",index=False)

In [144]:
tagged_text

'Diese\tPRO.Dem.Attr.-.Acc.Sg.Fem Normalität\tN.Reg.Acc.Sg.Fem erwarten\tVFIN.Full.3.Pl.Pres.Ind sich\tPRO.Refl.Subst.3.Acc.Pl.* auch\tADV die\tART.Def.Nom.Pl.Fem Bürgerinnen\tN.Reg.Nom.Pl.Fem und\tCONJ.Coord.- Bürger\tN.Reg.Nom.Pl.Masc ,\tSYM.Pun.Comma nicht\tPART.Neg nur\tADV die\tART.Def.Nom.Sg.Fem Normalität\tN.Reg.Nom.Sg.Fem ,\tSYM.Pun.Comma die\tPRO.Rel.Subst.-.Nom.Sg.Fem jetzt\tADV durch\tAPPR.Acc die\tART.Def.Acc.Sg.Fem Freiheit\tN.Reg.Acc.Sg.Fem eingekehrt\tVPP.Full.Psp ist\tVFIN.Sein.3.Sg.Pres.Ind schaut\tVFIN.Full.3.Sg.Pres.Ind man\tPRO.Indef.Subst.-.Nom.Sg.* jetzt\tADV raus\tPART.Verb ,\tSYM.Pun.Comma sieht\tVFIN.Full.3.Sg.Pres.Ind man\tPRO.Indef.Subst.-.Nom.Sg.* ,\tSYM.Pun.Comma wie\tPROADV.Inter schön\tADJD.Pos es\tPRO.Pers.Subst.3.Nom.Sg.Neut geworden\tVPP.Aux.Psp ist\tVFIN.Sein.3.Sg.Pres.Ind ,\tSYM.Pun.Comma da\tCONJ.SubFin.- die\tART.Def.Nom.Pl.Fem Inzidenzzahlen\tN.Reg.Nom.Pl.Fem stark\tADJD.Pos zurückgegangen\tVPP.Full.Psp und\tCONJ.Coord.- unter\tAPPR.Unter 50\tCARD

In [145]:
#df_short.apply(lambda row: contains(row["tagged"],"ADJA.Comp"),axis=1).tail(50)