In [149]:
import numpy as np, regex, unicodedata

def uniprop_nr(s):
    UNIPROP_HASH = {'Ll': 0, 'Lu': 1, 'Nd': 2, 'Pc':3, 'Pd': 4, 'Ps': 5, 'Pe': 6, 'Pi': 7, 'Pf': 8, 'Po': 9, 'Zs': 10, 'Cc': 11, 'Sm': 12, 'Sc': 13}
    if s in UNIPROP_HASH:
        return UNIPROP_HASH[s]
    else:
        return 14

BOUNDARY_SYMBOL = '¤'

def training_vectorise(doc,ww,boundary_symbol):    
    X = []
    y = []
    
    ws = regex.finditer('(\s)',doc)
    previous_boundary = -1
    
    for i in ws:
        c = i.start()
        if c < ww + 2 or c > len(doc) - ww - 3:
            continue

        if unicodedata.category(doc[c-1])[0] == 'L' and unicodedata.category(doc[c+1])[0] == 'L' and doc[c] == ' ':
            continue

        boundary = doc[c-1] == boundary_symbol
        l = list()
        left_window = doc[c-ww-2:c].replace(boundary_symbol,'')
        right_window = doc[c:c+ww+3].replace(boundary_symbol,'')

        for j in range(1,ww+1):
            l.append(ord(left_window[-j]))
            l.append(uniprop_nr(unicodedata.category(left_window[-j])))
            l.append(ord(right_window[j]))
            l.append(uniprop_nr(unicodedata.category(right_window[j])))
        
        l.append(ord(doc[c]))
        l.append(c - previous_boundary - boundary)
        X.append(l)
        y.append(int(boundary))
        
        if boundary:
            previous_boundary = c
    
    return np.array(X), np.array(y)

In [150]:
def insert_sentence_boundaries(doc,model,boundary_symbol = '¤'):
    ww = (model.n_features_ - 2) // 4
    out_doc = list()

    ws = regex.finditer('(\s)',doc)
    previous_boundary = 0

    if boundary_symbol in doc:
        doc.replace(boundary_symbol,'')

    for i in ws:
        c = i.start()
        if c < ww or c > len(doc) - ww - 1:
            continue
        if unicodedata.category(doc[c-1])[0] == 'L' and unicodedata.category(doc[c+1])[0] == 'L' and doc[c] == ' ':
            continue

        l = list()
        
        left_window = doc[c-ww:c]
        right_window = doc[c:c+ww+1]

        for j in range(1,ww+1):
            l.append(ord(left_window[-j]))
            l.append(uniprop_nr(unicodedata.category(left_window[-j])))
            l.append(ord(right_window[j]))
            l.append(uniprop_nr(unicodedata.category(right_window[j])))
        
        l.append(ord(doc[c]))
        l.append(c - previous_boundary)

        y = model.predict([l])
        
        if y[0] == 1:
            out_doc.append(doc[previous_boundary:c])
            previous_boundary = c
    
    out_doc.append(doc[previous_boundary:])
    
    return boundary_symbol.join(out_doc)

In [414]:
import os
import time

def verify_segmented_documents(model = None, docs = None, directory = 'sentence segmenter train/segmented/', boundary_symbol = '¤', ww = 0, n_estimators = 100,weights=None,max_depth=None,learning_rate=1,classifier='rf'):
    start = time.time()
    if docs is None:
        docs = os.listdir(directory)
    
    if model:
        if ww == 0:
            ww = (model.n_features_ - 2) // 4
        nomodel = False
    else:
        if ww == 0:
            ww = 4
        nomodel = True

    n = 0
    false_positives = 0
    false_negatives = 0
    true_positives = 0
    
    for d in docs:
        with open(directory+d,encoding='utf8') as f:
            doc = f.read()

        ws = regex.finditer('(\s)',doc)
        previous_boundary = 0
        diffs = 0
        out_doc = list()
        
        if nomodel:
            model = train_with_files(ww,boundary_symbol, [doc for doc in docs if doc != d], directory, n_estimators = n_estimators,weights=weights,max_depth=max_depth,learning_rate = learning_rate, classifier=classifier) 

        for i in ws:
            c = i.start()
            if c < ww + 2 or c > len(doc) - ww - 3:
                continue

            if c < ww or c > len(doc) - ww - 1:
                continue
            if unicodedata.category(doc[c-1])[0] == 'L' and unicodedata.category(doc[c+1])[0] == 'L' and doc[c] == ' ':
                continue

            n += 1
            boundary = doc[c-1] == boundary_symbol
            l = list()
            left_window = doc[c-ww-2:c].replace(boundary_symbol,'')
            right_window = doc[c:c+ww+3].replace(boundary_symbol,'')

            for j in range(1,ww+1):
                l.append(ord(left_window[-j]))
                l.append(uniprop_nr(unicodedata.category(left_window[-j])))
                l.append(ord(right_window[j]))
                l.append(uniprop_nr(unicodedata.category(right_window[j])))

            l.append(ord(doc[c]))
            l.append(c - previous_boundary - boundary)
            
            y = model.predict([l])

            if y and boundary:
                out_doc.append(doc[previous_boundary:c])
                previous_boundary = c
                true_positives += 1
            
            elif not y and boundary:
                out_doc.append(doc[previous_boundary:c] + '×')
                previous_boundary = c
                diffs += 1
                false_negatives += 1
            
            elif y and not boundary:
                out_doc.append(doc[previous_boundary:c] + '×√')
                previous_boundary = c
                diffs += 1
                false_positives += 1
            
#        if diffs:
#            out_doc.append(doc[previous_boundary:])
#            with open(d.replace('.txt','.diff'),'w',encoding='utf8') as f:
#                f.write(''.join(out_doc))
#        print(diffs, 'diffs in', d)
    completion_time = time.time()-start
    print('completed in',completion_time)
    print('n =',n)
    print('False negatives:', false_negatives)
    print('False positives:', false_positives)
    print('True positives:', true_positives)
    precision = true_positives / (true_positives + false_positives)
    print('Precision:', precision)
    recall = true_positives / (true_positives + false_negatives)
    print('Recall:', recall)
    f1 = 2 * precision * recall / (precision + recall)
    print('F1:', f1)
    return {'time': completion_time, 'n': n, 'false_negatives': false_negatives, 'false_positives': false_positives, 'true_positives': true_positives, 'precision': precision, 'recall': recall, 'f1': f1}

In [402]:
def segment_files(model, docs = None, directory = 'sentence segmenter train/', target_directory = 'sentence segmenter train/segmented/', boundary_symbol = '¤', verbose=True):
    if docs is None:
        docs = os.listdir(directory)
    for doc in docs:
        if verbose:
            print('Segmenting',directory+doc)
        with open(directory+doc,encoding='utf8') as f:
            txt = f.read()
        with open(target_directory+doc.replace('.xml.','.dek.'),'w',encoding='utf8') as f:
            f.write(insert_sentence_boundaries(txt,model,boundary_symbol = '¤'))

In [369]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import os

def train_with_files(ww = 4,boundary_symbol = '¤', docs = None, directory = 'sentence segmenter train/segmented/', model = None, n_estimators = 100,weights=None,max_depth=None,learning_rate=1,classifier='rf'):
    if model is None:
        if classifier == 'rf':
            model = RandomForestClassifier(n_estimators,oob_score=False,class_weight=weights,max_depth=max_depth)
        elif classifier == 'ada':
            dt = DecisionTreeClassifier(max_depth=max_depth)
            model = AdaBoostClassifier(
                base_estimator=dt,
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                algorithm="SAMME")
        elif classifier == 'gb':
            model = GradientBoostingClassifier(n_estimators=n_estimators,learning_rate=learning_rate)

    if docs is None:
        docs = os.listdir(directory)
    Xs = list()
    ys = list()
    for doc in docs:
        with open(directory+doc,encoding='utf8') as f:
            X, y = training_vectorise(f.read(),ww,boundary_symbol)
        Xs.append(X)
        ys.append(y)
    X_all = np.concatenate(Xs)
    y_all = np.concatenate(ys)
    model.fit(X_all,y_all)
    return model

In [None]:
imp = m.feature_importances_
print("Feature importances:\n")
for i in range((len(imp)-2)//4):
    print("Code of left character no. %d: %1.4f, its class: %1.4f"%(i+1, imp[4*i], imp[4*i+1]))
    print("Code of left character no. %d.: %1.4f, its class: %1.4f\n"%(i+1, imp[4*i+2], imp[4*i+3]))
print("Center whitespace character: %1.4f"%imp[-2])
print("Distance from previous boundary: %1.4f"%imp[-1])

In [5]:
import pandas
fajlok = pandas.read_csv('keep_files.csv')
fajlok_np = fajlok['file_name'].to_numpy()
np.random.shuffle(fajlok_np)
fajlok_np[:10]

array(['L_2016230HU.01000501.xml.txt', 'C_2008014HU.01003801.xml.txt',
       'CE2007064HU.01010001.xml.txt', 'L_2014178HU.01001801.xml.txt',
       'C_2006224HU.01001002.xml.txt', 'C_2012049HU.01001802.xml.txt',
       'CE2010016HU.01006101.xml.txt', 'L_2010077HU.01001701.xml.txt',
       'L_2007271HU.01001301.xml.txt', 'L_2004359HU.01002902.xml.txt'],
      dtype=object)

In [60]:
fajlok_np[11:20]

array(['L_2014189HU.01009301.xml.txt', 'C_2019206HU.01009501.xml.txt',
       'C_2007170HU.01002701.xml.txt', 'C_2013352HU.01000301.xml.txt',
       'C_2010161HU.01004501.xml.txt', 'C_2007236HU.01001001.xml.txt',
       'L_2007122HU.01003101.xml.txt', 'C_2015389HU.01002101.xml.txt',
       'L_2018089HU.01002003.xml.txt'], dtype=object)

In [62]:
with open('L_2014189HU.01009301.xml.txt',encoding='utf8') as f:
    out = insert_sentence_boundaries(f.read(),modell,'¤')
with open('L_2014189HU.01009301.dek.txt','w',encoding='utf8') as f:
    f.write(out)

In [58]:
with open('L_2014189HU.01009301.dek.txt',encoding='utf8') as f:
    X_uj, y_uj = training_vectorise(f.read(),4,'¤')
X = np.concatenate((X,X_uj))
y = np.concatenate((y,y_uj))
modell.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [125]:
m = train_with_files(6)

In [126]:
m

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [127]:
verify_segmented_documents(m)

0 diffs in CE2007064HU.01010001.dek.txt
0 diffs in CE2010016HU.01006101.dek.txt
0 diffs in CELEX_31999R1655_hu_dek.txt
0 diffs in C_2006224HU.01001002.dek.txt
0 diffs in C_2008014HU.01003801.dek.txt
0 diffs in C_2008316HU.01000301.dek.txt
0 diffs in C_2012049HU.01001802.dek.txt
0 diffs in L_2004359HU.01002902.dek.txt
0 diffs in L_2007271HU.01001301.dek.txt
0 diffs in L_2010077HU.01001701.dek.txt
0 diffs in L_2014178HU.01001801.dek.txt
0 diffs in L_2014189HU.01009301.dek.txt
completed in 23.47880244255066


In [183]:
verify_segmented_documents(docs=[f for f in os.listdir('sentence segmenter train/segmented/') if 'HU' in f], ww=6)

0 diffs in CE2007064HU.01010001.dek.txt
8 diffs in CE2010016HU.01006101.dek.txt
0 diffs in CE2013332HU.01017701.dek.txt
2 diffs in C_2006126HU.01001501.dek.txt
0 diffs in C_2006131HU.01005002.dek.txt
1 diffs in C_2006224HU.01001002.dek.txt
0 diffs in C_2007170HU.01002701.dek.txt
4 diffs in C_2007236HU.01001001.dek.txt
0 diffs in C_2007311HU.01000201.dek.txt
0 diffs in C_2008014HU.01003801.dek.txt
0 diffs in C_2008064HU.01002901.dek.txt
11 diffs in C_2008316HU.01000301.dek.txt
0 diffs in C_2009032HU.01002303.dek.txt
2 diffs in C_2010161HU.01004501.dek.txt
0 diffs in C_2012049HU.01001802.dek.txt
2 diffs in C_2013331HU.01019801.dek.txt
0 diffs in C_2013352HU.01000301.dek.txt
0 diffs in C_2015389HU.01002101.dek.txt
289 diffs in C_2018461HU.01012501.dek.txt
0 diffs in C_2019206HU.01009501.dek.txt
0 diffs in L_2004359HU.01002902.dek.txt
5 diffs in L_2006044HU.01000601.dek.txt
0 diffs in L_2007122HU.01003101.dek.txt
0 diffs in L_2007271HU.01001301.dek.txt
0 diffs in L_2010077HU.01001701.dek.t

In [130]:
fajlok_np[10:20]

array(['C_2009032HU.01002303.xml.txt', 'L_2014189HU.01009301.xml.txt',
       'C_2019206HU.01009501.xml.txt', 'C_2007170HU.01002701.xml.txt',
       'C_2013352HU.01000301.xml.txt', 'C_2010161HU.01004501.xml.txt',
       'C_2007236HU.01001001.xml.txt', 'L_2007122HU.01003101.xml.txt',
       'C_2015389HU.01002101.xml.txt', 'L_2018089HU.01002003.xml.txt'],
      dtype=object)

In [132]:
fa = regex.findall('_(\d{4})',fajlok_np[10])

In [136]:
import shutil

In [407]:
jox_root = '/your_jox_root/'
for fn in fajlok_np[60:70]:
    yr = fn[2:6]
    shutil.copy(jox_root+'JOx_FMX_HU_yyyy/txt/'.replace('yyyy',yr)+fn,'sentence segmenter train')
    shutil.copy(jox_root+'JOx_FMX_EN_yyyy/txt/'.replace('yyyy',yr)+fn.replace('HU','EN'),'sentence segmenter train')

In [396]:
fns = [jox_root+'JOx_FMX_EN_yyyy/txt/'.replace('yyyy',fn[2:6])+fn.replace('HU','EN') for fn in fajlok_np[50:60]]

In [144]:
files = os.listdir('sentence segmenter train')
segment_files(m,[f for f in files if 'EN' in f])

In [410]:
en_model = train_with_files(5, docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f], directory = 'sentence segmenter train/segmented/',n_estimators=1000,weights={0:1,1:100})
segment_files(en_model,[f.replace('HU','EN') for f in fajlok_np[60:70]],target_directory='sentence segmenter train/new/')

Segmenting sentence segmenter train/C_2008316EN.01000301.xml.txt


In [409]:
hu_model = train_with_files(5, docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'HU' in f], directory = 'sentence segmenter train/segmented/',n_estimators=1000,weights={0:1,1:100})
segment_files(hu_model,fajlok_np[60:70],target_directory='sentence segmenter train/new/')

Segmenting sentence segmenter train/L_2012321HU.01004201.xml.txt
Segmenting sentence segmenter train/C_2007042HU.01004002.xml.txt
Segmenting sentence segmenter train/C_2014274HU.01000402.xml.txt
Segmenting sentence segmenter train/C_2009069HU.01001201.xml.txt
Segmenting sentence segmenter train/L_2011176HU.01003701.xml.txt
Segmenting sentence segmenter train/L_2011200HU.01001301.xml.txt
Segmenting sentence segmenter train/L_2016038HU.01000301.xml.txt
Segmenting sentence segmenter train/C_2014461HU.01000101.xml.txt
Segmenting sentence segmenter train/C_2007176HU.01000801.xml.txt
Segmenting sentence segmenter train/L_2005344HU.01004001.xml.txt


In [415]:
hu_model = train_with_files(6, docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'HU' in f], directory = 'sentence segmenter train/segmented/')
verify_segmented_documents(hu_model,docs=[f for f in os.listdir('sentence segmenter train/segmented/') if 'HU' in f])

completed in 359.46383571624756
n = 41923
False negatives: 3
False positives: 2
True positives: 8814
Precision: 0.9997731397459165
Recall: 0.9996597482136781
F1: 0.9997164407644757


{'time': 359.46383571624756,
 'n': 41923,
 'false_negatives': 3,
 'false_positives': 2,
 'true_positives': 8814,
 'precision': 0.9997731397459165,
 'recall': 0.9996597482136781,
 'f1': 0.9997164407644757}

In [416]:
en_model = train_with_files(5, docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f], directory = 'sentence segmenter train/segmented/')
verify_segmented_documents(en_model,docs=[f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f])

completed in 327.8741145133972
n = 38541
False negatives: 0
False positives: 0
True positives: 8759
Precision: 1.0
Recall: 1.0
F1: 1.0


{'time': 327.8741145133972,
 'n': 38541,
 'false_negatives': 0,
 'false_positives': 0,
 'true_positives': 8759,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0}

In [223]:
import re
dir_pattern = jox_root+'JOx_FMX_xx_yyyy/txt/'
#fd = 'sentence segmenter train/'
for yr in range(2004,2020):
    for lang in ['EN','HU']:
        fd = dir_pattern.replace('xx',lang).replace('yyyy',str(yr))
        fl = [f for f in os.listdir(fd) if '.txt' in f]
        for fn in fl:
            with open(fd+fn,encoding='utf8') as f:
                txt = f.read()
            txt = re.sub('^\n','',txt)
            txt = re.sub(' \n','\n',txt)
            txt = re.sub('\n(\| ?)+¤? ?\n','\n',txt)
            with open(fd+fn,'w',encoding='utf8') as f:
                f.write(txt)

In [273]:
en_model = train_with_files(10, docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f], directory = 'sentence segmenter train/segmented/')

In [274]:
en_model.oob_score_

0.9982112486071198

In [321]:
for ww in range(4,7):
    for l in ['EN','HU']:
        for i in [3,10,30,100,300,1000]:
            print('Starting validation for %s with positive weight = %d and ww = %d'%(l,i,ww))
            verify_segmented_documents(docs=[f for f in os.listdir('sentence segmenter train/segmented/') if l in f][:30],weights={0:1,1:i})
            print()

Starting validation for EN with positive weight = 3 and ww = 4
completed in 49.51448106765747
n = 6244
False negatives: 25
False positives: 7
True positives: 1381
Precision: 0.9949567723342939
Recall: 0.9822190611664295
F1: 0.9885468861846813

Starting validation for EN with positive weight = 10 and ww = 4
completed in 48.68150019645691
n = 6244
False negatives: 26
False positives: 5
True positives: 1380
Precision: 0.9963898916967509
Recall: 0.9815078236130867
F1: 0.9888928699390899

Starting validation for EN with positive weight = 30 and ww = 4
completed in 49.157055377960205
n = 6244
False negatives: 26
False positives: 6
True positives: 1380
Precision: 0.9956709956709957
Recall: 0.9815078236130867
F1: 0.9885386819484241

Starting validation for EN with positive weight = 100 and ww = 4
completed in 49.97064423561096
n = 6244
False negatives: 27
False positives: 6
True positives: 1379
Precision: 0.9956678700361011
Recall: 0.980796586059744
F1: 0.9881762809029022

Starting validation 

completed in 58.73649573326111
n = 7209
False negatives: 64
False positives: 5
True positives: 1574
Precision: 0.9968334388853705
Recall: 0.960927960927961
F1: 0.9785514454460678

Starting validation for HU with positive weight = 1000 and ww = 6
completed in 59.65037989616394
n = 7209
False negatives: 75
False positives: 5
True positives: 1563
Precision: 0.9968112244897959
Recall: 0.9542124542124543
F1: 0.9750467872738615



Best / worst ww = 4 EN // HU:
0.9888928699390899 w = 10 / 0.9874506991753317 w = 300 // 0.9798199316982302 w = 100 / 0.9777089783281734 w = 3

Best / worst ww = 5 EN // HU:
0.989624329159213 w = 100 / 0.9878223495702005 w = 300 // 0.9801365611421478 w = 100 / 0.9760199314855186 w = 1000

Best / worst ww = 6 EN // HU:
0.9892550143266475 w = 3 / 0.987106017191977 w = 300 // 0.9795158286778398 w = 30 / 0.9750467872738615 w = 1000

In [384]:
verify_segmented_documents(ww=5,weights={0:1,1:100},docs=[f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f])

completed in 545.8137211799622
n = 34141
False negatives: 241
False positives: 36
True positives: 7530
Precision: 0.9952418715305313
Recall: 0.9689872603268562
F1: 0.9819391015192018


In [381]:
verify_segmented_documents(docs=[f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f],classifier = 'ada',n_estimators=50,learning_rate=1)

completed in 568.4180021286011
n = 34162
False negatives: 143
False positives: 83
True positives: 7632
Precision: 0.9892417368762152
Recall: 0.9816077170418006
F1: 0.9854099418979987


In [380]:
verify_segmented_documents(docs=[f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f],classifier = 'gb',n_estimators=100,learning_rate=1.0)

completed in 431.8739969730377
n = 34162
False negatives: 84
False positives: 197
True positives: 7691
Precision: 0.975025354969574
Recall: 0.9891961414790997
F1: 0.9820596309774628


In [390]:
results = list()
for l in ['EN','HU']:
    for c in ['rf','ada','gb']:
        print('Starting validation for %s with c = %s'%(l,c))
        results.append(verify_segmented_documents(docs=[f for f in os.listdir('sentence segmenter train/segmented/') if l in f],classifier=c,n_estimators=100,weights={0:1,1:100}))
        print()

Starting validation for EN with c = rf
completed in 529.7000639438629
n = 34162
False negatives: 160
False positives: 43
True positives: 7615
Precision: 0.9943849569078088
Recall: 0.9794212218649517
F1: 0.9868463681720988

Starting validation for EN with c = ada
completed in 971.1325669288635
n = 34162
False negatives: 143
False positives: 83
True positives: 7632
Precision: 0.9892417368762152
Recall: 0.9816077170418006
F1: 0.9854099418979987

Starting validation for EN with c = gb
completed in 454.18780303001404
n = 34162
False negatives: 84
False positives: 198
True positives: 7691
Precision: 0.9749017619470148
Recall: 0.9891961414790997
F1: 0.981996935648621

Starting validation for HU with c = rf
completed in 655.3188421726227
n = 38169
False negatives: 195
False positives: 47
True positives: 7848
Precision: 0.9940468651044965
Recall: 0.9757553151809026
F1: 0.9848161626301919

Starting validation for HU with c = ada
completed in 1182.864158153534
n = 38169
False negatives: 115
False

In [282]:
en_model = train_with_files(docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f], directory = 'sentence segmenter train/segmented/')

In [392]:
34162 - 7775

26387

In [394]:
38169 - 8043

30126

In [393]:
7935+108

8043

In [360]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

0.913

In [389]:
len([f for f in os.listdir('sentence segmenter train/segmented/') if 'HU' in f])

61

In [395]:
m = 

In [404]:
for c in ['rf','ada','gb']:
    for n in [30,100,300,1000]:
        print('Training', c, 'with %d estimators'%n)
        start = time.time()
        model = train_with_files(docs = [f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f], directory = 'sentence segmenter train/segmented/',n_estimators=n,classifier=c)
        print("Training time:",time.time()-start,'s')
        start = time.time()
        segment_files(model, docs = [fn.replace('HU','EN') for fn in fajlok_np[100:300]], directory = 'sentence segmenter train/temp/', target_directory = 'sentence segmenter train/temp/out', boundary_symbol = '¤',verbose=False)
        print("Prediction time:",time.time()-start,'s')
        print()

Training rf with 30 estimators
Training time: 1.6629507541656494 s
Prediction time: 171.00018453598022 s

Training rf with 100 estimators
Training time: 2.551316499710083 s
Prediction time: 171.3777096271515 s

Training rf with 300 estimators
Training time: 6.841279983520508 s
Prediction time: 167.26971769332886 s

Training rf with 1000 estimators
Training time: 18.023351669311523 s
Prediction time: 167.27076649665833 s

Training ada with 30 estimators
Training time: 2.918445587158203 s
Prediction time: 166.68954014778137 s

Training ada with 100 estimators
Training time: 7.844264984130859 s
Prediction time: 167.00993633270264 s

Training ada with 300 estimators
Training time: 21.89357089996338 s
Prediction time: 167.16367769241333 s

Training ada with 1000 estimators
Training time: 70.78373003005981 s
Prediction time: 166.40626192092896 s

Training gb with 30 estimators
Training time: 1.615180492401123 s
Prediction time: 168.82127571105957 s

Training gb with 100 estimators
Training t

In [424]:
en_files = [f for f in os.listdir('sentence segmenter train/segmented/') if 'EN' in f][0:71]
np.random.shuffle(en_files)
results = list()
for r in range(1,8):
    print('Starting validation with %d documents'%(r*10))
    results.append(verify_segmented_documents(docs=en_files[1:r*10],weights={0:1,1:100}))
    print()

Starting validation with 10 documents
completed in 30.448218822479248
n = 2274
False negatives: 38
False positives: 3
True positives: 513
Precision: 0.9941860465116279
Recall: 0.9310344827586207
F1: 0.9615745079662605

Starting validation with 20 documents
completed in 77.67113208770752
n = 5140
False negatives: 30
False positives: 13
True positives: 1126
Precision: 0.9885864793678666
Recall: 0.9740484429065744
F1: 0.9812636165577342

Starting validation with 30 documents
completed in 182.41609692573547
n = 11064
False negatives: 44
False positives: 9
True positives: 2299
Precision: 0.9961005199306759
Recall: 0.9812206572769953
F1: 0.9886046011610407

Starting validation with 40 documents
completed in 277.817670583725
n = 16125
False negatives: 43
False positives: 9
True positives: 3511
Precision: 0.9974431818181818
Recall: 0.9879009566685425
F1: 0.9926491376873056

Starting validation with 50 documents
completed in 331.4863979816437
n = 19335
False negatives: 48
False positives: 9
Tru