In [None]:
from multiprocessing import Pool

In [None]:
# Root directory of the local OPUS OpenSubtitles data. The cells below read the
# '<la>-<lb>.txt/OpenSubtitles.<la>-<lb>.*' bitext files and the per-language
# '<tl>/OpenSubtitles/xml/' trees from underneath it.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
source_dir = '/media/xxxx/LaCie/data/opus_2018/'
import ast
import os
import re
from collections import Counter, defaultdict, deque
from datetime import datetime
from itertools import chain

import pandas as pd
# Target languages that get paired with English: two-letter code -> language name
# in the form it is compared against the metadata 'languages' column further down.
tls = {'el' : 'Greek', 'es' : 'Spanish', 'tr' : 'Turkish', 'fi' : 'Finnish', 'ar' : 'Arabic'}

In [None]:
def tail(filename, n=10):
    """Return the last n lines of a file.

    The file is read front to back and only the final `n` lines are kept, so
    memory use is bounded by `n` rather than by the file size.

    Parameters:
        filename: path of the text file to read.
        n: maximum number of trailing lines to keep (default 10).

    Returns:
        A deque (maxlen `n`) of the last lines, trailing newlines included.
    """
    # The context manager closes the handle as soon as the deque is filled;
    # previously the file object was left for the garbage collector.
    with open(filename) as fh:
        return deque(fh, n)
        
def get_metadata(fn):
    """Extract the original-language and genre entries from a subtitle XML file.

    Only the last 200000 lines of `fn` are inspected. A line starting (after
    optional whitespace) with `<orig` provides the languages: its tags are
    stripped and the remainder is split on ', '. A line starting with `<genre`
    provides the genres: tags are stripped, surrounding brackets/quotes removed,
    and the comma-separated pieces are collected into a set.

    Scanning stops once both entries have been found.

    Returns:
        (language, genre) -- a list and a set; `[]` / `set()` when the
        corresponding line was not encountered.
    """
    found_language, found_genre = [], set()

    for raw in tail(fn, 200000):
        if re.match(r'\s*\<orig', raw) is not None:
            found_language = re.sub(r'\<.*?\>', '', raw.strip()).split(', ')
            if found_genre != set():
                break
        if re.match(r'\s*<genre', raw) is not None:
            untagged = re.sub(r'\<.*?\>', '', raw.strip())
            found_genre = {piece.strip() for piece in untagged.strip('[]\'').split(',')}
            if found_language != []:
                break
    return found_language, found_genre

## get all bitext mappings

In [None]:
# For every target language, read the OpenSubtitles '.ids' alignment file and
# build two dictionaries from its first two tab-separated columns:
#   la2lb: first-column entry  -> second-column entry
#   lb2la: second-column entry -> first-column entry
# where (la, lb) is the alphabetically sorted pair of the target language and 'en'.
# Only the first occurrence of each first-column entry is recorded.
movie_mappings = {}
for tl in tls:
    print(tl)
    la, lb = sorted([tl, 'en'])
    la2lb, lb2la = {}, {}
    ids_path = '%s/%s-%s.txt/OpenSubtitles.%s-%s.ids' % (source_dir, la, lb, la, lb)
    try:
        fh = open(ids_path)
    except OSError as err:
        # A language pair whose ids file cannot be opened is skipped (as before),
        # but the reason is now reported instead of being silently swallowed.
        print('\tskipping %s: cannot open %s (%s)' % (tl, ids_path, err))
        continue
    with fh:  # close the ids file once it has been consumed
        for idx in fh:
            ea, eb = idx.strip('\n').split('\t')[:2]
            if ea not in la2lb:
                la2lb[ea] = eb
                lb2la[eb] = ea
    movie_mappings[tl] = (la2lb, lb2la)

## get metadata

In [None]:
# Collect (languages, genres) for every target-language file that occurs in the
# bitext mapping and store one spreadsheet per language in ./files/.
# Create the output folder first so the export cannot fail on a missing
# directory after the (slow) parallel metadata scan has already run.
os.makedirs('./files', exist_ok=True)
for tl in tls:
    metadir = source_dir + tl + '/OpenSubtitles/xml/'
    # Index 0 holds the la->lb dict, index 1 the lb->la dict; the comparison picks
    # the one keyed by the target language's file names. [:-3] drops the last
    # three characters of each name (the '.gz' the later cells append again).
    files = [f[:-3] for f in movie_mappings[tl][int(tl > 'en')]]
    print(tl, len(files), datetime.now())
    with Pool(12) as p:
        metadata = p.map(get_metadata, (metadir + fn for fn in files))
    rows = []
    for fn, (l, g) in zip(files, metadata):
        parts = fn.split('/')  # parts[0] is skipped; then year / movie / version
        rows.append({'year': parts[1], 'movie': parts[2], 'version': parts[3],
                     'languages': l, 'genres': g})
    pd.DataFrame(rows).to_excel('./files/%s_metadata.xlsx' % tl)

In [None]:
# Keep only the target languages for which at least 12 files are listed with
# that language as their sole 'languages' entry.
freq_tls = {}
for tl, tlx in tls.items():
    print(tl, tlx)
    df = pd.read_excel('./files/%s_metadata.xlsx' % tl)
    # The spreadsheet stores the language lists as text, e.g. "['Greek']".
    tl_label = "['%s']" % tlx
    keep_row = df.languages.isin({tl_label, "['English']"})
    dfx = df[keep_row]
    print(Counter(dfx['languages']))
    n_tl_originals = (dfx['languages'] == tl_label).sum()
    if n_tl_originals >= 12:
        freq_tls[tl] = tlx
print(freq_tls)
    

## determine genre-matched pairings

In [None]:
def _has_numeric_year(year):
    'Return True when `year` is an int or an all-numeric string (same test as the English-side filter).'
    return isinstance(year, int) or year.isnumeric()


# For each frequent target language: pair every target-language-original movie
# with the English-original movie whose genre set is most similar (an identical
# set wins immediately, otherwise the highest Jaccard overlap), visiting the
# English candidates in order of increasing distance in release year.
for tl, tlx in freq_tls.items():
    dfx = pd.read_excel('./files/%s_metadata.xlsx' % tl)
    # File names (with '.gz') on the target-language side of the ids mapping.
    tl_files = movie_mappings[tl][int(tl > 'en')]

    # (year, movie, version) -> genre set, split by the recorded language.
    # The genres column holds the text repr of a Python set; literal_eval parses
    # it without executing arbitrary spreadsheet content the way eval() would.
    items_tl, items_en = {}, {}
    for _, r in dfx.iterrows():
        if r.languages == "['%s']" % tlx:
            bucket = items_tl
        elif r.languages == "['English']":
            bucket = items_en
        else:
            continue
        bucket[(r.year, r.movie, r.version)] = ast.literal_eval(r.genres)
    print(tl, len(items_tl))

    # English candidates usable for matching: numeric year and present in the
    # bitext mapping. This does not depend on the target movie, so compute it once.
    en_candidates = [(key, gg) for key, gg in items_en.items()
                     if _has_numeric_year(key[0])
                     and '%s/%s/%s/%s.gz' % ((tl,) + key) in tl_files]

    builder, seen = [], set()
    for (y, f, v), g in sorted(items_tl.items()):
        if '%s/%s/%s/%s.gz' % (tl, y, f, v) not in tl_files: continue
        if f in seen: continue  # only the first version of each movie is used
        seen.add(f)
        print(y, f, v, g)
        if not _has_numeric_year(y):
            # int(y) below would raise; the English side is filtered the same way.
            print('\tskipped: non-numeric year')
            continue

        best_match_score, builder_it = 0, None
        for (yy, ff, vv), gg in sorted(en_candidates, key=lambda c: abs(int(c[0][0]) - int(y))):
            print('\t', yy, ff, vv, gg)
            if g == gg:
                print('\t\tmax found')
                builder_it = {'tl_y' : y, 'tl_f':f, 'tl_v':v, 'tl_g' : g, 'en_y':yy, 'en_f':ff, 'en_v':vv, 'en_g':gg}
                break
            # g != gg here, so the union is non-empty and the division is safe.
            score = len(g & gg) / len(g | gg)
            if score > best_match_score:
                best_match_score = score
                print('\t\tbetter match found: %.2f' % best_match_score)
                builder_it = {'tl_y' : y, 'tl_f':f, 'tl_v':v, 'tl_g':g, 'en_y':yy, 'en_f':ff, 'en_v':vv, 'en_g':gg}
            # Candidates are sorted by year distance: stop after the first one
            # that lies more than 10 years away (it is still scored above).
            if abs(int(yy) - int(y)) > 10: break
        if builder_it is not None:
            builder.append(builder_it)
    pd.DataFrame(builder).to_excel('./files/%s_matched_files.xlsx' % tl)

## write bitexts

In [None]:
# Write, per frequent target language, the aligned sentence pairs that belong to
# one of the matched movies as '<target text> ||| <english text>' lines, plus a
# spreadsheet recording the ids-file columns of every kept pair and whether the
# movie was matched as a target-language original ('orig') or as an English
# original ('translated').
os.makedirs('./generated/opus_bitexts', exist_ok=True)
for tl in freq_tls:
    la, lb = sorted([tl, 'en'])
    metadata = []
    #
    dfx = pd.read_excel('./files/%s_matched_files.xlsx' % tl)
    items_tl = {(str(r.tl_y), str(r.tl_f), r.tl_v) for i, r in dfx.iterrows()}
    items_en = {(str(r.en_y), str(r.en_f), r.en_v) for i, r in dfx.iterrows()}
    #
    bitext_prefix = '%s/%s-%s.txt/OpenSubtitles.%s-%s' % (source_dir, la, lb, la, lb)
    # Inputs are opened before the output so a missing corpus file does not leave
    # a freshly truncated bitext behind; every handle is closed on exit.
    with open(bitext_prefix + '.ids', encoding='utf-8') as f_ids, \
         open(bitext_prefix + '.' + tl, encoding='utf-8') as f_tl, \
         open(bitext_prefix + '.en', encoding='utf-8') as f_en, \
         open('./generated/opus_bitexts/%s_bitext.txt' % tl, 'w', encoding='utf-8') as fout:
        for idx, ra, rb in zip(f_ids, f_tl, f_en):
            fields = idx.strip('\n').split('\t')
            e1, e2 = fields[:2]
            # The ids columns follow the sorted language pair; make e1 the
            # target-language file.
            if tl > 'en': e1, e2 = e2, e1
            # Drop the first three and last three characters of the file name
            # (e.g. a two-letter language prefix plus '/', and '.gz') to obtain
            # the (year, movie, version) key.
            key = tuple(e1[3:-3].split('/'))
            if key in items_tl:
                orig = 'orig'
            elif key in items_en:
                orig = 'translated'
            else:
                continue
            metadata.append(dict(zip(['la', 'lb', 'ra', 'rb'], fields)) | {'original' : orig})
            fout.write(ra.strip('\n') + ' ||| ' + rb.strip('\n') + '\n')
            if len(metadata) % 2500 == 0: print(tl, len(metadata), e1, datetime.now())
    #
    pd.DataFrame(metadata).to_excel('./generated/opus_bitexts/%s_bitext_metadata.xlsx' % tl)
    print(Counter([m['original'] for m in metadata]))

## Annotate the English side with spaCy

In [None]:
import spacy
# Load the small English spaCy pipeline; the next cell runs it over the English
# half of every bitext line.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Re-write each bitext with the English side replaced by its spaCy analysis.
# Every English token becomes 'text/index/lemma/pos/tag/dep/head_index'; any '/'
# or ' ' inside a field is replaced by '#' so the two separators stay unambiguous.
for lg in freq_tls:
    new_bit = []
    # Stream the bitext line by line (no readlines()) and close it when done.
    with open('./generated/opus_bitexts/%s_bitext.txt' % lg, encoding='utf-8') as fin:
        for li, l in enumerate(fin):
            sides = l.strip('\n').split(' ||| ')  # [target text, English text]
            spacied = ['/'.join(str(e).replace('/', '#').replace(' ', '#')
                                for e in (w.text, w.i, w.lemma_, w.pos_, w.tag_, w.dep_, w.head.i))
                       for w in nlp(sides[1])]
            new_bit.append((sides[0], ' '.join(spacied)))
            if li % 10000 == 0: print(lg, li, new_bit[-1])
    with open('./generated/opus_bitexts/%s.spc' % lg, 'w', encoding='utf-8') as fout:
        fout.write('\n'.join('%s ||| %s' % ln for ln in new_bit))