In [None]:
import os
import sys
import jsonlines

sys.path.append('../')

from tqdm import tqdm
from glob import glob
from multiprocessing import Pool
from bibermda.tagger import load_config, load_pipeline, calculate_tag_frequencies
from bibermda.reducer import encode_text
from bibermda.tagger.tagger_utils import load_tokenizer

## Partition the file into 100 parts

In [None]:
# Shell escape: `wc -l` prints the number of lines in the corpus file.
!wc -l /shared/3/projects/hiatus/reddit/corpus.jsonl

In [None]:
def partition_file(input_file, output_directory, chunks=100):
    """Split a JSON Lines file into at most ``chunks`` consecutive partitions.

    The input is streamed record by record. Every ``chunk_size`` records are
    handed to ``save_partition``; any remainder becomes one final, smaller
    partition. Partition indices run consecutively from 1 with no gaps.

    Args:
        input_file: Path of the JSON Lines file to split.
        output_directory: Destination passed through to ``save_partition``.
        chunks: Upper bound on the number of partitions to produce.
    """
    lines = count_lines(input_file)
    print("{} lines in file".format(lines))
    # Ceiling division, never below 1: rounding to nearest could yield a size
    # of 0 for small files, or leave a remainder that becomes partition
    # number chunks + 1.
    chunk_size = max(1, -(-lines // chunks))
    print("Chunk size: {} lines".format(chunk_size))
    count, chunk = 0, 1
    curr_lines = []

    with jsonlines.open(input_file) as reader:
        for obj in reader:
            curr_lines.append(obj)
            count += 1
            if count >= chunk_size:
                save_partition(curr_lines, output_directory, index=chunk)
                count, curr_lines = 0, []
                chunk += 1

    if curr_lines:
        # `chunk` was already advanced past the last full partition, so it is
        # the next free index; adding 1 again would skip a number.
        save_partition(curr_lines, output_directory, index=chunk)
    

def save_partition(json_lines, output_directory, index):
    """Write one partition's records to ``partition-<index>.jsonl``.

    Args:
        json_lines: Iterable of records to write, one per output line.
        output_directory: Directory that receives the file; created if missing.
        index: Partition number used in the file name.
    """
    if output_directory:
        # Create the target on demand so a fresh run does not fail on a
        # directory that was removed after a previous run.
        os.makedirs(output_directory, exist_ok=True)
    # Gives the same path as plain concatenation when the directory ends with
    # a separator, and a correct one when it does not.
    out = os.path.join(output_directory, "partition-{}.jsonl".format(index))
    print("Saving {}".format(out))
    with jsonlines.open(out, mode='w') as writer:
        writer.write_all(json_lines)

        
def count_lines(input_file):
    """Return the number of lines in ``input_file``.

    The file is opened in binary mode, so no text decoding takes place, and
    the handle is closed as soon as counting finishes.
    """
    with open(input_file, 'rb') as handle:
        return sum(1 for _ in handle)

In [None]:
# Source corpus and the directory that receives its partitions.
input_file = '/shared/3/projects/hiatus/reddit/corpus.jsonl'
output_directory = '/shared/3/projects/hiatus/partitions/reddit/'
partition_file(input_file=input_file, output_directory=output_directory, chunks=100)

## Tag each partition with 1 CPU 

Set a high nice value (i.e. low scheduling priority) so we don't hog the server

**Tagger config**

In [None]:
# Settings applied on top of the loaded configuration.
tagger_overrides = {
    'use_gpu': False,
    'biber': True,
    'binary_tags': True,
    'function_words': True,
    'token_normalization': 100,
}
config = load_config()
config.update(tagger_overrides)
# Module-level tokenizer; tag_partition looks it up when counting tokens.
tokenizer = load_tokenizer(use_gpu=False)
config

In [None]:
def tag_partition(config, input_file, output_file):
    """Tag every record of one partition and write the result to ``output_file``.

    Each record gets a ``num_tokens`` field (length of the module-level
    ``tokenizer`` output for ``fullText``). Records with at least 10 tokens
    also get an ``encodings`` field from ``encode_text``. Tagging is
    best-effort: a record that raises is still written with whatever fields
    were set before the error, but the failure is now counted and reported
    on stderr instead of vanishing silently.

    Args:
        config: Tagger configuration forwarded to ``encode_text``.
        input_file: Path of the JSON Lines partition to read.
        output_file: Path of the JSON Lines file to write.
    """
    print("Tagging file " + input_file)
    failures = 0
    with jsonlines.open(input_file) as reader:
        with jsonlines.open(output_file, mode='w') as writer:
            for obj in reader:
                try:
                    num_tokens = len(tokenizer(obj['fullText']))
                    obj['num_tokens'] = num_tokens
                    if num_tokens >= 10:
                        obj['encodings'] = encode_text(text=obj['fullText'], config=config)
                except Exception as err:
                    failures += 1
                    # Show only the first few errors so a systematic failure
                    # does not flood the output with one line per record.
                    if failures <= 5:
                        print("Failed to tag a record in {}: {!r}".format(input_file, err),
                              file=sys.stderr)
                writer.write(obj)
    if failures:
        print("{}: {} record(s) could not be tagged".format(input_file, failures),
              file=sys.stderr)

In [None]:
NICENESS = 20  # increment passed to os.nice() by the pool initializer
WORKERS = 100  # number of worker processes in the pool
process_args = []
# "*.jsonl" also matches "*-tagged.jsonl" files left by an earlier run; drop
# them so a re-run does not tag its own outputs into "...-tagged-tagged.jsonl".
partition_files = [
    fp for fp in glob(os.path.join(output_directory, "*.jsonl"))
    if not fp.endswith('-tagged.jsonl')
]

# One (config, input path, output path) job per partition.
for fp in partition_files:
    fname = os.path.basename(fp).replace('.jsonl', '') + '-tagged.jsonl'
    out = os.path.join(output_directory, fname)
    process_args.append((config, fp, out))

len(process_args)

In [None]:
def set_nicesness():
    """Pool initializer: raise the calling process's nice value by ``NICENESS``."""
    os.nice(NICENESS)

# Fan the tagging jobs out over the pool; each worker process runs
# set_nicesness once when it starts.
with Pool(processes=WORKERS, initializer=set_nicesness) as pool:
    pool.starmap(tag_partition, process_args)

## Join the partitioned files into a single file

In [None]:
def join_tagged_files(input_directory, output_file):
    """Concatenate every ``*-tagged.jsonl`` file in ``input_directory`` into ``output_file``.

    Files are merged in ascending order of the first number in their file
    name (``partition-2`` before ``partition-10``), so the merged record order
    is the same on every run instead of following glob's arbitrary order.

    Args:
        input_directory: Directory prefix (with trailing separator) to search.
        output_file: Path of the merged JSON Lines file to write.
    """
    import re  # local import: only needed for the sort key below

    def partition_order(path):
        # Order by the first integer in the file name; names without a number
        # sort last, and the path itself breaks any remaining ties.
        match = re.search(r'\d+', os.path.basename(path))
        return (match is None, int(match.group()) if match else 0, path)

    # An existing output file inside input_directory would match the pattern
    # and be read while it is being rewritten, so exclude it.
    target = os.path.abspath(output_file)
    tagged_files = sorted(
        (fp for fp in glob(input_directory + "*-tagged.jsonl")
         if os.path.abspath(fp) != target),
        key=partition_order,
    )
    with jsonlines.open(output_file, mode='w') as writer:
        for tagged_file in tqdm(tagged_files):
            with jsonlines.open(tagged_file) as reader:
                for obj in reader:
                    writer.write(obj)

In [None]:
input_directory = '/shared/3/projects/hiatus/partitions/reddit/'
output_file = '/shared/3/projects/hiatus/reddit/corpus-tagged.jsonl'
# Actually perform the merge: without this call the cell only defined the
# paths and the joined corpus was never written.
join_tagged_files(input_directory, output_file)

## Delete all the partitioned files

Delete the directory itself

In [None]:
# !rm -r /shared/3/projects/hiatus/partitions/reddit

## Partition the file into 100 parts

In [None]:
# Shell escape: `wc -l` prints the number of lines in the corpus file.
!wc -l /shared/3/projects/hiatus/reddit/corpus.jsonl

In [None]:
def partition_file(input_file, output_directory, chunks=100):
    """Split a JSON Lines file into at most ``chunks`` consecutive partitions.

    The input is streamed record by record. Every ``chunk_size`` records are
    handed to ``save_partition``; any remainder becomes one final, smaller
    partition. Partition indices run consecutively from 1 with no gaps.

    Args:
        input_file: Path of the JSON Lines file to split.
        output_directory: Destination passed through to ``save_partition``.
        chunks: Upper bound on the number of partitions to produce.
    """
    lines = count_lines(input_file)
    print("{} lines in file".format(lines))
    # Ceiling division, never below 1: rounding to nearest could yield a size
    # of 0 for small files, or leave a remainder that becomes partition
    # number chunks + 1.
    chunk_size = max(1, -(-lines // chunks))
    print("Chunk size: {} lines".format(chunk_size))
    count, chunk = 0, 1
    curr_lines = []

    with jsonlines.open(input_file) as reader:
        for obj in reader:
            curr_lines.append(obj)
            count += 1
            if count >= chunk_size:
                save_partition(curr_lines, output_directory, index=chunk)
                count, curr_lines = 0, []
                chunk += 1

    if curr_lines:
        # `chunk` was already advanced past the last full partition, so it is
        # the next free index; adding 1 again would skip a number.
        save_partition(curr_lines, output_directory, index=chunk)
    

def save_partition(json_lines, output_directory, index):
    """Write one partition's records to ``partition-<index>.jsonl``.

    Args:
        json_lines: Iterable of records to write, one per output line.
        output_directory: Directory that receives the file; created if missing.
        index: Partition number used in the file name.
    """
    if output_directory:
        # Create the target on demand so a fresh run does not fail on a
        # directory that was removed after a previous run.
        os.makedirs(output_directory, exist_ok=True)
    # Gives the same path as plain concatenation when the directory ends with
    # a separator, and a correct one when it does not.
    out = os.path.join(output_directory, "partition-{}.jsonl".format(index))
    print("Saving {}".format(out))
    with jsonlines.open(out, mode='w') as writer:
        writer.write_all(json_lines)

        
def count_lines(input_file):
    """Return the number of lines in ``input_file``.

    The file is opened in binary mode, so no text decoding takes place, and
    the handle is closed as soon as counting finishes.
    """
    with open(input_file, 'rb') as handle:
        return sum(1 for _ in handle)

In [None]:
# Source corpus and the directory that receives its partitions.
input_file = '/shared/3/projects/hiatus/reddit/corpus.jsonl'
output_directory = '/shared/3/projects/hiatus/partitions/reddit/'
partition_file(input_file=input_file, output_directory=output_directory, chunks=100)

## Tag each partition with 1 CPU 

Set a high nice value (i.e. low scheduling priority) so we don't hog the server

**Tagger config**

In [None]:
# Settings applied on top of the loaded configuration.
tagger_overrides = {
    'use_gpu': False,
    'biber': True,
    'binary_tags': True,
    'function_words': True,
    'token_normalization': 100,
}
config = load_config()
config.update(tagger_overrides)
# Module-level tokenizer; tag_partition looks it up when counting tokens.
tokenizer = load_tokenizer(use_gpu=False)
config

In [None]:
def tag_partition(config, input_file, output_file):
    """Tag every record of one partition and write the result to ``output_file``.

    Each record gets a ``num_tokens`` field (length of the module-level
    ``tokenizer`` output for ``fullText``). Records with at least 10 tokens
    also get an ``encodings`` field from ``encode_text``. Tagging is
    best-effort: a record that raises is still written with whatever fields
    were set before the error, but the failure is now counted and reported
    on stderr instead of vanishing silently.

    Args:
        config: Tagger configuration forwarded to ``encode_text``.
        input_file: Path of the JSON Lines partition to read.
        output_file: Path of the JSON Lines file to write.
    """
    print("Tagging file " + input_file)
    failures = 0
    with jsonlines.open(input_file) as reader:
        with jsonlines.open(output_file, mode='w') as writer:
            for obj in reader:
                try:
                    num_tokens = len(tokenizer(obj['fullText']))
                    obj['num_tokens'] = num_tokens
                    if num_tokens >= 10:
                        obj['encodings'] = encode_text(text=obj['fullText'], config=config)
                except Exception as err:
                    failures += 1
                    # Show only the first few errors so a systematic failure
                    # does not flood the output with one line per record.
                    if failures <= 5:
                        print("Failed to tag a record in {}: {!r}".format(input_file, err),
                              file=sys.stderr)
                writer.write(obj)
    if failures:
        print("{}: {} record(s) could not be tagged".format(input_file, failures),
              file=sys.stderr)

In [None]:
NICENESS = 20  # increment passed to os.nice() by the pool initializer
WORKERS = 100  # number of worker processes in the pool
process_args = []
# "*.jsonl" also matches "*-tagged.jsonl" files left by an earlier run; drop
# them so a re-run does not tag its own outputs into "...-tagged-tagged.jsonl".
partition_files = [
    fp for fp in glob(os.path.join(output_directory, "*.jsonl"))
    if not fp.endswith('-tagged.jsonl')
]

# One (config, input path, output path) job per partition.
for fp in partition_files:
    fname = os.path.basename(fp).replace('.jsonl', '') + '-tagged.jsonl'
    out = os.path.join(output_directory, fname)
    process_args.append((config, fp, out))

len(process_args)

In [None]:
def set_nicesness():
    """Pool initializer: raise the calling process's nice value by ``NICENESS``."""
    os.nice(NICENESS)

# Fan the tagging jobs out over the pool; each worker process runs
# set_nicesness once when it starts.
with Pool(processes=WORKERS, initializer=set_nicesness) as pool:
    pool.starmap(tag_partition, process_args)

## Join the partitioned files into a single file

In [None]:
def join_tagged_files(input_directory, output_file):
    """Concatenate every ``*-tagged.jsonl`` file in ``input_directory`` into ``output_file``.

    Files are merged in ascending order of the first number in their file
    name (``partition-2`` before ``partition-10``), so the merged record order
    is the same on every run instead of following glob's arbitrary order.

    Args:
        input_directory: Directory prefix (with trailing separator) to search.
        output_file: Path of the merged JSON Lines file to write.
    """
    import re  # local import: only needed for the sort key below

    def partition_order(path):
        # Order by the first integer in the file name; names without a number
        # sort last, and the path itself breaks any remaining ties.
        match = re.search(r'\d+', os.path.basename(path))
        return (match is None, int(match.group()) if match else 0, path)

    # An existing output file inside input_directory would match the pattern
    # and be read while it is being rewritten, so exclude it.
    target = os.path.abspath(output_file)
    tagged_files = sorted(
        (fp for fp in glob(input_directory + "*-tagged.jsonl")
         if os.path.abspath(fp) != target),
        key=partition_order,
    )
    with jsonlines.open(output_file, mode='w') as writer:
        for tagged_file in tqdm(tagged_files):
            with jsonlines.open(tagged_file) as reader:
                for obj in reader:
                    writer.write(obj)

In [None]:
input_directory = '/shared/3/projects/hiatus/partitions/reddit/'
output_file = '/shared/3/projects/hiatus/reddit/corpus-tagged.jsonl'
# Actually perform the merge: without this call the cell only defined the
# paths and the joined corpus was never written.
join_tagged_files(input_directory, output_file)

## Delete all the partitioned files

Delete the directory itself

In [None]:
# !rm -r /shared/3/projects/hiatus/partitions/reddit