In [None]:
import os
import sys
import warnings

# Make modules in the parent directory importable before the project imports below.
sys.path.append('../')

from data_partitioner import *
from parallel_tagging import *
from biberplus.tagger import load_config
from biberplus.tagger.tagger_utils import load_tokenizer

In [None]:
# Name of the compressed dump to process. The text before its first '.' is
# reused by later cells to name the working directories and the counts file.
file_name = "RC_2019-11.gz"

In [None]:
# The text before the first '.' in the file name labels this run's working directories.
dump_stem = file_name.partition('.')[0]

# Source dump, scratch directories for the raw and the tagged partitions,
# and the directory that receives the final outputs.
input_path = os.path.join('/shared/4/datasets/long-reddit/', file_name)
partition_dir = os.path.join('/shared/3/projects/hiatus/tagged_data/partitions/reddit', dump_stem)
tag_dir = os.path.join('/shared/3/projects/hiatus/tagged_data/partitions/tagged-reddit', dump_stem)
output_dir = '/shared/3/projects/hiatus/tagged_data/long-reddit/'

# Create any directory that is missing; existing ones are left as they are.
for directory in (partition_dir, tag_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

# Starts empty; handed to the tagging step as `post_counts` and written to a
# TSV file afterwards.
author_subreddit_counts = dict()


## Partition the file into 100 parts

In [None]:
%%time
# Split the input dump into 100 parts, written under partition_dir.
partition_file(
    input_path,
    partition_dir,
    chunks=100,
)

## Tag each partition with 1 CPU 

Run the workers at a low scheduling priority (i.e. a high nice value) so we don't hog the server

**Tagger config**

In [None]:
# Load the tagger configuration, then override the options used for this run.
# GPU use is switched off both here and for the tokenizer below.
config = load_config()
config.update(dict(
    use_gpu=False,
    biber=True,
    binary_tags=True,
    function_words=True,
    token_normalization=100,
))
# NOTE(review): `tokenizer` is not passed to any later call in the cells
# visible here — confirm whether it is still needed.
tokenizer = load_tokenizer(use_gpu=False)
# Left as the cell's last expression so the notebook displays the final config.
config

In [None]:
%%time
# Ignore FutureWarning from here on; simplefilter installs the filter
# globally, so it also applies to the cells that run after this one.
# (`warnings` is imported in the notebook's top import cell.)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Tag every partition found in partition_dir and write the results to tag_dir.
# `post_counts` receives the author/subreddit dictionary that a later cell
# writes to a TSV file.
# NOTE(review): 101 workers are requested while the file was partitioned with
# chunks=100 — confirm the extra worker is intentional.
tag_partitions(config,
               input_directory=partition_dir,
               output_directory=tag_dir,
               num_workers=101,
               post_counts=author_subreddit_counts,
               default_niceness=0
               )

In [None]:
%%time
# Build the counts path with os.path.join (as the join step does for its
# output file) so it stays valid even if output_dir is ever written without
# a trailing slash. The file is named "<file name up to first '.'>-counts.tsv".
output_tsv = os.path.join(output_dir, file_name.split('.')[0] + '-counts.tsv')
# Write the post counts to a TSV file
write_to_tsv(author_subreddit_counts, output_tsv)

## Join the partitioned files

In [None]:
# Join the tagged partitions in tag_dir into a single file that carries the
# input dump's name and lives in output_dir.
join_tagged_files(
    input_directory=tag_dir,
    output_file=os.path.join(output_dir, file_name),
)

## Delete the temp directories

In [None]:
# Clean up both scratch directories: the raw partitions first, then the tagged ones.
for scratch_dir in (partition_dir, tag_dir):
    delete_partitioned_files(scratch_dir)