In [1]:
import os
import sys
sys.path.append('../')

from data_partitioner import *
from parallel_tagging import *
from biberplus.tagger import load_config
from biberplus.tagger.tagger_utils import load_tokenizer

INFO:numexpr.utils:Note: detected 160 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
INFO:numexpr.utils:Note: NumExpr detected 160 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [2]:
# Input file to process; the text before its first '.' names the per-file working dirs
file_name = "RC_2022-02.gz"

input_path = os.path.join('/shared/4/datasets/long-reddit/', file_name)
partition_dir = os.path.join(
    '/shared/3/projects/hiatus/tagged_data/partitions/reddit',
    file_name.partition('.')[0],
)
tag_dir = os.path.join(
    '/shared/3/projects/hiatus/tagged_data/partitions/tagged-reddit',
    file_name.partition('.')[0],
)
output_dir = '/shared/3/projects/hiatus/tagged_data/long-reddit/'

# Create every working directory up front; ones that already exist are left untouched
for directory in (partition_dir, tag_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

# Starts empty; passed to tag_partitions as post_counts and later written out as a TSV
author_subreddit_counts = dict()


## Partition the file into 100 parts

In [None]:
# Split the input file into 100 chunks (presumably written under partition_dir —
# confirm against partition_file in data_partitioner).
partition_file(input_path, partition_dir, chunks=100)

## Tag each partition with 1 CPU 

Set the nice value so the tagging jobs don't hog the server (note: a higher nice value means a lower scheduling priority).

**Tagger config**

In [3]:
# Load the tagger config, then override the options used for this CPU-only run
config = load_config()
config.update(
    use_gpu=False,
    biber=True,
    binary_tags=True,
    function_words=True,
    token_normalization=100,
)
# CPU tokenizer; NOTE(review): not referenced again in the visible cells — confirm it is needed
tokenizer = load_tokenizer(use_gpu=False)
# Display the effective configuration as the cell output
config

{'biber': True,
 'function_words': True,
 'binary_tags': True,
 'function_words_list': [],
 'token_normalization': 100,
 'use_gpu': False,
 'show_progress': False,
 'n_processes': 1,
 'processing_size': 10000,
 'return_text': False,
 'drop_last_batch_pct': 0.5}

In [4]:
%%time
import warnings

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

tag_partitions(config,
               input_directory=partition_dir,
               output_directory=tag_dir,
               num_workers=100,
               post_counts=author_subreddit_counts,
               default_niceness=0
               )

Aggregating post counts: 0file [00:00, ?file/s]

CPU times: user 182 ms, sys: 3.34 s, total: 3.52 s
Wall time: 4.7 s





In [11]:
%%time

output_tsv = output_dir + file_name.split('.')[0] + '-counts.tsv'
# Write the post counts to a TSV file
write_counts_to_tsv(author_subreddit_counts, output_tsv)

/shared/3/projects/hiatus/tagged_data/long-reddit/RC_2022-02-counts.tsv
CPU times: user 2.04 ms, sys: 0 ns, total: 2.04 ms
Wall time: 2.85 ms


## Join the partitioned files

In [None]:
# Merge the tagged partitions in tag_dir into one file in output_dir, named after the
# original input file. The path must be built with os.path.join: `os.join` does not
# exist and raises AttributeError.
# NOTE(review): `ouput_file` looks like a typo for `output_file`; it is kept as-is because
# the signature of join_tagged_files is not visible here — confirm against its definition.
join_tagged_files(input_directory=tag_dir,
                  ouput_file=os.path.join(output_dir, os.path.basename(input_path)))

## Delete the temp directories

In [None]:
# Clean up both temporary working directories: the raw partitions first, then the
# tagged partitions. Run this only after the joined output file has been written,
# since the join step reads from tag_dir.
delete_partitioned_files(partition_dir)
delete_partitioned_files(tag_dir)