In [None]:
import os
from glob import glob

from datasets import load_dataset
import pandas as pd

from paths.paths_2013 import TEXTS_DIR, RESULTS_DIR, DATA_DIR, YEAR

In [None]:
def write_to_file(df_chunk, results_dir=None):
    """Append one batch of comments to per-subreddit text files.

    The batch is grouped by ``subreddit``; the ``body`` values of every group
    are joined with single spaces and appended to ``subreddit_<name>_.txt``
    inside ``results_dir``.

    Args:
        df_chunk: Column-oriented batch (anything ``pd.DataFrame`` accepts)
            containing at least the columns ``subreddit`` and ``body``.
        results_dir: Directory receiving the text files. Defaults to
            ``TEXTS_DIR``. It is created when it does not exist yet.

    Returns:
        ``df_chunk`` unchanged, so the function can be used as a batched
        ``map`` callback.
    """
    if results_dir is None:
        results_dir = TEXTS_DIR
    os.makedirs(results_dir, exist_ok=True)

    df = pd.DataFrame(df_chunk)
    # A missing body cannot be joined as text; skip such rows instead of
    # letting ' '.join raise TypeError in the middle of the stream.
    df = df.dropna(subset=['body'])
    texts = df.groupby('subreddit')['body'].apply(' '.join)

    for subreddit, text in texts.items():
        path = os.path.join(results_dir, f"subreddit_{subreddit}_.txt")
        # Files are opened in append mode and receive one write per batch.
        # Without a separator the last comment of the previous batch and the
        # first comment of this one would be glued into a single token.
        needs_separator = os.path.exists(path) and os.path.getsize(path) > 0
        # NOTE: the platform default encoding is kept on purpose so the files
        # stay readable by the reader that later concatenates them.
        with open(path, 'a') as f:
            if needs_separator:
                f.write(' ')
            f.write(text)
    return df_chunk

In [None]:
def comments_by_subreddit():
    """Stream the year's comment dump and split it into per-subreddit text files.

    Loads every parquet file matching ``RC_{YEAR}*`` under ``DATA_DIR`` as a
    streaming dataset and passes it through ``write_to_file`` in batches of
    10000 rows. ``write_to_file`` appends to its output files, so calling this
    function again adds the same comments a second time.
    """
    parquet_pattern = os.path.join(DATA_DIR, f'RC_{YEAR}*')
    stream = load_dataset(
        'parquet',
        data_files=parquet_pattern,
        split='train',
        streaming=True,
    )
    batches = stream.map(write_to_file, batched=True, batch_size=10000)
    # The loop body is intentionally empty: consuming the mapped stream is
    # what makes the callback (and therefore the file writes) run.
    for _ in batches:
        pass

In [None]:
class FileReader:
    """Context manager that iterates over a text file in fixed-size chunks.

    Inside a ``with`` block the instance is its own iterator: every step
    returns up to ``chunk_size`` characters, and iteration stops once the file
    yields an empty string.
    """

    def __init__(self, filename, chunk_size=int(1e8)):
        """Remember the path and chunk size; the file is opened in ``__enter__``."""
        self.filename = filename
        self.chunk_size = chunk_size
        self.file = None

    def __enter__(self):
        """Open the file for reading in text mode and return this reader."""
        self.file = open(self.filename, 'r')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file; exceptions raised in the ``with`` body propagate."""
        self.file.close()

    def __iter__(self):
        """Return the reader itself, which implements ``__next__``."""
        return self

    def __next__(self):
        """Return the next chunk, or raise ``StopIteration`` at end of file."""
        chunk = self.file.read(self.chunk_size)
        if chunk:
            return chunk
        raise StopIteration

In [None]:
class AppendFiles:
    """Context manager that appends several text files to one output file.

    The output file is opened in append mode on entry and closed on exit.
    Each input file is copied chunk by chunk and followed by a newline.

    Args:
        input_filenames: Iterable of paths whose contents are appended, in the
            order given.
        output_filename: Path of the file that receives the concatenation.
        chunk_size: Maximum number of characters read from an input file per
            step.
    """

    def __init__(self, input_filenames, output_filename, chunk_size=int(1e8)):
        self.chunk_size = chunk_size
        self.input_filenames = input_filenames
        self.output_filename = output_filename
        self.file = None

    def __enter__(self):
        """Open the output file in append mode and return this instance."""
        self.file = open(self.output_filename, 'a')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file; exceptions from the ``with`` body propagate."""
        self.file.close()

    def run(self):
        """Append every input file to the output file, in order."""
        for file in self.input_filenames:
            self.append_file(file)

    def append_file(self, input_file):
        """Copy ``input_file`` into the output file and terminate it with a newline."""
        # Forward the configured chunk size; previously it was stored but never
        # used, so the reader always fell back to its own default.
        with FileReader(input_file, chunk_size=self.chunk_size) as reader:
            for data in reader:
                self.file.write(data)
            self.file.write('\n')

In [None]:
def texts_to_single_file():
    """Concatenate the per-subreddit text files into one file for the year.

    Collects every path matching ``subreddit_*`` in ``TEXTS_DIR``, sorts the
    paths, and appends their contents to ``subreddits_{YEAR}.txt`` in
    ``RESULTS_DIR`` via ``AppendFiles``. ``AppendFiles`` opens its output in
    append mode, so running this again appends the same content once more.
    """
    output_path = os.path.join(RESULTS_DIR, f'subreddits_{YEAR}.txt')
    input_paths = sorted(glob(os.path.join(TEXTS_DIR, 'subreddit_*')))
    with AppendFiles(input_paths, output_path) as appender:
        appender.run()

In [None]:
# Stage 1: stream the comment dump and append each comment body to its
# subreddit's text file (appends on every run).
comments_by_subreddit()

In [None]:
# Stage 2: concatenate the per-subreddit text files into a single file for
# the year (appends on every run).
texts_to_single_file()