In [19]:
import time
import pandas as pd
from os import listdir

In [20]:
# Run configuration shared by the cells below.
CHUNK_SIZE: int = 2 ** 16          # passed as `chunksize` when reading each input file
DATA_DIR_PATH: str = "quotebank"   # directory whose files are listed and processed
OUT_DIR_PATH: str = "out"          # directory receiving the filtered CSV files

In [21]:
!mkdir out

In [22]:
# Names of all entries in the data directory, in lexicographic order.
data_files = sorted(listdir(DATA_DIR_PATH))

In [23]:
# For every file listed in `data_files`, stream it in chunks of CHUNK_SIZE rows,
# keep the rows having at least one URL containing 'sport', and write them to
# '<OUT_DIR_PATH>/sport-<file stem>.csv'.
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time

for filename in data_files:
    file_path: str = '{}/{}'.format(DATA_DIR_PATH, filename)
    out_path:  str = '{}/sport-{}.csv'.format(OUT_DIR_PATH, filename.split('.')[0])

    i: int = 0            # keeps track of the chunk number
    lines_read: int = 0   # rows actually read from the input file
    total_lines: int = 0  # keeps track of output file length

    # compression='infer' lets pandas pick the codec from the file extension
    # instead of relying on the raw extension matching a pandas codec name.
    with pd.read_json(file_path, lines=True, compression='infer', chunksize=CHUNK_SIZE) as df_reader:
        for chunk in df_reader:
            if i % 16 == 0:
                print(f"  - Processing chunk #{i} (size = {chunk.shape[0]}) for file '{file_path}'")

            # keep only lines containing the 'sport' substring in the url(s)
            has_sport_url = [any('sport' in url for url in url_list) for url_list in chunk.urls]
            sports_quotes = chunk[has_sport_url]

            # The first chunk truncates the output file and writes the header, so
            # re-running the cell does not append to a previous run's results;
            # later chunks append without a header.
            export_header: bool = (i == 0)
            sports_quotes.to_csv(out_path, mode='w' if export_header else 'a', header=export_header)

            lines_read += chunk.shape[0]
            total_lines += sports_quotes.shape[0]
            i += 1

    # summary at the end of a file (denominator is the real row count: the last
    # chunk is usually smaller than CHUNK_SIZE)
    print(f">>> Processed a total of {i} chunks for file '{file_path}' => total of {total_lines} lines out of {lines_read}")


print("--- %s seconds ---" % (time.perf_counter() - start_time))

  - Processing chunk #0 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #16 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #32 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #48 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #64 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #80 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #96 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #112 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #128 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #144 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #160 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processing chunk #176 (size = 65536) for file 'quotebank/quotes-2015.json.bz2'
  - Processi

In [24]:
# Convert 7471 seconds to minutes.
# NOTE(review): 7471 is presumably the elapsed time printed by the extraction
# cell, copied by hand — confirm, or compute it from the timing variables.
7471 / 60

124.51666666666667

In [32]:
!find out -iname '*.csv' -exec bzip2 -kzv {} \;
!mkdir out_bz2
!find out -iname '*.csv.bz2' -exec mv {} out_bz2 \;

  out/sport-quotes-2015.csv:  4.201:1,  1.904 bits/byte, 76.20% saved, 2586161046 in, 615620062 out.
  out/sport-quotes-2016.csv:  4.384:1,  1.825 bits/byte, 77.19% saved, 2052387478 in, 468111951 out.
  out/sport-quotes-2017.csv:  4.561:1,  1.754 bits/byte, 78.07% saved, 5103764884 in, 1119072564 out.
  out/sport-quotes-2020.csv:  4.717:1,  1.696 bits/byte, 78.80% saved, 578289196 in, 122593618 out.
  out/sport-quotes-2019.csv:  4.671:1,  1.713 bits/byte, 78.59% saved, 2621718521 in, 561288174 out.
  out/sport-quotes-2018.csv:  4.769:1,  1.678 bits/byte, 79.03% saved, 4473331888 in, 938035681 out.
