In [80]:
import os
import csv
import json
from multiprocessing import Pool
from io import StringIO
from time import time

Fields in json object

Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name

In [131]:
basefolder = "/home/ubuntu/data/tmp/de/wiki_sentences/"
filename = "wikisent2.json"
filepath = os.path.join(basefolder, filename)

In [122]:
header = [
    "year",
    "title",
    "origin",
    "director",
    "cast",
    "genre",
    "wiki_page",
    "plot"
]

with open(basefolder+"wikisent2.txt","r") as fin, open(basefolder+"wikisent2.json","w") as fout:
    for line in fin:
        sline = line.strip()
        js = {"sentence": sline}
        fout.write(f"{json.dumps(js)}\n")

In [82]:
class ChunkedReader(object):
    
    def __init__(self, filepath, number_of_chunks):
        self.number_of_chunks = number_of_chunks
        self.filepath = filepath

    def read_block_lines(self, block):
        """
        
        """
        assert 0 <= block and block < self.number_of_chunks
        assert 0 < self.number_of_chunks

        with open(self.filepath,"r+") as filehandle:
            filehandle.seek(0,2)
            file_size = filehandle.tell()

            ini = int(file_size * block / self.number_of_chunks)
            end = int(file_size * (1 + block) / self.number_of_chunks)

            if ini <= 0:
                filehandle.seek(0)
            else:
                #ini can be in the middle of the line: read line first
                filehandle.seek(ini-1) 
                filehandle.readline()

            while filehandle.tell() < end:
                line = filehandle.readline()
                yield json.loads(line)

In [121]:
number_of_chunks = 100
number_of_processes = 4

def process_chunk(args):
    reader, block = args
    counter = 0
    age_sum = 0
    for js in reader.read_block_lines(block):
        try:
            age = int(js["year"])
        except ValueError:
            print(js)
            continue
            
        counter += 1
        age_sum += age
        
    return age_sum, counter


pool = Pool(number_of_processes)
fin = ChunkedReader(filepath, number_of_chunks)

start = time()

age_super_sum = 0
super_count = 0

for age_sum, count in pool.imap_unordered(process_chunk,zip([fin]*number_of_chunks,range(number_of_chunks))):
    super_count += count
    age_super_sum += age_sum
    
average_age = age_super_sum/super_count

print(f"average age {average_age}")
print("in {} seconds".format(time()-start))

{'year': 'Release Year', 'title': 'Title', 'origin': 'Origin/Ethnicity', 'director': 'Director', 'cast': 'Cast', 'genre': 'Genre', 'wiki_page': 'Wiki Page', 'plot': 'Plot'}
average age 1981.314252135527
in 0.3518540859222412 seconds


In [111]:
age = 0
count = 0

start = time()
with open(filepath, "r") as fin:
    for line in fin:
        js = json.loads(line)
        try:
            age += int(js["year"])
        except ValueError:
            continue
        count += 1
        
average_age = age/count
print(f"average age {average_age}")
print("in {} seconds".format(time()-start))

average age 1981.314252135527
in 0.486339807510376 seconds


In [124]:
number_of_chunks = 100
number_of_processes = 4

def process_chunk(args):
    reader, block = args
    counter = 0
    for js in reader.read_block_lines(block):
        counter += len(js["sentence"].split(" "))
        
    return counter


pool = Pool(number_of_processes)
fin = ChunkedReader(filepath, number_of_chunks)

start = time()

super_count = 0

for count in pool.imap_unordered(process_chunk,zip([fin]*number_of_chunks,range(number_of_chunks))):
    super_count += count
    

print(f"word count {super_count}")
print("in {} seconds".format(time()-start))

word count 151523090
in 42.168298959732056 seconds


In [134]:

count = 0

start = time()
with open(filepath, "r") as fin:
    for line in fin:
        js = json.loads(line)
        count += len(js["sentence"].split(" "))
        
print(f"word count {count}")
print("in {} seconds".format(time()-start))

word count 187824
in 0.04913592338562012 seconds


In [132]:
filepaths = list()
for filename in os.listdir(basefolder):
    filepath = basefolder + filename
    filepaths.append(filepath)
    
    

In [133]:
number_of_processes = 4

def process_chunk(filepath):
    with open(filepath,"r") as fin:
        counter = 0
        for line in fin:
            js = json.loads(line)
            counter += len(js["sentence"].split(" "))

    return counter


pool = Pool(number_of_processes)

start = time()

super_count = 0
for count in pool.imap_unordered(process_chunk,filepaths):
    super_count += count
    

print(f"word count {super_count}")
print("in {} seconds".format(time()-start))

word count 151523090
in 16.944016933441162 seconds
