This notebook was used to check if there was a significant difference between the most common solution with the `json` library and the `orjson` library.

In [6]:
import json
import orjson
import time
from pathlib import Path
import os

PROJECT_ROOT = Path(os.getcwd()).parent

DATA_DIR = PROJECT_ROOT / "data"
FILENAME = "farmers-protest-tweets-2021-2-4.json"

# Original solution using json.loads
def original_solution(file_path):
    with open(file_path, 'r') as json_file:
        data = [
            (item['date'], item['user']['username'])
            for line in json_file
            for item in [json.loads(line.strip())]
            if 'date' in item and 'user' in item and 'username' in item['user']
        ]
    return data

# Optimized solution using orjson
def optimized_solution(file_path):
    data = []
    with open(file_path, 'r') as json_file:
        for line in json_file:
            try:
                item = orjson.loads(line)
                if 'date' in item and 'user' in item and 'username' in item['user']:
                    data.append((item['date'], item['user']['username']))
            except orjson.JSONDecodeError:
                continue
    return data

# File path
file_path = DATA_DIR / FILENAME

# Measure the time for original solution
start_time = time.time()
original_data = original_solution(file_path)
original_duration = time.time() - start_time

# Measure the time for optimized solution
start_time = time.time()
optimized_data = optimized_solution(file_path)
optimized_duration = time.time() - start_time

print(f"Original Duration: {original_duration:.4f} seconds")
print(f"Optimized Duration: {optimized_duration:.4f} seconds")

Original Duration: 5.0383 seconds
Optimized Duration: 3.1999 seconds


We can see there is 37% reduction of time. We go with the optimized version.