In [None]:
import os
import json
import random
from datetime import datetime, timedelta

def generate_random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The shift is drawn uniformly from ``0`` up to and including the number of
    days between ``start`` and ``end``, so both endpoints are possible results.
    """
    span_in_days = int((end - start).days)
    day_offset = random.randint(0, span_in_days)
    return start + timedelta(days=day_offset)

def generate_flight_data(cities, null_probability):
    """Build one synthetic flight record as a dict.

    Args:
        cities: Sequence of city names to draw the origin and destination
            from. The destination is drawn from the entries that differ from
            the chosen origin.
        null_probability: Chance, checked independently for every field,
            that the field's value is replaced by ``None``.

    Returns:
        A dict with the keys ``date`` (``YYYY-MM-DD`` string within 2023),
        ``origin_city``, ``destination_city``, ``flight_duration_secs``
        (30 minutes to 15 hours, in seconds) and ``passengers_on_board``
        (1 to 400); any of the values may be ``None``.
    """
    year_start, year_end = datetime(2023, 1, 1), datetime(2023, 12, 31)
    flight_day = generate_random_date(year_start, year_end)

    departure = random.choice(cities)
    other_cities = [name for name in cities if name != departure]
    arrival = random.choice(other_cities)

    shortest_secs = 30 * 60        # 30 minutes
    longest_secs = 15 * 60 * 60    # 15 hours

    clean_record = {
        'date': flight_day.strftime('%Y-%m-%d'),
        'origin_city': departure,
        'destination_city': arrival,
        'flight_duration_secs': random.randint(shortest_secs, longest_secs),
        'passengers_on_board': random.randint(1, 400),
    }

    # One independent draw per field, in key order, decides whether it is blanked.
    return {
        field: (None if random.random() < null_probability else value)
        for field, value in clean_record.items()
    }

def generate_json_files(output_dir, num_files, num_cities, null_probability):
    """Generate synthetic flight records and write them as JSON files.

    ``num_files`` batches of 50 to 100 records are generated. Each batch is
    assigned to a file named ``<MM-YY>-<city>-flights.json`` where the city is
    picked at random from ``City_1`` .. ``City_<num_cities>``. Because that
    name depends only on the month-year prefix and the city, several batches
    can map to the same file; such batches are merged and written together so
    that a later batch does not silently overwrite an earlier one. As a
    result, at most ``num_cities`` distinct files are produced.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        num_files: Number of record batches to generate.
        num_cities: Number of distinct city names available.
        null_probability: Per-field probability passed to
            ``generate_flight_data``.
    """
    cities = [f'City_{i}' for i in range(1, num_cities + 1)]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Computed once so every file of this run shares the same prefix.
    month_year = datetime.now().strftime('%m-%y')

    # Target path -> all records destined for that path.
    records_by_path = {}
    for _ in range(num_files):
        origin_city = random.choice(cities)
        file_path = os.path.join(output_dir, f'{month_year}-{origin_city}-flights.json')

        num_records = random.randint(50, 100)
        batch = [generate_flight_data(cities, null_probability) for _ in range(num_records)]
        records_by_path.setdefault(file_path, []).extend(batch)

    # Each file is written exactly once, with every batch assigned to it.
    for file_path, flight_data in records_by_path.items():
        with open(file_path, 'w') as json_file:
            json.dump(flight_data, json_file, indent=4)

# Configuration
# Raw string: '\D' is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning; the resulting path value is unchanged.
output_directory = r'E:\DHA'
number_of_files = 5000
number_of_cities = random.randint(100, 200)
# Per-field NULL probability, drawn from 0.001 .. 0.005 (bounds given low-to-high).
null_value_probability = random.uniform(0.001, 0.005)

# Generate files
generate_json_files(output_directory, number_of_files, number_of_cities, null_value_probability)

In [1]:
import os
import json
import time
import numpy as np
from collections import defaultdict
from glob import glob

def is_dirty_record(record):
    """Tell whether at least one value in ``record`` is ``None``."""
    for field_value in record.values():
        if field_value is None:
            return True
    return False

def process_json_files(input_dir):
    """Aggregate flight statistics from every ``*.json`` file in ``input_dir``.

    Each file is expected to hold a JSON list of record dicts. A record is
    counted as dirty when ``is_dirty_record`` reports it; dirty records only
    contribute to the counters, not to the per-city statistics.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``*.json``.

    Returns:
        A dict with the keys:
          * ``total_records`` / ``dirty_records``: record counters.
          * ``run_duration``: elapsed seconds for reading and aggregating.
          * ``avg_flight_durations_top_25`` / ``p95_flight_durations_top_25``:
            mean and 95th percentile of ``flight_duration_secs`` for the (up
            to) 25 destination cities with the most clean records.
          * ``max_passengers_arrived_city`` / ``max_passengers_left_city``:
            city with the largest passenger total as destination / origin,
            or ``None`` when there are no clean records at all.
    """
    total_records = 0
    dirty_records = 0
    flight_durations = defaultdict(list)
    passengers_arrived = defaultdict(int)
    passengers_left = defaultdict(int)
    # perf_counter is meant for measuring intervals and is unaffected by
    # system clock adjustments, unlike time.time().
    start_time = time.perf_counter()

    for file_path in glob(os.path.join(input_dir, '*.json')):
        with open(file_path, 'r') as file:
            data = json.load(file)
        # The file is already closed here; only the parsed data is needed.
        for record in data:
            total_records += 1
            if is_dirty_record(record):
                dirty_records += 1
                continue
            destination = record['destination_city']
            passengers = record['passengers_on_board']
            flight_durations[destination].append(record['flight_duration_secs'])
            passengers_arrived[destination] += passengers
            passengers_left[record['origin_city']] += passengers

    # Calculate AVG and P95 for top 25 destination cities (ranked by number of
    # clean flights arriving there).
    top_25_destinations = sorted(
        flight_durations, key=lambda city: len(flight_durations[city]), reverse=True
    )[:25]
    avg_flight_durations = {city: np.mean(flight_durations[city]) for city in top_25_destinations}
    p95_flight_durations = {city: np.percentile(flight_durations[city], 95) for city in top_25_destinations}

    # Find cities with max passengers arrived and left. default=None keeps an
    # empty directory (or one with only dirty records) from raising ValueError.
    max_passengers_arrived = max(passengers_arrived, key=passengers_arrived.get, default=None)
    max_passengers_left = max(passengers_left, key=passengers_left.get, default=None)

    # Stop the clock after aggregation so the value covers the whole run.
    duration = time.perf_counter() - start_time

    return {
        'total_records': total_records,
        'dirty_records': dirty_records,
        'run_duration': duration,
        'avg_flight_durations_top_25': avg_flight_durations,
        'p95_flight_durations_top_25': p95_flight_durations,
        'max_passengers_arrived_city': max_passengers_arrived,
        'max_passengers_left_city': max_passengers_left
    }

# Configuration
input_directory = 'E:\\DHA\\Gen'

# Run the aggregation over every JSON file found in the input directory
results = process_json_files(input_directory)

# Headline counters
print(f"Total records processed: {results['total_records']}")
print(f"Dirty records count: {results['dirty_records']}")
print(f"Total run duration: {results['run_duration']} seconds")

# Per-city tables: each entry pairs the heading to print with the result key
# holding a {city: seconds} mapping.
per_city_tables = (
    ("\nAVG flight durations for top 25 destination cities:", 'avg_flight_durations_top_25'),
    ("\nP95 flight durations for top 25 destination cities:", 'p95_flight_durations_top_25'),
)
for heading, result_key in per_city_tables:
    print(heading)
    for city, seconds in results[result_key].items():
        print(f"{city}: {seconds:.2f} seconds")

# Cities with the largest passenger totals
print(f"\nCity with max passengers arrived: {results['max_passengers_arrived_city']}")
print(f"City with max passengers left: {results['max_passengers_left_city']}")

Total records processed: 14098
Dirty records count: 132
Total run duration: 0.05157065391540527 seconds

AVG flight durations for top 25 destination cities:
City_169: 27744.51 seconds
City_69: 28602.53 seconds
City_22: 26839.89 seconds
City_81: 30878.32 seconds
City_114: 25526.19 seconds
City_33: 28149.16 seconds
City_131: 26690.37 seconds
City_109: 27467.68 seconds
City_144: 27502.73 seconds
City_75: 28150.55 seconds
City_175: 28812.43 seconds
City_115: 24463.97 seconds
City_125: 26604.75 seconds
City_7: 26669.79 seconds
City_117: 25164.04 seconds
City_155: 28795.99 seconds
City_122: 25471.91 seconds
City_56: 27840.76 seconds
City_107: 27789.44 seconds
City_16: 27482.90 seconds
City_129: 26617.72 seconds
City_97: 28374.30 seconds
City_170: 24630.48 seconds
City_164: 24637.46 seconds
City_15: 27617.42 seconds

P95 flight durations for top 25 destination cities:
City_169: 51488.00 seconds
City_69: 51444.40 seconds
City_22: 50839.30 seconds
City_81: 51303.40 seconds
City_114: 50445.40 se

In [None]:
#!pip install numpy