# Download State Analysis

This notebook analyzes download state JSON files using the DownloadStateAnalyzer class.

## Initialization

In [12]:
import sys
sys.path.append('..')

from download_state_analyzer import DownloadStateAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import json
import csv

In [None]:
# Configuration
# Results are written below this directory; re-running the cell is safe because
# an already-existing directory is accepted.
output_dir = 'analysis_results'
os.makedirs(output_dir, exist_ok=True)

# NOTE: the pattern contains no wildcard, so it matches at most the single file
# ../download_state.json. Widen the pattern to analyze several state files.
json_files = glob.glob('../download_state.json')

print(f"Found {len(json_files)} JSON files to analyze:")
for state_path in json_files:
    print(f"- {os.path.basename(state_path)}")

## Analyze Multiple JSON Files

In [None]:
def analyze_json_file(json_file):
    """Create an analyzer for one state file and print its status summary.

    Args:
        json_file: Path of the download-state JSON file; it is handed
            unchanged to ``DownloadStateAnalyzer``.

    Returns:
        The ``DownloadStateAnalyzer`` instance built for ``json_file``.
    """
    file_label = os.path.basename(json_file)
    print(f"\nAnalyzing {file_label}...")
    state_analyzer = DownloadStateAnalyzer(json_file)

    # Basic stats: the heading and the summary end up on consecutive lines.
    print("\nStatus Summary:", state_analyzer.get_status_summary(), sep="\n")

    # Pie chart of the status distribution (currently disabled):
    #plt.figure(figsize=(8, 6))
    #plt.pie(status_summary, labels=status_summary.index, autopct='%1.1f%%')
    #plt.title(f'Download Status Distribution - {os.path.basename(json_file)}')
    #plt.show()

    return state_analyzer

# Analyze each JSON file
# Maps every state-file path found above to the analyzer built for it.
analyzers = {state_path: analyze_json_file(state_path) for state_path in json_files}


## Compare Results Across Files

In [None]:
# Compare status distributions
# One column per analyzed file (keyed by base name). Entries that are missing
# for a file after alignment are filled with 0.
# NOTE(review): presumably get_status_summary() returns a Series indexed by
# status - confirm against DownloadStateAnalyzer.
status_comparison = pd.DataFrame({
    os.path.basename(f): analyzer.get_status_summary()
    for f, analyzer in analyzers.items()
}).fillna(0)

print("Status Comparison Across Files:")
print(status_comparison)

# Plot comparison
if status_comparison.empty:
    # DataFrame.plot raises TypeError ("no numeric data to plot") on an empty
    # frame, which happens when no state file was found; skip the chart then.
    print("No status data to plot.")
else:
    status_comparison.plot(kind='bar', figsize=(12, 6))
    plt.title('Download Status Comparison')
    plt.xlabel('Status')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    # Anchor the legend outside the axes so it does not cover the bars.
    plt.legend(bbox_to_anchor=(1.05, 1))
    plt.tight_layout()
    plt.show()

## Detailed Analysis for Each File

In [None]:
for state_path, state_analyzer in analyzers.items():
    file_label = os.path.basename(state_path)
    print(f"\n=== Detailed Analysis for {file_label} ===")

    # Daily downloads
    per_day = state_analyzer.get_daily_downloads()
    print(
        "\nDownload Statistics:",
        f"Total days with downloads: {len(per_day)}",
        f"Average downloads per day: {per_day.mean():.2f}",
        f"Maximum downloads in a day: {per_day.max()}",
        sep="\n",
    )

    # Failed downloads: reported only when there is at least one
    failures = state_analyzer.get_failed_downloads()
    if not failures.empty:
        print(f"\nFailed Downloads: {len(failures)}")
        # Frequency of each distinct value in the 'error' column
        print("\nError Types:", failures['error'].value_counts(), sep="\n")

    # Export results
    report_path = state_analyzer.export_analysis(output_dir)
    print(f"\nDetailed analysis exported to: {report_path}")

## Custom Analysis

Add your custom analysis below:

In [None]:
# Example: Compare download patterns over time
# All files share one axes so their daily series can be compared directly.
fig, ax = plt.subplots(figsize=(15, 8))
for state_path, state_analyzer in analyzers.items():
    state_analyzer.get_daily_downloads().plot(ax=ax, label=os.path.basename(state_path))

ax.set_title('Download Patterns Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Downloads')
# Legend is anchored outside the axes so it does not hide the curves.
ax.legend(bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()

## Downloads still pending

In [2]:

import pandas as pd
import json
json_file2 = '../download_state_aria2.json'

# Load JSON file
# JSON text is UTF-8 by specification; naming the encoding keeps the read
# independent of the platform's locale default.
with open(json_file2, "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert JSON to DataFrame: every top-level key becomes a row label
df = pd.DataFrame.from_dict(data, orient="index")

# Filter rows where 'status' is 'completed' and 'checksum_valid' is False.
# The explicit `== False` comparison (rather than `~`) leaves out rows whose
# checksum_valid value is missing.
completed_but_invalid_df = df[(df['status'] == 'completed') & (df['checksum_valid'] == False)]
completed_but_invalid_df


Unnamed: 0,status,timestamp,md5,path,tool,verified_with_md5,checksum_valid


In [3]:
# Entries whose status is anything other than 'completed'
failed_df = df[(df['status'] != 'completed')]
print(f"Completed with 'lftp' but invalid: {len(completed_but_invalid_df)}")
print(f"Failed not yet downloaded: {len(failed_df)}")

# Append the two row sets. The `+` operator would instead add the frames
# element-wise after aligning their labels, which keeps the row labels but
# replaces the row contents with NaN. The two selections cannot overlap
# (status != 'completed' vs. status == 'completed'), so no row is duplicated.
pending_df = pd.concat([failed_df, completed_but_invalid_df])
print(f"Number of files to download: {len(pending_df)}")

Completed with 'lftp' but invalid: 0
Failed not yet downloaded: 0
Number of files to download: 0


In [4]:
import csv
metadata_file = "../project_OEP00000860_data_download_link.txt"
# df = pd.read_csv(metadata_file, delimiter="\t")

# Index the tab-separated metadata rows by their "fileName" column.
rows = {}
try:
    # The csv module documentation asks for newline='' so that line endings
    # inside quoted fields are passed to the parser untouched.
    with open(metadata_file, "r", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            rows[row["fileName"]] = row
except csv.Error as e:
    # Re-raise with the file name for context, keeping the original cause.
    raise csv.Error(f"Error parsing {metadata_file}: {e}") from e

# Report the run each pending file belongs to. Only the row labels of
# pending_df are needed; they are looked up as file names in the metadata.
for index in pending_df.index:
    metadata_row = rows.get(index)
    if metadata_row is None:
        # A pending file without a metadata entry is reported, not a KeyError.
        print(f"Missing: {index} (no entry in {metadata_file})")
        continue
    run_id = metadata_row["run_id"]
    print(f"Missing: {index} from {run_id}")
