In [None]:
import boto3
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import tarfile

# Config: S3 location of the output archive and the local paths used to
# stage and unpack it. Later steps read `local_tar_path` and `extract_dir`.
bucket = "c144972a3751929l10036967t1w533267341463-bucket-pp6e047xcla7"  #enter your bucket name here
key = "comprehend/output/output.tar.gz"  #enter the output path
local_tar_path = "/tmp/output.tar.gz"
extract_dir = "/tmp/comprehend_output"

# Download the archive from S3 to the local staging path.
s3 = boto3.client("s3")
s3.download_file(Bucket=bucket, Key=key, Filename=local_tar_path)

# Extract
# The archive was just downloaded, so treat its member names as untrusted:
# an unfiltered extractall() lets entries such as "../../x" or absolute paths
# write outside `extract_dir`.
os.makedirs(extract_dir, exist_ok=True)
with tarfile.open(local_tar_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Interpreters with extraction filters: "data" rejects absolute
        # paths, parent-directory escapes, and special files.
        tar.extractall(path=extract_dir, filter="data")
    else:
        # Older interpreters: validate every member before extracting anything.
        extract_root = os.path.realpath(extract_dir)
        for member in tar.getmembers():
            member_target = os.path.realpath(os.path.join(extract_root, member.name))
            if os.path.commonpath([extract_root, member_target]) != extract_root:
                raise RuntimeError(
                    f"Refusing to extract {member.name!r}: path escapes {extract_dir}"
                )
            if member.issym() or member.islnk():
                # Links can redirect later writes outside the destination.
                raise RuntimeError(
                    f"Refusing to extract {member.name!r}: links are not allowed"
                )
        tar.extractall(path=extract_dir)

print("Extracted:", os.listdir(extract_dir))

# Parse JSON
# The extracted "output" file is read line by line, one JSON document per
# line. Every entry under a line's "KeyPhrases" list contributes one row
# (Text, Score) to `results`.
results = []
skipped_lines = 0
json_path = os.path.join(extract_dir, "output")
with open(json_path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Blank lines carry no data; skip them without counting as errors.
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            skipped_lines += 1
            continue
        if not isinstance(data, dict):
            # Valid JSON but not an object, so there is no "KeyPhrases" to read.
            skipped_lines += 1
            continue
        # `or []` also covers an explicit null value for "KeyPhrases".
        for phrase in data.get("KeyPhrases") or []:
            results.append({
                "Text": phrase.get("Text", ""),
                "Score": phrase.get("Score", 0.0),
            })

if skipped_lines:
    # Report dropped lines instead of discarding them silently.
    print(f"Warning: skipped {skipped_lines} line(s) in {json_path} that were not valid JSON objects")

# Fix the column set up front so an empty result still has a "Score" column
# and the sort below does not raise KeyError.
df_phrases = pd.DataFrame(results, columns=["Text", "Score"])
df_phrases = df_phrases.sort_values(by="Score", ascending=False)

# Visualize
# Count how often each phrase text occurs and keep the 15 most frequent.
top_phrases = df_phrases["Text"].value_counts().head(15)

if top_phrases.empty:
    # Plotting an empty Series raises inside pandas; report and skip instead.
    print("No key phrases found; nothing to plot.")
else:
    # Draw on an explicitly created Axes rather than pyplot's implicit
    # "current axes", so the labels land on this chart.
    fig, ax = plt.subplots(figsize=(10, 6))
    top_phrases.plot(kind="barh", ax=ax)
    ax.set_xlabel("Frequency")
    ax.set_title("Top 15 Key Phrases Across All Videos")
    # value_counts() is sorted descending; invert so the top phrase is at the top.
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()
