## Summarize `technologyDescriptionAndIncludedProcesses`

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "summary" string.
json_schema = dict(
    title="SummarizationResponse",
    description="Response containing a summarized version of the technology description.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description emphasizing key raw materials and its use in construction.",
        }
    },
    required=["summary"],
)

# Prompt built from the Technology Description alone; the {text} and
# {json_schema} placeholders are filled in at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in summarizing technical manufacturing processes for building material products.
Summarize the following technology description into a concise paragraph that emphasizes:
- The key raw materials (e.g., PMMA, titanium dioxide, aluminum hydroxide, etc.)
Avoid excessive process details; focus on the material composition and intended use.

Technology Description:
{text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag; change it to whichever model is available locally.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def summarize_technology_description(text):
    """Summarize a Technology Description with the configured Ollama model.

    Fills the module-level ``prompt_template`` with ``text`` and the serialized
    ``json_schema``, then asks ``model`` for structured output that follows
    that schema.

    Args:
        text: The technology description to summarize.

    Returns:
        The ``"summary"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        text=text, json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed:
        return parsed["summary"]
    return None

# Location of the filtered EPD export.
csv_path = "../../data/pipeline2/sql/filtered_epd_data.csv"

number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has a usable description.
summaries = []

for idx, row in df.iterrows():
    tech_description = row.get("Technology Description", "")
    # Skip rows whose description is empty or not a string (e.g. a missing value).
    if not (tech_description and isinstance(tech_description, str)):
        print(f"Row {idx} has no Technology Description.")
        continue

    summary = summarize_technology_description(tech_description)
    # Keep the row index so each summary can be traced back to the CSV.
    summaries.append(
        {
            "row_index": idx,
            "Product Name": row.get("Product Name", ""),
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary,
        }
    )
    print(f"Summary for row {idx}:\n{summary}\n{'-'*50}\n")

# Create the output directory on first use, then build the target path.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{number_entries}_technology_summaries.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(summaries, f, indent=2)

print(f"All summaries saved to {output_file}")


### Iteration 2

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "summary" string.
json_schema = dict(
    title="SummarizationResponse",
    description="Response containing a summarized version of the technology description.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description emphasizing key raw materials and its use in construction.",
        }
    },
    required=["summary"],
)

# Second prompt iteration: a one-paragraph summary aimed at categorization.
# The {text} and {json_schema} placeholders are filled in at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in summarizing technical documents of building material products to facilitate categorization.
Summarize the following technology description into a concise paragraph emphasizing key raw materials and its use in construction.
If no materials are mentioned, avoid excessive process details.

Technology Description:
{text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag; change it to whichever model is available locally.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def summarize_technology_description(text):
    """Summarize a Technology Description with the configured Ollama model.

    Fills the module-level ``prompt_template`` with ``text`` and the serialized
    ``json_schema``, then asks ``model`` for structured output that follows
    that schema.

    Args:
        text: The technology description to summarize.

    Returns:
        The ``"summary"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        text=text, json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed:
        return parsed["summary"]
    return None

# Location of the filtered EPD export.
csv_path = "../../data/pipeline2/sql/filtered_epd_data.csv"

number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has a usable description.
summaries = []

for idx, row in df.iterrows():
    tech_description = row.get("Technology Description", "")
    # Skip rows whose description is empty or not a string (e.g. a missing value).
    if not (tech_description and isinstance(tech_description, str)):
        print(f"Row {idx} has no Technology Description.")
        continue

    summary = summarize_technology_description(tech_description)
    # Keep the row index so each summary can be traced back to the CSV.
    summaries.append(
        {
            "row_index": idx,
            "Product Name": row.get("Product Name", ""),
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary,
        }
    )
    print(f"Summary for row {idx}:\n{summary}\n{'-'*50}\n")

# Create the output directory on first use, then build the target path.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{number_entries}_tech_sum_one_paragraph.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(summaries, f, indent=2)

print(f"All summaries saved to {output_file}")


## Extract Composition Materials from `technologyDescriptionAndIncludedProcesses`

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "composition"
# string that names the raw materials.
json_schema = dict(
    title="CompositionExtractionResponse",
    description="Response containing a list of key composition materials extracted from the technology description.",
    type="object",
    properties={
        "composition": {
            "type": "string",
            "description": "A list of key raw materials mentioned in the technology description, e.g., PMMA, titanium dioxide, aluminum hydroxide.",
        }
    },
    required=["composition"],
)

# Prompt asking only for the raw materials named in the Technology Description;
# the {text} and {json_schema} placeholders are filled in at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in technical manufacturing processes for building material products.
Extract and list the key composition materials from the following technology description. 
Focus exclusively on the raw materials (e.g., PMMA, titanium dioxide, aluminum hydroxide, etc.) used in the product.
Avoid including unnecessary process details; only list the materials.

Technology Description:
{text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag; change it to whichever model is available locally.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def extract_composition_materials(text):
    """Extract the composition materials named in a Technology Description.

    Fills the module-level ``prompt_template`` with ``text`` and the serialized
    ``json_schema``, then asks ``model`` for structured output that follows
    that schema.

    Args:
        text: The technology description to analyse.

    Returns:
        The ``"composition"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        text=text, json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "composition" in parsed:
        return parsed["composition"]
    return None

# Location of the filtered EPD export.
csv_path = "../../data/pipeline2/sql/filtered_epd_data.csv"

number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has a usable description.
composition_extractions = []

for idx, row in df.iterrows():
    tech_description = row.get("Technology Description", "")
    # Skip rows whose description is empty or not a string (e.g. a missing value).
    if not (tech_description and isinstance(tech_description, str)):
        print(f"Row {idx} has no Technology Description.\n{'-'*50}\n")
        continue

    composition = extract_composition_materials(tech_description)
    # Keep the row index so each extraction can be traced back to the CSV.
    composition_extractions.append(
        {
            "row_index": idx,
            "Product Name": row.get("Product Name", ""),
            "Classification": row.get("Classification", ""),
            "Extracted Composition": composition,
        }
    )
    print(f"Extracted composition for row {idx}:\n{composition}\n{'-'*50}\n")

# Create the output directory on first use, then build the target path.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{number_entries}_technology_compositions.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(composition_extractions, f, indent=2)

print(f"All composition extractions saved to {output_file}")

## Do Both Summarization and Composition Material Extraction from `technologyDescriptionAndIncludedProcesses`

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with both a "summary" and a
# "composition" string.
json_schema = dict(
    title="SummarizationAndCompositionResponse",
    description="Response containing a summarized version of the technology description and a list of key composition materials.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description emphasizing key raw materials and its use in construction.",
        },
        "composition": {
            "type": "string",
            "description": "A list of key raw materials mentioned in the technology description, e.g., PMMA, titanium dioxide, aluminum hydroxide.",
        },
    },
    required=["summary", "composition"],
)

# Prompt asking for the summary and the material list in a single call; the
# {text} and {json_schema} placeholders are filled in at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in technical manufacturing processes for building material products.
Given the following technology description, perform two tasks:
1. Summarize the description into a concise paragraph that emphasizes the key raw materials.
2. Extract and list the key composition materials mentioned (e.g., PMMA, titanium dioxide, aluminum hydroxide).

Avoid excessive process details; focus on material composition and intended use.

Technology Description:
{text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag, kept in a variable so other models can be tried
# (for example "falcon3:7b-instruct-q4_K_M").
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def process_technology_description(text):
    """Get a summary and the composition materials for a Technology Description.

    Fills the module-level ``prompt_template`` with ``text`` and the serialized
    ``json_schema``, then asks ``model`` for structured output that follows
    that schema.

    Args:
        text: The technology description to process.

    Returns:
        The parsed response when it contains both ``"summary"`` and
        ``"composition"``; otherwise a dict with both keys set to ``None``.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        text=text, json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed and "composition" in parsed:
        return parsed
    # Fall back to an empty record so callers can always use .get() on the result.
    return {"summary": None, "composition": None}

# Location of the filtered EPD export.
csv_path = os.path.join("..", "..", "data", "pipeline2", "sql", "filtered_epd_data.csv")

number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has a usable description.
results = []

for idx, row in df.iterrows():
    tech_description = row.get("Technology Description", "")
    if tech_description and isinstance(tech_description, str):
        response = process_technology_description(tech_description)
        # Pull the fields out first: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        summary = response.get("summary")
        composition = response.get("composition")
        results.append({
            "row_index": idx,
            "Product Name": row.get("Product Name", ""),
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary,
            "Extracted Composition": composition
        })
        print(f"Summary and material composition for row {idx}:\n{summary}\n{composition}\n{'-'*50}\n")
    else:
        print(f"Row {idx} has no Technology Description.")

# Create the output directory on first use.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)

# Tag the output file with the model name (the part before the ":" tag) so
# runs with different models do not overwrite each other.
model_name = llm_model.split(":")[0]
output_file = os.path.join(output_dir, f"{number_entries}_technology_summary_composition_{model_name}.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"All responses saved to {output_file}")


## Summarize starting with Product Name

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "summary" string.
json_schema = dict(
    title="TechnologySummaryResponse",
    description="Response containing a concise technology summary that begins with the product name.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description that starts with the product name.",
        }
    },
    required=["summary"],
)

# Prompt that combines the Product Name with the Technology Description; the
# {product_name}, {text} and {json_schema} placeholders are filled in at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in technical manufacturing processes for building material products.
Using the information provided, produce a concise technology summary that begins with the product name.
The summary should start with "The product {product_name}" and then describe the manufacturing process and key raw materials.

Technology Description: {text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag, kept in a variable so other models can be tried.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def summarize_technology_description(product_name, text):
    """Generate a summary that begins with the product name.

    Fills the module-level ``prompt_template`` with ``product_name``, ``text``
    and the serialized ``json_schema``, then asks ``model`` for structured
    output that follows that schema.

    Args:
        product_name: Name inserted into the prompt's opening phrase.
        text: The technology description to summarize.

    Returns:
        The ``"summary"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        product_name=product_name,
        text=text,
        json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed:
        return parsed["summary"]
    return None

# Location of the filtered EPD export.
csv_path = os.path.join("..", "..", "data", "pipeline2", "sql", "filtered_epd_data.csv")
number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has both a product name and a usable description.
results = []

for idx, row in df.iterrows():
    product_name = row.get("Product Name", "")
    tech_description = row.get("Technology Description", "")
    # A blank CSV cell is read by pandas as NaN, which is truthy. Test for it
    # explicitly so such rows are reported as missing instead of producing a
    # prompt that starts with "The product nan".
    has_product_name = pd.notna(product_name) and bool(product_name)
    if has_product_name and tech_description and isinstance(tech_description, str):
        summary = summarize_technology_description(product_name, tech_description)
        results.append({
            "row_index": idx,
            "Product Name": product_name,
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary
        })
        print(f"Processed row {idx}\n{product_name}\n\n{summary}\n{'-'*50}\n")
    else:
        print(f"Row {idx} missing product name or technology description.")

# Create the output directory on first use.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)

# Tag the output file with the model name (the part before the ":" tag) so
# runs with different models do not overwrite each other.
model_name = llm_model.split(":")[0]
output_file = os.path.join(output_dir, f"{number_entries}_tech_sum_prodname_{model_name}.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"All summaries saved to {output_file}")

### Iteration 2

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "summary" string.
json_schema = dict(
    title="TechnologySummaryResponse",
    description="Response containing a concise technology summary that begins with the product name.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description that starts with the product name.",
        }
    },
    required=["summary"],
)

# Second prompt iteration: a one-sentence summary opening with the product
# name. The {product_name}, {text} and {json_schema} placeholders are filled in
# at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in technical manufacturing processes for building material products.
Using the information provided, produce a one sentence summary that begins with the product name.
The summary should start with "The product {product_name}" and then describe concisely the product.

Technology Description: {text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag, kept in a variable so other models can be tried.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def summarize_technology_description(product_name, text):
    """Generate a summary that begins with the product name.

    Fills the module-level ``prompt_template`` with ``product_name``, ``text``
    and the serialized ``json_schema``, then asks ``model`` for structured
    output that follows that schema.

    Args:
        product_name: Name inserted into the prompt's opening phrase.
        text: The technology description to summarize.

    Returns:
        The ``"summary"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        product_name=product_name,
        text=text,
        json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed:
        return parsed["summary"]
    return None

# Location of the filtered EPD export.
csv_path = os.path.join("..", "..", "data", "pipeline2", "sql", "filtered_epd_data.csv")
number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has both a product name and a usable description.
results = []

for idx, row in df.iterrows():
    product_name = row.get("Product Name", "")
    tech_description = row.get("Technology Description", "")
    # A blank CSV cell is read by pandas as NaN, which is truthy. Test for it
    # explicitly so such rows are reported as missing instead of producing a
    # prompt that starts with "The product nan".
    has_product_name = pd.notna(product_name) and bool(product_name)
    if has_product_name and tech_description and isinstance(tech_description, str):
        summary = summarize_technology_description(product_name, tech_description)
        results.append({
            "row_index": idx,
            "Product Name": product_name,
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary
        })
        print(f"Processed row {idx}\n{product_name}\n\n{summary}\n{'-'*50}\n")
    else:
        print(f"Row {idx} missing product name or technology description.")

# Create the output directory on first use.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)

# Tag the output file with the model name (the part before the ":" tag).
# NOTE(review): this is the same file name the preceding "Summarize starting
# with Product Name" cell writes to, so running both overwrites the earlier
# results - confirm whether this iteration should get its own suffix.
model_name = llm_model.split(":")[0]
output_file = os.path.join(output_dir, f"{number_entries}_tech_sum_prodname_{model_name}.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"All summaries saved to {output_file}")

## Summarize Technology Descriptions in one Sentence

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "summary" string.
json_schema = dict(
    title="SummarizationResponse",
    description="Response containing a summarized version of the technology description.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description emphasizing key raw materials and its use in construction.",
        }
    },
    required=["summary"],
)

# Prompt asking for a single-sentence summary built from the Technology
# Description alone; the {text} and {json_schema} placeholders are filled in at
# call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in summarizing technical documents of building material products.
Summarize the following technology description into a concise sentence that facilitates the categorization of the building product.

Technology Description:
{text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag; change it to whichever model is available locally.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def summarize_technology_description(text):
    """Summarize a Technology Description with the configured Ollama model.

    Fills the module-level ``prompt_template`` with ``text`` and the serialized
    ``json_schema``, then asks ``model`` for structured output that follows
    that schema.

    Args:
        text: The technology description to summarize.

    Returns:
        The ``"summary"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        text=text, json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed:
        return parsed["summary"]
    return None

# Location of the filtered EPD export.
csv_path = "../../data/pipeline2/sql/filtered_epd_data.csv"

number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has a usable description.
summaries = []

for idx, row in df.iterrows():
    tech_description = row.get("Technology Description", "")
    # Skip rows whose description is empty or not a string (e.g. a missing value).
    if not (tech_description and isinstance(tech_description, str)):
        print(f"Row {idx} has no Technology Description.")
        continue

    summary = summarize_technology_description(tech_description)
    # Keep the row index so each summary can be traced back to the CSV.
    summaries.append(
        {
            "row_index": idx,
            "Product Name": row.get("Product Name", ""),
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary,
        }
    )
    print(f"Summary for row {idx}:\n{summary}\n{'-'*50}\n")

# Create the output directory on first use, then build the target path.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{number_entries}_tech_sum_one_sentence.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(summaries, f, indent=2)

print(f"All summaries saved to {output_file}")


### Iteration 2

In [None]:
import json
import os
import pandas as pd
from langchain_ollama import ChatOllama  # Use your Ollama integration
from langchain_core.prompts import ChatPromptTemplate

# Structured-output contract: the model must reply with one "summary" string.
json_schema = dict(
    title="SummarizationResponse",
    description="Response containing a summarized version of the technology description.",
    type="object",
    properties={
        "summary": {
            "type": "string",
            "description": "A concise summary of the technology description emphasizing key raw materials and its use in construction.",
        }
    },
    required=["summary"],
)

# Second prompt iteration: a single sentence that stresses raw materials and
# use in construction. The {text} and {json_schema} placeholders are filled in
# at call time.
prompt_template = ChatPromptTemplate.from_template(
    """
You are an expert in summarizing technical documents of building material products to facilitate categorization.
Summarize the following technology description into a concise sentence emphasizing key raw materials and its use in construction.

Technology Description:
{text}

Return your answer in JSON format following this schema:
{json_schema}
"""
)

# Ollama model tag; change it to whichever model is available locally.
llm_model = "deepseek-r1:8b"
model = ChatOllama(model=llm_model)

def summarize_technology_description(text):
    """Summarize a Technology Description with the configured Ollama model.

    Fills the module-level ``prompt_template`` with ``text`` and the serialized
    ``json_schema``, then asks ``model`` for structured output that follows
    that schema.

    Args:
        text: The technology description to summarize.

    Returns:
        The ``"summary"`` field of the parsed response, or ``None`` when no
        parsed result is available or the field is missing.
    """
    # Hand the prompt value to the model unchanged. Calling ``.to_string()``
    # first flattens the chat prompt into "Human: ..." text, so the role label
    # would be sent to the model as part of the message content.
    prompt_value = prompt_template.format_prompt(
        text=text, json_schema=json.dumps(json_schema)
    )

    structured_llm = model.with_structured_output(
        json_schema, method="json_schema", include_raw=True
    )
    # With include_raw=True the result is a mapping; "parsed" carries the
    # schema-shaped payload.
    result = structured_llm.invoke(prompt_value)
    parsed = result.get("parsed")
    if parsed and "summary" in parsed:
        return parsed["summary"]
    return None

# Location of the filtered EPD export.
csv_path = "../../data/pipeline2/sql/filtered_epd_data.csv"

number_entries = 100

# Load the CSV and keep only the first `number_entries` rows.
df = pd.read_csv(csv_path).head(number_entries)

# One record per row that has a usable description.
summaries = []

for idx, row in df.iterrows():
    tech_description = row.get("Technology Description", "")
    # Skip rows whose description is empty or not a string (e.g. a missing value).
    if not (tech_description and isinstance(tech_description, str)):
        print(f"Row {idx} has no Technology Description.")
        continue

    summary = summarize_technology_description(tech_description)
    # Keep the row index so each summary can be traced back to the CSV.
    summaries.append(
        {
            "row_index": idx,
            "Product Name": row.get("Product Name", ""),
            "Classification": row.get("Classification", ""),
            "Technology Description Summary": summary,
        }
    )
    print(f"Summary for row {idx}:\n{summary}\n{'-'*50}\n")

# Create the output directory on first use, then build the target path.
output_dir = os.path.join("..", "..", "data", "pipeline2", "json")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"{number_entries}_tech_sum_one_sentence02.json")

# Persist every collected record as one JSON array.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(summaries, f, indent=2)

print(f"All summaries saved to {output_file}")


## Print Character and Token Count

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import tiktoken
import numpy as np
import statistics
import os

# File path for the JSON summaries
json_file_path = os.path.join("..", "..", "data", "pipeline2", "json", "100_technology_summaries.json")

# Check if the file exists
if not os.path.exists(json_file_path):
    print(f"File not found: {json_file_path}")
else:
    # Load JSON data
    with open(json_file_path, "r", encoding="utf-8") as f:
        summaries = json.load(f)

    # Convert JSON list to a DataFrame
    df = pd.DataFrame(summaries)

    # We assume that the summarized text is under the key "Technology Description Summary"
    # If your key is different (e.g., "summary"), adjust accordingly.
    if "Technology Description Summary" not in df.columns:
        print("Key 'Technology Description Summary' not found in JSON data.")
    else:
        # Drop entries whose summary is null before measuring. Converting them
        # with astype(str) would turn them into the text "None" and count each
        # as a 4-character summary, skewing every statistic below.
        summary_texts = df["Technology Description Summary"].dropna().astype(str)
        skipped = len(df) - len(summary_texts)
        if skipped:
            print(f"Skipping {skipped} entries without a summary.")

        # Character count of each remaining summary.
        char_counts = summary_texts.str.len()

        # Token count of each remaining summary, using the o200k_base encoding.
        enc = tiktoken.get_encoding("o200k_base")
        token_series = summary_texts.apply(lambda text: len(enc.encode(text)))

        # Keep the per-row columns on the frame; index alignment leaves NaN in
        # the rows that were skipped.
        df["char_count"] = char_counts
        df["token_count"] = token_series

        # Collect counts into lists.
        lengths = char_counts.tolist()
        token_counts = token_series.tolist()

        # Calculate basic statistics; the guards keep empty input from raising.
        mean_length = statistics.mean(lengths) if lengths else 0
        mean_tokens = statistics.mean(token_counts) if token_counts else 0
        median_length = np.median(lengths) if lengths else 0
        median_tokens = np.median(token_counts) if token_counts else 0
        # Sample standard deviation (ddof=1) needs at least two values.
        std_length = np.std(lengths, ddof=1) if len(lengths) > 1 else 0
        std_tokens = np.std(token_counts, ddof=1) if len(token_counts) > 1 else 0
        percentiles_length = np.percentile(lengths, [25, 50, 75, 90, 95]) if lengths else [0] * 5
        percentiles_tokens = np.percentile(token_counts, [25, 50, 75, 90, 95]) if token_counts else [0] * 5
        # Interquartile range: 75th percentile minus 25th percentile.
        iqr_length = percentiles_length[2] - percentiles_length[0]
        iqr_tokens = percentiles_tokens[2] - percentiles_tokens[0]

        # Print statistical measures.
        print(f"Mean character count: {mean_length:.2f}")
        print(f"Median character count: {median_length:.2f}")
        print(f"Standard deviation (chars): {std_length:.2f}")
        print(f"25th, 50th, 75th, 90th, 95th percentiles (chars): {percentiles_length}")
        print(f"IQR (chars): {iqr_length:.2f}\n")

        print(f"Mean token count: {mean_tokens:.2f}")
        print(f"Median token count: {median_tokens:.2f}")
        print(f"Standard deviation (tokens): {std_tokens:.2f}")
        print(f"25th, 50th, 75th, 90th, 95th percentiles (tokens): {percentiles_tokens}")
        print(f"IQR (tokens): {iqr_tokens:.2f}\n")

        # Plot histogram for character counts.
        plt.figure(figsize=(10, 5))
        plt.hist(lengths, bins=20, edgecolor="black")
        plt.title("Character Count Distribution of Summaries")
        plt.xlabel("Character Count")
        plt.ylabel("Frequency")
        plt.show()

        # Plot histogram for token counts.
        plt.figure(figsize=(10, 5))
        plt.hist(token_counts, bins=20, edgecolor="black")
        plt.title("Token Count Distribution of Summaries")
        plt.xlabel("Token Count")
        plt.ylabel("Frequency")
        plt.show()

        def print_histogram_data(data, bins=20, title="Histogram"):
            """Print the bin ranges and counts of a histogram of ``data``.

            Args:
                data: Values to bin.
                bins: Bin specification passed to ``np.histogram``.
                title: Heading printed before the bin lines.
            """
            counts, bin_edges = np.histogram(data, bins=bins)
            print(title)
            # bin_edges has one more entry than counts; pair neighbouring edges.
            for i in range(len(counts)):
                lower = bin_edges[i]
                upper = bin_edges[i + 1]
                print(f"{lower:5.1f} - {upper:5.1f}: {counts[i]}")

        print_histogram_data(lengths, bins=20, title="Character Count Histogram Data")
        print_histogram_data(token_counts, bins=20, title="Token Count Histogram Data")
