# Privacy Policy TL;DR

This notebook uses a transformer-based encoder-decoder model to generate a short
TL;DR (Too Long; Didn't Read) summary of a privacy policy, highlighting key
data collection and data sharing practices.

### Step 1: Import Required Libraries

Hugging Face's `AutoTokenizer` and `AutoModelForSeq2SeqLM` are used directly to avoid
pipeline compatibility issues, and `ipywidgets` is imported to simulate an interactive frontend within the notebook.


In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import tempfile, os
import ipywidgets as widgets
from IPython.display import display, clear_output

### Step 2: Load the Pretrained Summarization Model

A BART encoder–decoder model is used, which is well suited for abstractive
summarization of long documents.



In [2]:
# Hugging Face Hub identifier of the pretrained summarization checkpoint.
MODEL_NAME = "facebook/bart-large-cnn"

# Load the tokenizer and the seq2seq model that belong to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Switch the model to evaluation mode; as the last expression of the cell this
# also makes the notebook print the model architecture.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

### Step 3: Generate TL;DR Summary

The model generates a concise summary highlighting key data collection and
sharing practices. (Backend Function)


In [3]:
def extract_data_collection_sections(text):
    """Return the lines of *text* that mention data collection or sharing.

    The text is split on newlines and every line is checked, case-insensitively,
    against a list of privacy-related keywords. Keywords are matched at the
    start of a word, so "email" still matches "emails" but "face" no longer
    matches "interface" and "voice" no longer matches "invoice". Keywords of
    three characters or fewer ("ip", "gps", "sdk") must match as a whole word
    (an optional plural "s" is allowed), so "ip" does not fire on
    "description" or "shipping".

    Args:
        text: Full privacy-policy text.

    Returns:
        The matching lines, each stripped of surrounding whitespace and joined
        with newlines. An empty string when nothing matches.
    """
    # Local import: the notebook's import cell does not load `re`.
    import re

    keywords = [
        # Core data collection sections
        "information we collect",
        "data we collect",
        "your activity",
        "information you provide",
        "device information",
        "information from partners",
        "third parties", "third party", "third-party",
        "log data",
        "usage data",

        # Personal identifiers
        "email", "e-mail",
        "phone", "telephone", "mobile", "contact number",
        "name", "full name", "username",
        "password",

        # Financial & transactional
        "credit card", "debit card",
        "payment", "billing",
        "transaction", "purchase",
        "bank account", "financial information",

        # Location & network tracking
        "location", "precise location",
        "gps", "geolocation",
        "ip address", "ip",
        "wifi", "bluetooth", "cell tower",

        # Device & cross-device tracking
        "device", "device id",
        "advertising id", "persistent identifier",
        "unique identifier",
        "fingerprinting", "browser fingerprint",

        # Cookies & tracking tech
        "cookies",
        "tracking",
        "tracking pixel",
        "sdk",
        "analytics",

        # Biometric & media
        "biometric",
        "face", "facial recognition",
        "voice", "audio",
        "camera",
        "photos", "videos",

        # Communications & social graph
        "messages", "chat",
        "communications",
        "contacts", "address book",
        "call logs",

        # Behaviour & profiling
        "browsing history",
        "search history",
        "interaction data",
        "usage patterns",
        "profiling",
        "inferred",
        "preferences",

        # Advertising & monetization
        "advertising",
        "personalized advertising",
        "ad targeting",
        "marketing",
        "promotions",

        # Data sharing & selling
        "data sharing",
        "data transfer",
        "sell",
        "sale of data",
        "share",
        "disclose",
        "vendors",
        "partners",
        "advertisers",

        # Legal / government sharing
        "law enforcement",
        "government authorities",
        "legal requests",
        "court order",

        # Children & sensitive groups
        "children",
        "minors",
        "under the age",

        # Account & identity
        "account information",
        "profile information",
        "authentication",
    ]

    # Short acronyms are the main source of accidental substring hits, so they
    # are matched as whole words; longer keywords only need to start a word so
    # that inflected forms ("emails", "sharing partners") keep matching.
    short_keywords = [k for k in keywords if len(k) <= 3]
    long_keywords = [k for k in keywords if len(k) > 3]
    keyword_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(k) for k in long_keywords) + r")"
        r"|\b(?:" + "|".join(re.escape(k) for k in short_keywords) + r")s?\b"
    )

    return "\n".join(
        para.strip()
        for para in text.split("\n")
        if keyword_pattern.search(para.lower())
    )

def generate_summary(text):
    """Generate a TL;DR summary of the data practices in a privacy policy.

    The policy is first reduced to the lines that mention data collection or
    sharing (falling back to the full text when nothing matches), prefixed
    with an instruction sentence, tokenized (truncated to 1024 tokens) and
    summarized with beam search.

    Args:
        text: Full privacy-policy text.

    Returns:
        The decoded summary string with special tokens removed.
    """
    filtered_text = extract_data_collection_sections(text)

    if not filtered_text.strip():
        filtered_text = text

    # NOTE(review): the checkpoint loaded in this notebook is a summarization
    # model, not an instruction-following one, so this sentence is presumably
    # treated as part of the document rather than as a directive - confirm it
    # actually improves the output.
    prompt = (
        "Summarize the types of personal data that are collected, tracked, "
        "stored, or shared with third parties in the following privacy policy.\n\n"
        + filtered_text
    )

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    ).to(model.device)  # keep inputs on the same device as the model

    # Scale the length bounds with the input so that a short policy is not
    # forced into a 260-token summary (which invites repetition or invented
    # content). Inputs of 520+ tokens keep the original 260/320 bounds.
    input_length = inputs["input_ids"].shape[1]
    min_summary_length = min(260, input_length // 2)
    max_summary_length = min(320, max(input_length, min_summary_length + 1))

    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_length,
        min_length=min_summary_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def classify_privacy_risk(summary):
    """Classify a policy summary as low, moderate or high privacy risk.

    Each keyword found in the summary contributes once to a score: 3 points
    for high-risk terms, 2 for medium-risk terms and 1 for low-risk terms.
    Keywords are matched case-insensitively at the start of a word, so
    "invoice" does not count as "voice" and "counselling" does not count as
    "sell". Common spelling variants ("third parties", "third-party",
    "e-mail", "telephone") are folded onto the forms used in the keyword
    lists before matching.

    Args:
        summary: Summary text to score.

    Returns:
        "High Privacy Risk" for a score of 10 or more, "Moderate Privacy
        Risk" for a score of 4 to 9, otherwise "Low Privacy Risk".
    """
    # Local import: the notebook's import cell does not load `re`.
    import re

    normalized = summary.lower()
    # Fold variants so that each concept is counted once, under one spelling.
    normalized = re.sub(r"third[- ]part(?:y|ies)", "third party", normalized)
    normalized = normalized.replace("e-mail", "email")
    normalized = normalized.replace("telephone", "phone")

    high_risk_keywords = [
        "credit card", "debit card", "bank account",
        "payment", "billing", "transaction",
        "biometric", "facial recognition", "voice",
        "fingerprint",
        "precise location", "gps", "geolocation",
        "advertising id", "device id",
        "sell", "sale of data"
    ]
    medium_risk_keywords = [
        "email", "phone", "contact number",
        "username", "profile information",
        "messages", "chats", "communications",
        "photos", "videos",
        "browsing history", "search history",
        "ip address", "cookies",
        "third party", "partners", "vendors",
        "advertising", "ad targeting"
    ]

    low_risk_keywords = [
        "analytics", "usage data",
        "log data", "interaction data"
    ]

    def count_hits(keywords):
        # A keyword counts once, however often it occurs in the summary.
        return sum(
            1 for k in keywords
            if re.search(r"\b" + re.escape(k), normalized)
        )

    score = (
        3 * count_hits(high_risk_keywords)
        + 2 * count_hits(medium_risk_keywords)
        + 1 * count_hits(low_risk_keywords)
    )

    if score >= 10:
        return "High Privacy Risk"
    elif score >= 4:
        return "Moderate Privacy Risk"
    else:
        return "Low Privacy Risk"



### Step 4: Build an Interactive Frontend

In this step, an interactive user interface is created using ipywidgets. A text area allows the user to paste a privacy policy, while a button enables the user to trigger the analysis and get the required summary.

In [None]:
# Free-text area where the user pastes the privacy policy.
policy_input = widgets.Textarea(
    placeholder="Paste privacy policy text here or upload a .txt document below using 'Upload' (TEXT ONLY)...",
    layout=widgets.Layout(width="100%", height="220px")
)

# Alternative input: a single .txt file upload.
file_upload = widgets.FileUpload(
    accept=".txt",
    multiple=False
)

# Label shown next to the upload button; updated by the upload observer.
upload_status = widgets.Label("No file uploaded.")

# Button that triggers summary generation and risk classification.
analyze_button = widgets.Button(
    description="Generate Summary",
    button_style="success"
)

# Button that resets the text area, the upload and the output.
clear_button = widgets.Button(
    description="Clear Input",
    button_style="warning"
)

# Output widget that captures everything printed by the button callbacks.
output_area = widgets.Output()


### Step 5: Analysis

In this step, the button callbacks are defined: clicking "Generate Summary" produces the summary and then runs a simple keyword-based analysis of it to state the privacy risk.

In [None]:
def update_upload_status(change):
    """Refresh the status label whenever the upload widget's value changes.

    Args:
        change: Change notification passed by the widget observer (unused;
            the current widget value is read directly).
    """
    has_file = bool(file_upload.value)
    upload_status.value = (
        "Text file uploaded." if has_file else "No file uploaded."
    )


# Keep the label in sync with the upload widget.
file_upload.observe(update_upload_status, names="value")

def on_analyze_clicked(b):
    """Summarize the supplied policy and print its privacy-risk assessment.

    The pasted text is used unless a file has been uploaded, in which case
    the file's content takes precedence. All output is written to
    ``output_area``.

    Args:
        b: The button that was clicked (unused).
    """
    with output_area:
        clear_output()

        text = policy_input.value.strip()

        uploads = file_upload.value
        if uploads:
            # ipywidgets 7 exposes uploads as a dict keyed by filename, while
            # ipywidgets 8 exposes a tuple of per-file dicts; support both.
            if isinstance(uploads, dict):
                uploaded = next(iter(uploads.values()))
            else:
                uploaded = uploads[0]
            # bytes() also accepts the memoryview used by ipywidgets 8.
            raw_content = bytes(uploaded["content"])
            # Strip so a whitespace-only file is caught by the guard below.
            text = raw_content.decode("utf-8", errors="ignore").strip()

        if not text:
            print("Please paste text or upload a .txt file.")
            return

        print("Generating summary...\n")

        summary = generate_summary(text)
        risk = classify_privacy_risk(summary)

        print("Summary:\n")
        print(summary)
        print("\n" + "-" * 60)
        print(f"Privacy Risk Assessment: {risk}")


def on_clear_clicked(b):
    """Reset the text area, the uploaded file, the status label and the output.

    Args:
        b: The button that was clicked (unused).
    """
    policy_input.value = ""

    uploads = file_upload.value
    if isinstance(uploads, dict):
        # ipywidgets 7: the value is a dict that is emptied in place.
        uploads.clear()
    else:
        # ipywidgets 8: the value is a tuple, which has no clear(); assign
        # an empty tuple instead.
        file_upload.value = ()

    upload_status.value = "No file uploaded."
    with output_area:
        clear_output()


# Register the click callbacks on their buttons.
analyze_button.on_click(on_analyze_clicked)
clear_button.on_click(on_clear_clicked)


### Step 6: Launch the Interface

In this final step, all frontend components are displayed together to launch the interactive application. The user can now paste a privacy policy, generate a summary, and immediately view the corresponding privacy risk assessment.

In [None]:
# Assemble the interface top to bottom: text input, upload row, button row,
# then the output area, and render it in the notebook.
upload_row = widgets.HBox([file_upload, upload_status])
button_row = widgets.HBox([analyze_button, clear_button])
app_layout = widgets.VBox([policy_input, upload_row, button_row, output_area])

display(app_layout)
