<a href="https://colab.research.google.com/github/esmaeelalshikh-sys/Text-Anonymization-App/blob/main/Text_Anonymization_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install streamlit pyngrok transformers faker python-docx PyPDF2

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting faker
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   

In [5]:
app_code = """
import streamlit as st
from transformers import pipeline
from faker import Faker
from docx import Document
from PyPDF2 import PdfReader
import io
import os

# --- 1. Key Improvement: Caching ---
# This function loads the model only once and keeps it in memory.
@st.cache_resource
def get_ner_pipeline():
    '''Build the Hugging Face NER pipeline once and reuse it on later reruns.

    The ``st.cache_resource`` decorator keeps the returned pipeline object, so
    the model is loaded a single time instead of on every script rerun.

    Returns:
        A ``transformers`` "ner" pipeline for the CoNLL-03 English BERT model
        whose results are grouped into whole entities.
    '''
    print("Loading AI model...")
    return pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        # ``grouped_entities=True`` is deprecated; "simple" is the aggregation
        # strategy it mapped to, so the result format stays the same.
        aggregation_strategy="simple",
    )

# We can also cache the Faker instance.
@st.cache_resource
def get_faker():
    '''Return the Faker generator; the resource cache keeps a single instance.'''
    generator = Faker()
    return generator

# --- Session State Management ---
# Using None instead of "" makes conditions clearer.
def reset_anonymized_state():
    '''Forget any previous anonymization result (None means "nothing to show").'''
    st.session_state["anonymized_text"] = None

# First run of a session: create the entry so later reads never fail.
if "anonymized_text" not in st.session_state:
    reset_anonymized_state()

# --- File Reading Functions ---
def load_text_from_docx(docx_file):
    '''Collect the text of every paragraph in a Word document.

    Args:
        docx_file: Passed straight to ``Document``; the app hands in the
            uploaded file object.

    Returns:
        The paragraph texts joined with a newline between paragraphs.
    '''
    document = Document(docx_file)
    # Double backslash on purpose: this code lives inside the app_code string.
    return "\\n".join(paragraph.text for paragraph in document.paragraphs)

def load_text_from_pdf(pdf_file):
    '''Collect the text of a PDF, page by page.

    Pages for which ``extract_text()`` returns nothing (a falsy value) are
    left out.

    Args:
        pdf_file: Passed straight to ``PdfReader``; the app hands in the
            uploaded file object.

    Returns:
        The remaining page texts joined with a newline between pages.
    '''
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Double backslash on purpose: this code lives inside the app_code string.
    return "\\n".join(chunk for chunk in page_texts if chunk)

# --- Anonymization Function (modified to accept the model as an argument) ---
def anonymize_text(text, ner_pipeline, faker_instance, entity_types=('PER', 'ORG', 'LOC')):
    '''Replace named entities in ``text`` with fake values.

    Args:
        text: The string to anonymize.
        ner_pipeline: Callable that receives the text and returns an iterable
            of dicts with ``entity_group``, ``start`` and ``end`` entries.
        faker_instance: Object providing ``name()``, ``company()`` and
            ``city()``.
        entity_types: Entity groups to replace. 'PER', 'ORG' and 'LOC' get a
            fake name, company and city; any other selected group is masked
            with asterisks.

    Returns:
        ``text`` with the selected entities replaced. Every mention of the
        same entity (same group and same spelling) gets the same replacement,
        so references stay consistent within the document.
    '''
    if not text:
        # Nothing to scan, and the model is never called on an empty string.
        return text

    faker_methods = {'PER': 'name', 'ORG': 'company', 'LOC': 'city'}

    # An entity without character offsets cannot be located in the text.
    located = [
        ent for ent in ner_pipeline(text)
        if ent.get('start') is not None and ent.get('end') is not None
    ]

    replacements = {}  # (entity group, original mention) -> replacement
    new_text = text
    # Work right-to-left so the offsets of earlier entities stay valid.
    for ent in sorted(located, key=lambda e: e['start'], reverse=True):
        ent_type = ent['entity_group']
        if ent_type not in entity_types:
            continue

        start, end = ent['start'], ent['end']
        cache_key = (ent_type, text[start:end])
        if cache_key not in replacements:
            method_name = faker_methods.get(ent_type)
            if method_name is not None:
                replacements[cache_key] = getattr(faker_instance, method_name)()
            else:
                # Mask exactly as many characters as the span being replaced.
                replacements[cache_key] = '*' * (end - start)

        new_text = new_text[:start] + replacements[cache_key] + new_text[end:]

    return new_text

# --- Application Interface ---
st.title("Text Anonymization App")

# --- 2. Calling the Cached Functions ---
ner_pipeline = get_ner_pipeline()
fake = get_faker()

DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

uploaded_file = st.file_uploader(
    "Choose a file (txt, docx, pdf)",
    type=["txt", "docx", "pdf"],
    on_change=reset_anonymized_state
)

if uploaded_file is not None:
    original_file_name = uploaded_file.name
    # Dispatch on the file extension (already restricted by the uploader)
    # rather than on the MIME type, which is reported by the browser and may
    # not match the exact strings expected for each format.
    file_name_without_ext, file_ext = os.path.splitext(original_file_name)
    file_ext = file_ext.lower()

    # Read the file based on its type
    if file_ext == ".txt":
        # getvalue() does not depend on the read position; bytes that are not
        # valid UTF-8 are replaced instead of crashing the app.
        raw_text = uploaded_file.getvalue().decode("utf-8", errors="replace")
    elif file_ext == ".docx":
        raw_text = load_text_from_docx(uploaded_file)
    elif file_ext == ".pdf":
        raw_text = load_text_from_pdf(uploaded_file)
    else:
        raw_text = ""

    st.subheader("Extracted Text from File:")
    # Widgets need a non-empty label; it is hidden because the subheader
    # above already names the field.
    text_area = st.text_area(
        "Extracted text",
        raw_text,
        height=250,
        key="raw_text",
        label_visibility="collapsed"
    )

    st.subheader("Anonymization Options")
    entity_options_map = {
        'PER': 'Names (Person)',
        'ORG': 'Organizations (Organization)',
        'LOC': 'Locations (Location)',
        'MISC': 'Miscellaneous Entities (Miscellaneous)'
    }
    selected_entity_types = st.multiselect(
        'Select entity types to anonymize:',
        options=list(entity_options_map.keys()),
        format_func=lambda x: entity_options_map[x],
        default=['PER', 'ORG', 'LOC']
    )

    if st.button("Anonymize Text", type="primary"):
        if text_area.strip():
            with st.spinner('Processing text...'):
                # 3. Pass the cached pipeline and faker instances to the function
                st.session_state.anonymized_text = anonymize_text(text_area, ner_pipeline, fake, selected_entity_types)
        else:
            st.warning("There is no text to anonymize.")

    if st.session_state.anonymized_text is not None:
        st.subheader("Anonymized Text:")
        # No key here: reusing "anonymized_text" as the widget key would make
        # the widget and the manually assigned session value share one state
        # entry. The returned value still carries any edits made by the user.
        final_text = st.text_area(
            "Anonymized text",
            st.session_state.anonymized_text,
            height=250,
            label_visibility="collapsed"
        )

        st.subheader("Download Anonymized File")

        if file_ext == ".docx":
            new_doc = Document()
            for para in final_text.split('\\n'):
                new_doc.add_paragraph(para)
            bio = io.BytesIO()
            new_doc.save(bio)
            st.download_button(
                label="Download New File (docx)",
                data=bio.getvalue(),
                file_name=f"anonymized_{original_file_name}",
                mime=DOCX_MIME
            )
        else:
            # Text and PDF uploads are both returned as plain text.
            st.download_button(
                label="Download New File (txt)",
                data=final_text.encode("utf-8"),
                file_name=f"anonymized_{file_name_without_ext}.txt",
                mime="text/plain"
            )
"""

# Write the app source held in app_code to app.py in the working directory.
with open("app.py", mode="w", encoding="utf-8") as app_file:
    app_file.write(app_code)
print("✅ Successfully created the correct and fast app.py file.")

✅ Successfully created the correct and fast app.py file.


In [6]:
from google.colab import userdata
from pyngrok import ngrok
import os
import threading

import subprocess
import sys

# Use userdata to fetch the secret token correctly
ngrok_auth_token = userdata.get('NGROK_AUTH_TOKEN')

if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)

    # Make the cell safe to re-run: stop the server and close the tunnels
    # left over from a previous run before starting new ones.
    previous_process = globals().get("streamlit_process")
    if previous_process is not None and previous_process.poll() is None:
        previous_process.terminate()
        try:
            previous_process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            previous_process.kill()
    for open_tunnel in ngrok.get_tunnels():
        ngrok.disconnect(open_tunnel.public_url)

    # Popen returns immediately, so no helper thread is needed. Headless mode
    # keeps Streamlit from waiting for interactive input or opening a browser,
    # and the port is fixed so it always matches the tunnel below.
    streamlit_process = subprocess.Popen([
        sys.executable, "-m", "streamlit", "run", "app.py",
        "--server.port", "8501",
        "--server.headless", "true",
    ])

    # Open an ngrok tunnel to port 8501 on the local host
    public_url = ngrok.connect("8501", proto="http")
    # Print the URL itself rather than the NgrokTunnel object's repr.
    print(f"App URL: {public_url.public_url}")
else:
    print("Cannot start ngrok tunnel without the authentication token. Please add it to Colab secrets.")

App URL: NgrokTunnel: "https://f9cfc0dcf76a.ngrok-free.app" -> "http://localhost:8501"
