In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer
import torch
import re
from collections import defaultdict
from tqdm import tqdm
import sys
import logging

# Logging has to be configured before the device report below is emitted
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Raise the interpreter recursion ceiling (kept as a safety margin)
sys.setrecursionlimit(10000)

# Select the compute device once, then report which one was picked
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    logger.warning("Using CPU - GPU not available")

# Build the NER and zero-shot pipelines. With CUDA available they are placed
# on GPU 0 (via the `device` argument) in float16; otherwise on CPU in float32.
try:
    _gpu_ready = torch.cuda.is_available()
    _pipeline_device = 0 if _gpu_ready else -1
    _pipeline_dtype = torch.float16 if _gpu_ready else torch.float32

    # The tokenizer is shared by the NER pipeline and by chunk_text()
    tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

    ner_model = pipeline(
        "ner",
        model="dslim/bert-base-NER",
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=_pipeline_device,
        batch_size=8,
        torch_dtype=_pipeline_dtype,
    )

    zero_shot_model = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=_pipeline_device,
        torch_dtype=_pipeline_dtype,
    )

    # Explicit move of the underlying models onto the GPU as well
    if _gpu_ready:
        for _pipe in (ner_model, zero_shot_model):
            _pipe.model = _pipe.model.to('cuda')

except Exception as e:
    # Log for visibility, then let the failure propagate
    logger.error(f"Model loading failed: {e}")
    raise

# Optimized keyword fallback
# Maps each event label to a set of lowercase trigger keywords. The keys are
# also passed to the zero-shot classifier as its candidate labels in
# detect_events(); the keyword sets are consulted there only when the
# zero-shot step produced no label.
EVENT_FALLBACK = {
    "terrorism": {"attack", "terror", "bomb", "isis"},
    "sports": {"match", "tournament", "score", "goal"},
    "politics": {"election", "minister", "government"},
    "accident": {"crash", "collision", "died", "killed"}
}

def chunk_text(text, max_tokens=400):
    """Yield consecutive pieces of *text* of at most *max_tokens* tokens each.

    The text is tokenized with the module-level ``tokenizer``; every window
    of tokens is converted back to a string before being yielded.
    """
    pieces = tokenizer.tokenize(text)
    window_starts = range(0, len(pieces), max_tokens)
    yield from (
        tokenizer.convert_tokens_to_string(pieces[begin:begin + max_tokens])
        for begin in window_starts
    )

def extract_tags_safely(text):
    """Run NER over *text* and group the recognised entities by type.

    The input is stringified, stripped of every character that is neither a
    word character nor whitespace, truncated to 10,000 characters and then
    processed chunk by chunk (see ``chunk_text``).

    Entity words that start with ``##`` are treated as continuations: the
    prefix is removed and the remainder is appended to the entity that
    precedes them in the same chunk (they are dropped when there is none).

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        A ``defaultdict(list)`` whose keys are the lower-cased entity group
        followed by ``"s"`` (later cells read ``"locs"`` and ``"pers"``).
        Each value holds the entity strings in first-seen order, without
        duplicates and without entries of fewer than two characters.
        If the NER step raises, the failure is logged as a warning and
        whatever was collected so far is returned.
    """
    text = re.sub(r'[^\w\s]', '', str(text))[:10000]  # Clean and truncate text
    tags = defaultdict(list)

    try:
        # Process in chunks
        for chunk in chunk_text(text):
            if torch.cuda.is_available():
                # torch.autocast(device_type="cuda") is the supported spelling
                # of the deprecated torch.cuda.amp.autocast()
                with torch.autocast(device_type="cuda"):
                    entities = ner_model(chunk)
            else:
                entities = ner_model(chunk)

            # State used to stitch "##" continuation pieces back together
            current_entity = None
            reconstructed_text = ""

            for entity in entities:
                word = entity["word"]

                # Continuation piece: glue onto the entity being built
                if word.startswith("##"):
                    if current_entity:
                        reconstructed_text += word[2:]  # Remove ## prefix
                    continue

                # A new entity starts: flush the one built so far
                if current_entity and reconstructed_text:
                    tags[current_entity["entity_group"].lower() + "s"].append(reconstructed_text)

                current_entity = entity
                reconstructed_text = word

            # Flush the last entity of this chunk
            if current_entity and reconstructed_text:
                tags[current_entity["entity_group"].lower() + "s"].append(reconstructed_text)

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure
        logger.warning(f"NER failed for text: {e}")

    # Post-processing to clean results
    for key in tags:
        # Drop single-character and blank entries
        kept = [x for x in tags[key] if len(x) > 1 and x.strip()]
        # dict.fromkeys removes duplicates while keeping first-seen order
        tags[key] = list(dict.fromkeys(kept))

    return tags

def detect_events(text):
    """Label *text* with event types: zero-shot first, keywords as fallback.

    The first 1,000 characters of the lower-cased text are classified against
    the keys of ``EVENT_FALLBACK`` (multi-label); every label scoring above
    0.65 is kept. If that yields nothing (or the classifier raises, which is
    logged as a warning), each event type whose keyword list matches the full
    lower-cased text is returned instead.

    Keywords are matched at the start of a word, so "attack" still matches
    "attacked" while "isis" no longer matches "crisis" and "died" no longer
    matches "studied".

    Args:
        text: The text to label; converted with ``str()`` before use.

    Returns:
        A list of event-type labels (possibly empty).
    """
    text_lower = str(text).lower()
    events = []

    def _classify():
        # Single definition of the pipeline call shared by both device paths
        return zero_shot_model(
            text_lower[:1000],  # Truncate for zero-shot
            candidate_labels=list(EVENT_FALLBACK.keys()),
            multi_label=True
        )

    # Try zero-shot first
    try:
        if torch.cuda.is_available():
            # torch.autocast(device_type="cuda") is the supported spelling of
            # the deprecated torch.cuda.amp.autocast()
            with torch.autocast(device_type="cuda"):
                zs_result = _classify()
        else:
            zs_result = _classify()

        events.extend([
            label for label, score in zip(zs_result["labels"], zs_result["scores"])
            if score > 0.65
        ])
    except Exception as e:
        logger.warning(f"Zero-shot failed: {e}")

    # Keyword fallback if no events detected
    if not events:
        for event_type, keywords in EVENT_FALLBACK.items():
            if not keywords:
                continue  # an empty alternation would match everywhere
            # \b anchors each keyword to the beginning of a word
            pattern = r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + ")"
            if re.search(pattern, text_lower):
                events.append(event_type)

    return events

def process_row(row):
    """Build the tag dictionary for one article row.

    The title and summary are joined with ". " (a missing/NaN column is left
    out instead of contributing the literal text "nan"), run through
    ``extract_tags_safely`` and ``detect_events``, and returned as a plain
    dict of de-duplicated lists.

    Args:
        row: Mapping-like row providing ``title_english`` and
            ``summary_english``.

    Returns:
        ``dict[str, list]`` with the NER tag lists plus an ``"events"`` list.
        On any failure the error is logged and a dict of empty lists is
        returned so one bad row does not stop the run.
    """
    try:
        parts = [
            str(row[column])
            for column in ("title_english", "summary_english")
            if not pd.isna(row[column])
        ]
        text = ". ".join(parts)
        tags = extract_tags_safely(text)
        tags["events"] = detect_events(text)
        # dict.fromkeys de-duplicates like set() but keeps a stable order, so
        # the saved CSV is reproducible between runs
        return {k: list(dict.fromkeys(v)) for k, v in tags.items()}
    except Exception as e:
        logger.error(f"Failed processing row: {e}")
        # Key names follow the "<entity group>s" scheme used by
        # extract_tags_safely ("locs"/"pers" are what the later cells read)
        return {"locs": [], "pers": [], "orgs": [], "events": []}

# Main execution: tag every article in the input CSV and save the result
if __name__ == "__main__":
    try:
        df = pd.read_csv("/content/final_translated_news.csv")
        tqdm.pandas(desc="Tagging articles")
        # One tag dictionary per row, stored in a new "tags" column
        df["tags"] = df.progress_apply(process_row, axis=1)
        df.to_csv("tagged_news_safe.csv", index=False)
        logger.info("Successfully processed %d articles", len(df))
    except Exception as e:
        # exc_info=True keeps the traceback in the log; without it only the
        # message survives and the failing step cannot be located
        logger.critical("Fatal error: %s", e, exc_info=True)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Device set to use cuda:0
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():  # Mixed precision for GPU
Tagging articles: 100%|██████████| 225/225 [00:26<00:00,  8.35it/s]


In [None]:
!pip install pandas rapidfuzz pycountry geonamescache


Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting geonamescache
  Downloading geonamescache-2.0.0-py3-none-any.whl.metadata (3.2 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geonamescache-2.0.0-py3-none-any.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geonamescache, rapidfuzz, pycountry
Successfully installed geonamescache-2.0.0 

In [None]:
import pandas as pd
import ast
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

# Load CSV
# Reads the file that the tagging cell saves with df.to_csv("tagged_news_safe.csv");
# its "tags" column holds each tag dictionary in string form.
df = pd.read_csv("tagged_news_safe.csv")

# Parse the string dict safely
def safe_literal_eval(s):
    """Parse *s* as a Python literal, returning ``{}`` when it cannot be parsed.

    Args:
        s: Normally the string form of a tag dictionary; any other value
            (for example a NaN cell) is treated as unparseable.

    Returns:
        The evaluated literal, or an empty dict on failure. The result is not
        forced to be a dict; callers check the type themselves.
    """
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input; catching
        # only ValueError/SyntaxError let e.g. "{[1]: 2}" abort the whole apply
        return {}

# Turn every "tags" cell back into a Python object; cells that cannot be
# parsed become {} (see safe_literal_eval)
df['tags'] = df['tags'].apply(safe_literal_eval)

# Setup geocoder with timeout and error handling
# (the instance is used as a module-level global by safe_geocode below)
geolocator = Nominatim(user_agent="location_cleaner", timeout=10)

def safe_geocode(loc, max_retries=3):
    """Look up *loc* with the module-level ``geolocator``.

    Up to *max_retries* attempts are made. Timeouts and service errors are
    printed and retried after a two-second pause; any other exception is
    printed and ends the lookup immediately.

    Returns:
        The last three comma-separated components of the matched address,
        whitespace-stripped and joined with ", ", or ``None`` when nothing
        matched or every attempt failed.
    """
    for _attempt in range(max_retries):
        try:
            # Pause before each request (Nominatim's 1 request per second policy)
            time.sleep(1.1)
            hit = geolocator.geocode(loc, exactly_one=True, language='en')
            if not hit:
                return None
            trailing = hit.address.split(",")[-3:]
            return ", ".join(map(str.strip, trailing))
        except (GeocoderTimedOut, GeocoderServiceError) as err:
            print(f"Geocoding error for '{loc}': {str(err)}")
            time.sleep(2)  # back off longer before the next attempt
        except Exception as err:
            print(f"Unexpected error for '{loc}': {str(err)}")
            return None
    return None

def clean_locs(locs_list):
    """Geocode every usable entry of *locs_list* and return the unique results.

    Anything that is not a non-empty list yields ``[]``. Entries that are not
    strings or are blank are skipped; the rest are stripped and passed to
    ``safe_geocode``. Results keep first-seen order, and lookups that return
    nothing are left out.
    """
    if not (isinstance(locs_list, list) and locs_list):
        return []

    resolved = []
    candidates = (
        item.strip()
        for item in locs_list
        if isinstance(item, str) and item.strip()
    )
    for name in candidates:
        normalized = safe_geocode(name)
        if normalized and normalized not in resolved:
            resolved.append(normalized)
    return resolved

# Replace the 'locs' list of every parsed tag dictionary with its geocoded form
for row_label, row_tags in df['tags'].items():
    if isinstance(row_tags, dict):
        if 'locs' in row_tags:
            row_tags['locs'] = clean_locs(row_tags['locs'])
        df.at[row_label, 'tags'] = row_tags

# Dictionaries go back to their string form for CSV storage
df['tags'] = df['tags'].apply(str)

# Write the updated table and confirm
df.to_csv("tagged_news_safe_updated.csv", index=False)
print("✅ 'locs' updated in 'tags' and saved to 'tagged_news_safe_updated.csv'")

✅ 'locs' updated in 'tags' and saved to 'tagged_news_safe_updated.csv'


In [None]:
!pip install gender-guesser

Collecting gender-guesser
  Downloading gender_guesser-0.4.0-py2.py3-none-any.whl.metadata (3.0 kB)
Downloading gender_guesser-0.4.0-py2.py3-none-any.whl (379 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/379.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m266.2/379.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m379.3/379.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gender-guesser
Successfully installed gender-guesser-0.4.0


In [None]:
import pandas as pd
import gender_guesser.detector as gender

# Read the geocoded, tagged articles
df = pd.read_csv("/content/tagged_news_safe_updated.csv")

# Gender detector used by the gender-bias section
gender_detector = gender.Detector()

# Keep only the rows of one media agency; .copy() makes the subset independent
# of df so columns can be added to it later
agency_name = "KerelaKaumudi"
agency_mask = df["MediaAgency"].eq(agency_name)
agency_df = df.loc[agency_mask].copy()

# --- 1. Region Bias Score ---
def extract_locations(tags_str):
    """Return the location components stored under ``'locs'`` in *tags_str*.

    *tags_str* is the string form of a tag dictionary. It is parsed with
    ``ast.literal_eval`` so that names containing apostrophes or brackets
    survive intact (the previous split-based parsing left stray double quotes
    on names such as "Cox's Bazar"). As before, every entry is then split at
    commas, so "Kerala, 682035, India" contributes three components.

    If the string cannot be evaluated as a literal, the original split-based
    parsing is used as a best-effort fallback.

    Args:
        tags_str: String form of the tag dictionary; non-strings and strings
            without a ``'locs':`` key yield ``[]``.

    Returns:
        A flat list of non-empty, stripped location components.
    """
    import ast  # local import: this cell does not import ast itself

    if not isinstance(tags_str, str) or "'locs':" not in tags_str:
        return []

    try:
        parsed = ast.literal_eval(tags_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, dict) and isinstance(parsed.get("locs"), list):
        entries = [str(entry) for entry in parsed["locs"]]
        strip_chars = None  # values are already unquoted: trim whitespace only
    else:
        # Legacy parsing: take the text between "'locs':" and the next "]"
        legacy = tags_str.split("'locs':")[1].split("]")[0]
        entries = [legacy.replace("[", "").replace("]", "")]
        strip_chars = " '"

    return [
        part.strip(strip_chars)
        for entry in entries
        for part in entry.split(",")
        if part.strip(strip_chars)
    ]

from collections import Counter  # local to this cell; used for the tally below

agency_df["locations"] = agency_df["tags"].apply(extract_locations)
all_locations = [loc for sublist in agency_df["locations"] for loc in sublist]

# Region score = 1 - (share of the most-mentioned region). It is 0 when every
# mention names the same region and approaches 1 as mentions spread out.
# (This is a dominance measure, not an entropy.)
# NOTE(review): this score rises with diversity, whereas the gender and
# demography scores rise with imbalance - confirm the intended direction
# before averaging them into one composite.
# Counter tallies in one pass; most_common() orders the breakdown by count.
region_counts = dict(Counter(all_locations).most_common())
unique_regions = list(region_counts)
total_region_mentions = sum(region_counts.values())
region_bias_score = 1 - max(region_counts.values()) / total_region_mentions if total_region_mentions > 0 else 0

# --- Print Region Calculation Data ---
print("\n=== Region Bias Calculation ===")
print(f"Total Locations Mentioned: {total_region_mentions}")
print("Breakdown by Region:")
for region, count in region_counts.items():
    print(f"- {region}: {count} mentions")
print(f"Region Bias Score: {region_bias_score:.2f}")

# --- 2. Gender Bias Score (Using gender-guesser) ---
def extract_gender_mentions(tags_str):
    """Return the person names stored under ``'pers'`` in *tags_str*.

    *tags_str* is the string form of a tag dictionary. It is parsed with
    ``ast.literal_eval`` so each stored name comes back exactly as saved
    (the previous split-based parsing left stray double quotes on names that
    contain an apostrophe and cut names at commas). If the string cannot be
    evaluated as a literal, the original split-based parsing is used as a
    best-effort fallback.

    Args:
        tags_str: String form of the tag dictionary; non-strings and strings
            without a ``'pers':`` key yield ``[]``.

    Returns:
        A list of non-empty, stripped person names.
    """
    import ast  # local import: this cell does not import ast itself

    if not isinstance(tags_str, str) or "'pers':" not in tags_str:
        return []

    try:
        parsed = ast.literal_eval(tags_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, dict) and isinstance(parsed.get("pers"), list):
        names = (str(entry).strip() for entry in parsed["pers"])
        return [name for name in names if name]

    # Legacy parsing: take the text between "'pers':" and the next "]"
    pers_part = tags_str.split("'pers':")[1].split("]")[0]
    pieces = pers_part.replace("[", "").replace("]", "").split(",")
    return [p.strip(" '") for p in pieces if p.strip(" '")]

agency_df["persons"] = agency_df["tags"].apply(extract_gender_mentions)
all_persons = [person for sublist in agency_df["persons"] for person in sublist]

# Estimate gender using gender-guesser, based on the first word of each name
male_mentions = 0
female_mentions = 0
unknown_gender = 0

gender_data = []  # (person, label) pairs for the detailed printout

for person in all_persons:
    # split() without arguments tolerates any whitespace and cannot raise on
    # a blank name, unlike indexing the result unconditionally
    name_parts = person.split()
    first_name = name_parts[0] if name_parts else person
    gender_guess = gender_detector.get_gender(first_name)

    if gender_guess in ["male", "mostly_male"]:
        male_mentions += 1
        gender_data.append((person, "Male"))
    elif gender_guess in ["female", "mostly_female"]:
        female_mentions += 1
        gender_data.append((person, "Female"))
    else:
        unknown_gender += 1
        gender_data.append((person, "Unknown"))

# Only classified names enter the score: 0 = balanced, 1 = a single gender
total_gender_mentions = male_mentions + female_mentions
gender_bias_score = abs(male_mentions - female_mentions) / total_gender_mentions if total_gender_mentions > 0 else 0

# --- Print Gender Calculation Data ---
print("\n=== Gender Bias Calculation ===")
print(f"Total Persons Mentioned: {len(all_persons)}")
print(f"- Male: {male_mentions}")
print(f"- Female: {female_mentions}")
print(f"- Unknown/Unclassified: {unknown_gender}")
print("Gender Classification Details:")
# The loop variable must not be called "gender": that name is the imported
# gender_guesser module alias and would be overwritten by a string
for person, gender_label in gender_data:
    print(f"- {person}: {gender_label}")
print(f"Gender Bias Score: {gender_bias_score:.2f}")

# --- 3. Demography Bias (Urban vs. Rural) ---
# Keyword lists used to classify each extracted location string: a location
# counts as urban (or rural) when any keyword of the respective list occurs
# inside it as a substring. A location may match both lists.
urban_keywords = [
    # Major Kerala Cities/Towns
    "Kochi", "Thiruvananthapuram", "Kozhikode", "Thrissur", "Kollam",
    "Alappuzha", "Kannur", "Kottayam", "Palakkad", "Malappuram",
    "Ernakulam", "Trivandrum", "Calicut", "Tellicherry",

    # Malayalam Urban Terms
    "nagaram", "purasabha", "mahanagaram", "town", "borough",
    "corporation", "municipal area",

    # Commercial Hubs
    "business district", "commercial street", "CBD", "market area",
    "chalai", "shopping complex", "high street",

    # Infrastructure
    "metro", "flyover", "mall", "apartment", "skyscraper",
    "technopark", "infopark", "SEZ", "industrial estate",

    # Global Cities
    "Mumbai", "Delhi", "Bangalore", "Dubai", "Singapore"
]
rural_keywords = [
    # Kerala Village Terms
    "gramam", "ooru", "kudumbashree", "panchayat", "kudi",
    "tharavadu", "kaavu", "paddy field", "kole lands", "kandal",

    # Geographic Features
    "kunnu", "puzha", "kadavu", "kayal", "kadu", "thodu", "padam", "nilam",

    # Rural Economy
    "karshaka", "krishi", "karshika", "thozhil", "pokkali",
    "coir", "fishery", "toddy shop", "agrarian",

    # Cultural Terms
    "kettukazhcha", "pooram", "padayani", "theyyam", "vayanashala",

    # Generic Rural Terms
    "countryside", "hamlet", "remote", "tribal", "farmland"
]

# Compare case-insensitively: the keyword lists mix lowercase terms
# ("technopark", "town", "metro") with capitalised names, while extracted
# place names are normally capitalised, so a case-sensitive substring test
# never matched most of the lowercase keywords.
_urban_terms = [keyword.casefold() for keyword in urban_keywords]
_rural_terms = [keyword.casefold() for keyword in rural_keywords]
_folded_locations = [loc.casefold() for loc in all_locations]

# A location counts once per category when any keyword occurs inside it
urban_mentions = sum(1 for loc in _folded_locations if any(term in loc for term in _urban_terms))
rural_mentions = sum(1 for loc in _folded_locations if any(term in loc for term in _rural_terms))
total_demo_mentions = urban_mentions + rural_mentions

# 0 = urban and rural mentions balanced, 1 = only one of the two present
demography_bias_score = abs(urban_mentions - rural_mentions) / total_demo_mentions if total_demo_mentions > 0 else 0

# --- Print Demography Calculation Data ---
print("\n=== Demography Bias Calculation ===")
print(f"Total Location Mentions: {total_demo_mentions}")
print(f"- Urban: {urban_mentions}")
print(f"- Rural: {rural_mentions}")
print(f"Demography Bias Score: {demography_bias_score:.2f}")

# --- Composite Coverage Bias Score ---
# Unweighted mean of the three component scores
# NOTE(review): region_bias_score is computed as 1 - (largest region share),
# i.e. it grows with diversity, while the other two grow with imbalance -
# confirm that averaging them as-is is intended.
_components = (
    ("Region", region_bias_score),
    ("Gender", gender_bias_score),
    ("Demography", demography_bias_score),
)
coverage_bias_score = (region_bias_score + gender_bias_score + demography_bias_score) / 3

# --- Final Results ---
print("\n=== Final Coverage Bias Score ===")
print(f"Overall Coverage Bias Score: {coverage_bias_score:.2f}")
for _label, _value in _components:
    print(f"- {_label} Bias: {_value:.2f}")


=== Region Bias Calculation ===
Total Locations Mentioned: 147
Breakdown by Region:
- 682035: 2 mentions
- Ruvuma Region: 1 mentions
- 192126: 1 mentions
- Uttar Pradesh: 2 mentions
- Kerala: 21 mentions
- Pakistan: 4 mentions
- 695521: 1 mentions
- 682001: 1 mentions
- Delhi: 5 mentions
- 79601: 1 mentions
- Northeast: 1 mentions
- Alappuzha: 2 mentions
- Rajasthan: 2 mentions
- Goa: 1 mentions
- 678102: 1 mentions
- 678001: 1 mentions
- 190001: 2 mentions
- Bihar: 2 mentions
- United Arab Emirates: 4 mentions
- Estonia: 1 mentions
- Canada: 1 mentions
- 673001: 3 mentions
- 47960: 1 mentions
- Abu Dhabi Emirate: 2 mentions
- India: 46 mentions
- 110006: 4 mentions
- 800001: 1 mentions
- Southern Highlands Zone: 1 mentions
- Tanzania: 1 mentions
- 94000: 1 mentions
- 190017: 1 mentions
- Barmer: 2 mentions
- Rapla County: 1 mentions
- Abu Dhabi: 2 mentions
- Czechia: 1 mentions
- 695001: 7 mentions
- Indiana: 1 mentions
- 471 29: 1 mentions
- Chandigarh: 1 mentions
- United States: 1