<a href="https://colab.research.google.com/github/eteitelbaum/code-satp/blob/Fall-2024/web-scraping-and-inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Scraping**

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Calendar number of each month, keyed by its lowercase three-letter abbreviation.
_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}


def _month_number(month, months):
    """Return the two-digit calendar number for ``month`` (e.g. "Mar" -> "03")."""
    number = _MONTH_NUMBERS.get(str(month).strip().lower()[:3])
    if number is None:
        # Unrecognised label: fall back to the position in the caller's list,
        # which is what this function used for every month previously.
        number = months.index(month) + 1
    return f"{number:02}"


def scrape_satp_data(base_url, years, months):
    """Scrape incident summaries from SATP monthly pages.

    One page is requested per (year, month) pair at ``{base_url}-{month}-{year}``.
    Pages that cannot be fetched, or whose number of date cells differs from the
    number of incident blocks, are reported and skipped.

    Args:
        base_url: URL prefix of the monthly pages.
        years: Iterable of years (strings such as "2017", or ints).
        months: List of month names as used in the URL (e.g. "Jan"). The
            calendar number is derived from the name itself, so the list may be
            any subset of months in any order.

    Returns:
        A tuple ``(dataframe, count)``: a DataFrame with the columns
        ``Incident_Number``, ``Date`` and ``Incident_Summary``, and the number
        of rows scraped.
    """
    columns = ["Incident_Number", "Date", "Incident_Summary"]
    data = []
    for year in years:
        year = str(year)  # allows int years; slicing below needs a string
        for month in months:
            url = f"{base_url}-{month}-{year}"
            print(f"Scraping: {url}")
            try:
                # A timeout keeps one stalled connection from hanging the run.
                response = requests.get(url, timeout=30)
            except requests.RequestException as exc:
                print(f"Failed to fetch data for {month}: {exc}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch data for {month}: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract incident details and dates
            coverpage_news = soup.find_all('div', class_='more')  # Incidents
            coverpage_date = soup.find_all('td', style="width: 15%;")  # Dates

            # The two lists are paired positionally, so they must be the same length
            if len(coverpage_news) != len(coverpage_date):
                print(f"Warning: Mismatch in dates ({len(coverpage_date)}) and incidents ({len(coverpage_news)}) for {month}.")
                continue

            # Derived from the month name, not from its position in `months`
            month_number = _month_number(month, months)

            # Per-date running counter (the "nn" part of the incident number)
            incidents_by_date = {}

            for date, incident in zip(coverpage_date, coverpage_news):
                # Clean and format the incident summary
                incident_summary = incident.get_text().strip()
                incident_summary = re.sub(r"\s+", " ", incident_summary)  # Collapse whitespace
                incident_summary = incident_summary.replace("Read less...", "")  # Remove "Read less..."

                # The day is taken as the text after the last '-' in the date cell
                raw_date = date.get_text().strip()
                day = raw_date.split('-')[-1].strip().zfill(2)
                formatted_date = f"{year}-{month_number}-{day}"

                incidents_by_date[formatted_date] = incidents_by_date.get(formatted_date, 0) + 1

                # Incident number in I + mmddyynn format
                nn = f"{incidents_by_date[formatted_date]:02}"
                incident_number = f"I{month_number}{day}{year[-2:]}{nn}"

                data.append({
                    "Incident_Number": incident_number,
                    "Date": formatted_date,
                    "Incident_Summary": incident_summary
                })

    # Explicit columns so an empty scrape still yields the expected schema
    return pd.DataFrame(data, columns=columns), len(data)

In [None]:

# Step 3: Main function
def scrape_save(years, months):
    """Scrape the SATP Maoist-insurgency pages and report how many incidents were found.

    Args:
        years: Years to scrape, passed through to ``scrape_satp_data``.
        months: Month names to scrape, passed through to ``scrape_satp_data``.

    Returns:
        The DataFrame of scraped incidents returned by ``scrape_satp_data``.
    """
    base_url = "https://www.satp.org/terrorist-activity/india-maoistinsurgency"
    # Scrape data
    satp_data, total_incidents = scrape_satp_data(base_url, years, months)
    print(f"Total Incidents Scraped: {total_incidents}")
    # Save to Google Sheets (not wired up in this notebook):
    # save_to_google_sheets(satp_data, "SATP_Data", "raw_zone_incident_summaries")
    return satp_data


years = ["2017", "2018"]
months = ["Jan", "Feb"]
#months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Keep the result in a module-level name: the inference cells further down
# read the global `satp_data`.
satp_data = scrape_save(years, months)


Scraping: https://www.satp.org/terrorist-activity/india-maoistinsurgency-Jan-2017
Scraping: https://www.satp.org/terrorist-activity/india-maoistinsurgency-Feb-2017
Scraping: https://www.satp.org/terrorist-activity/india-maoistinsurgency-Jan-2018
Scraping: https://www.satp.org/terrorist-activity/india-maoistinsurgency-Feb-2018
Total Incidents Scraped: 484
No new incidents found to upload.


# **Inference**

In [None]:
# Streamlit for creating web apps
import streamlit as st

# Web scraping
import requests
from bs4 import BeautifulSoup
import re

# Data manipulation
import pandas as pd
import numpy as np

# NLP with Transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Google Sheets API
import gspread
from google.oauth2.service_account import Credentials

# Utility modules
import time
import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt


# Pick CUDA when torch reports it as available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

In [None]:
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#                                                                       infer_perpetrator
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Perpetrator classifier: model weights and tokenizer are read from the same directory
perpetrator_model_path = "perpetrator/distilBert"  # Update with your actual path
perpetrator_model = AutoModelForSequenceClassification.from_pretrained(
    perpetrator_model_path
).to(device)  # move the model onto the selected device right after loading
perpetrator_tokenizer = AutoTokenizer.from_pretrained(perpetrator_model_path)

def infer_perpetrator(summary):
    """Classify who carried out the incident described in ``summary``.

    Args:
        summary: The incident summary text.
    Returns:
        A one-entry dictionary ``{'perpetrator': label}`` where label is
        'Security', 'Maoist' or 'Unknown'.
    """
    id_to_label = {0: 'Security', 1: 'Maoist', 2: 'Unknown'}

    encoded = perpetrator_tokenizer(summary, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = perpetrator_model(**encoded).logits

    # Highest-scoring class; ids outside the map are reported as "Unknown"
    class_id = torch.argmax(logits, dim=1).item()
    return {'perpetrator': id_to_label.get(class_id, "Unknown")}


In [None]:

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#                                                                       inference_action_type
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Action-type classifier: tokenizer and model weights are read from the same directory
action_model_path = "action_type/distilbert_model"
action_tokenizer = AutoTokenizer.from_pretrained(action_model_path)
action_model = AutoModelForSequenceClassification.from_pretrained(
    action_model_path
).to(device)  # move the model onto the selected device right after loading


def inference_action_type(summary):
    """
    Predicts which action types apply to an incident summary.
    Args:
        summary: The incident summary text.
    Returns:
        A dictionary keyed by action type label; each value is 1 when the
        sigmoid of that label's logit exceeds 0.5, otherwise 0.
    """
    action_labels = [
        'action_armed_assault',
        'action_arrest',
        'action_bombing',
        'action_infrastructure',
        'action_surrender',
        'action_seizure',
        'action_abduction',
    ]
    cutoff = 0.5

    encoded = action_tokenizer(summary, padding=True, truncation=True, return_tensors="pt").to(device)

    # Each label is scored independently, hence sigmoid rather than softmax
    with torch.no_grad():
        probabilities = torch.sigmoid(action_model(**encoded).logits)

    # Threshold, drop the size-1 dimensions, and convert to a NumPy int array
    flags = (probabilities > cutoff).squeeze().cpu().numpy().astype(int)

    return {label: flag for label, flag in zip(action_labels, flags)}


In [None]:

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#                                                                       inference_target_type
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Target-type classifier: tokenizer and model weights are read from the same directory
target_model_path = "target_type/distilBert"
target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
target_model = AutoModelForSequenceClassification.from_pretrained(
    target_model_path
).to(device)  # move the model onto the selected device right after loading


def inference_target_type(summary):
    """
    Predicts which target types apply to an incident summary.
    Args:
        summary: The incident summary text.
    Returns:
        A dictionary keyed by target type label; each value is 1 when the
        sigmoid of that label's logit exceeds 0.5, otherwise 0.
    """
    target_labels = [
        'target_civilians',
        'target_maoist',
        'target_no_target',
        'target_security',
        'target_government',
    ]
    cutoff = 0.5

    encoded = target_tokenizer(summary, padding=True, truncation=True, return_tensors="pt").to(device)

    # Each label is scored independently, hence sigmoid rather than softmax
    with torch.no_grad():
        probabilities = torch.sigmoid(target_model(**encoded).logits)

    # Threshold, drop the size-1 dimensions, and convert to a NumPy int array
    flags = (probabilities > cutoff).squeeze().cpu().numpy().astype(int)

    return {label: flag for label, flag in zip(target_labels, flags)}



In [None]:

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#                                                                       predict_counts
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Tokenizer and T5 model for the count questions, both read from one saved directory
total_num_model_path = 'total_injuries-arrests-surrenders-fatalities-abducted/t5small_finetuned_model'
total_num_tokenizer = T5Tokenizer.from_pretrained(total_num_model_path)
total_num_model = T5ForConditionalGeneration.from_pretrained(
    total_num_model_path
).to(device)  # move the model onto the selected device right after loading

def extract_number(text):
    """Return the first stand-alone run of digits in ``text`` as an int, or 0 if there is none."""
    # \b on both sides: digits glued to letters (e.g. "abc12") do not count
    found = re.search(r'\b\d+\b', text)
    return int(found.group()) if found else 0

def predict_counts(incident_summary):
    """Ask the count model one question per casualty/outcome category.

    Args:
        incident_summary: The incident summary text used as the question context.
    Returns:
        A dictionary mapping ``total_injuries``, ``total_arrests``,
        ``total_surrenders``, ``total_fatalities`` and ``total_abducted`` to the
        integer parsed from the model's answer (0 when the answer has no number).
    """
    # Output column -> question put to the model (insertion order is the query order)
    prompts = {
        "total_injuries": "How many injuries occurred in the incident?",
        "total_arrests": "How many arrests were made in the incident?",
        "total_surrenders": "How many people surrendered in the incident?",
        "total_fatalities": "How many fatalities occurred in the incident?",
        "total_abducted": "How many people were abducted in the incident?",
    }

    counts = {}
    for label, question in prompts.items():
        prompt = f"question: {question} context: {incident_summary}"
        token_ids = total_num_tokenizer.encode(
            prompt, return_tensors='pt', max_length=512, truncation=True
        ).to(device)
        generated = total_num_model.generate(token_ids)
        answer = total_num_tokenizer.decode(generated[0], skip_special_tokens=True)
        counts[label] = extract_number(answer)
    return counts


In [None]:

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#                                                                       predict_damage
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Property-damage extractor: T5 model and tokenizer are read from the same directory
damage_model_path = 'damage_details_extraction/t5base_finetuned_model'
damage_model = T5ForConditionalGeneration.from_pretrained(
    damage_model_path
).to(device)  # move the model onto the selected device right after loading
damage_tokenizer = T5Tokenizer.from_pretrained(damage_model_path)


def predict_damage(summary):
    """Generate the property-damage description for an incident summary.

    Args:
        summary: The incident summary text.
    Returns:
        A one-entry dictionary ``{'value_property_damage': text}`` holding the
        decoded model output.
    """
    prompt = f"Extract the property damage value from the incident: {summary}"
    encoded = damage_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    token_ids = encoded.input_ids.to(device)

    # Beam search over 4 beams, output capped at 64 tokens
    generated = damage_model.generate(token_ids, max_length=64, num_beams=4, early_stopping=True)

    return {
        'value_property_damage': damage_tokenizer.decode(generated[0], skip_special_tokens=True)
    }



In [None]:

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#                                                                       get_location_details
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Location extractor: T5 model and tokenizer are read from the same directory
location_model_path = 'location_context_extraction/t5base_finetuned_model'
location_model = T5ForConditionalGeneration.from_pretrained(
    location_model_path
).to(device)  # move the model onto the selected device right after loading
location_tokenizer = T5Tokenizer.from_pretrained(location_model_path)


# Generic place-type words stripped from the extracted location before geocoding.
_LOCATION_STOPWORDS = ("police", "station", "dam", "river", "rivers", "forests", "forest")  # Add more words here
# Whole-word match only, so names that merely contain a stopword (e.g. "damoh") survive.
_LOCATION_STOPWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in _LOCATION_STOPWORDS) + r")\b"
)

# Administrative levels, ordered from most to least specific.
_LOCATION_LEVELS = ('town_village', 'subdistrict', 'district', 'state')


def _clean_location_text(locations):
    """Lowercase ``locations``, drop stand-alone stopwords and tidy the leftover spacing."""
    cleaned = _LOCATION_STOPWORD_PATTERN.sub("", locations.lower())
    cleaned = re.sub(r"\s+", " ", cleaned)
    return re.sub(r"\s+,", ",", cleaned).strip()


def _parse_geocode_result(result):
    """Map one geocoding result's address components onto the four location levels."""
    parsed = dict.fromkeys(_LOCATION_LEVELS)
    for component in result['address_components']:
        types = component['types']
        name = component['long_name']
        if 'administrative_area_level_1' in types:
            parsed['state'] = name
        elif 'administrative_area_level_2' in types:
            parsed['district'] = name
        elif 'administrative_area_level_3' in types:
            parsed['subdistrict'] = name
        elif 'locality' in types:
            parsed['town_village'] = name
        elif 'sublocality' in types and not parsed['town_village']:
            # A sublocality only fills in when no locality has been seen yet
            parsed['town_village'] = name
    return parsed


# Updated function to get location details including latitude and longitude
def get_location_details(summary):
    """Extract the incident location from a summary and geocode it.

    The location model generates a location string from ``summary``; generic
    words such as "police" or "forest" are removed, and the remainder is sent
    to the Google Geocoding API restricted to India. Among the returned
    results, the first one at the most specific level available (town/village,
    then subdistrict, district, state) is used.

    Args:
        summary: The incident summary text.
    Returns:
        ``None`` when the HTTP request fails or returns a non-200 status.
        Otherwise a dictionary with the keys ``Extracted_Locations``, ``state``,
        ``district``, ``subdistrict``, ``town_village``, ``latitude``,
        ``longitude`` and ``location_Level``. Levels more specific than the one
        found are ``None``; when the API status is not ``OK`` every field except
        ``Extracted_Locations`` and ``location_Level`` is ``None``.
    """
    # Prepare the input text
    input_text = f"Extract the location of the incident: {summary}"
    input_ids = location_tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).input_ids
    input_ids = input_ids.to(device)

    # Generate predictions
    outputs = location_model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)

    # Decode the output and strip generic place-type words
    locations = _clean_location_text(
        location_tokenizer.decode(outputs[0], skip_special_tokens=True)
    )

    # Google Maps API key
    API_KEY = st.secrets["googlemapsAPI"]
    GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json"

    params = {
        'address': locations,
        'key': API_KEY,
        'components': 'country:IN'
    }
    try:
        # A timeout keeps one stalled request from hanging the whole run
        response = requests.get(GEOCODE_URL, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Error in API call: {exc}")
        return None
    if response.status_code != 200:
        print(f"Error in API call: {response.status_code}")
        return None

    # Same key order for every outcome, so DataFrame columns stay consistent
    details = {
        'Extracted_Locations': locations,
        'state': None,
        'district': None,
        'subdistrict': None,
        'town_village': None,
        'latitude': None,
        'longitude': None,
        'location_Level': None,
    }

    data = response.json()
    if data['status'] != 'OK':
        print(f"Geocoding API error: {data['status']}")
        details['location_Level'] = "API couldn't find the Extracted_Locations"
        return details

    # Keep the first result at the most specific level seen so far
    best = None  # (rank in _LOCATION_LEVELS, parsed components, lat/lng dict)
    for result in data.get('results', []):
        parsed = _parse_geocode_result(result)
        rank = next(
            (i for i, level in enumerate(_LOCATION_LEVELS) if parsed[level]),
            None,
        )
        if rank is None or (best is not None and rank >= best[0]):
            continue
        best = (rank, parsed, result['geometry']['location'])
        if rank == 0:
            # Nothing is more specific than town/village
            break

    if best is not None:
        rank, parsed, coordinates = best
        # Only the found level and the broader ones are reported
        for level in _LOCATION_LEVELS[rank:]:
            details[level] = parsed[level]
        details['latitude'] = coordinates['lat']
        details['longitude'] = coordinates['lng']
        details['location_Level'] = _LOCATION_LEVELS[rank]

    return details

In [None]:
def update_dataframe(df, return_details, index):
    """Write each entry of ``return_details`` into one row of ``df``, in place.

    Args:
        df: The pandas DataFrame to update.
        return_details: A dictionary mapping column names to the values to store.
        index: The index label of the row to update.
    """
    for column in return_details:
        df.at[index, column] = return_details[column]

In [None]:


import time
import datetime
from tqdm import tqdm

def process_dataframe(df, model_inference_function_name, task_description):
    """
    Runs one inference function over every row's 'Incident_Summary' and stores the results in the DataFrame.

    Args:
        df: The input DataFrame; it is updated in place.
        model_inference_function_name: The callable to apply to each summary (e.g. get_location_details). It is called directly, so pass the function itself.
        task_description: A string describing the task, used in the progress bar and the timing messages.
    """
    started_at = time.time()

    progress = tqdm(df.iterrows(), total=len(df), desc=f"Processing rows for {task_description}")
    for row_index, row in progress:
        details = model_inference_function_name(row['Incident_Summary'])
        if not details:
            # None or an empty dict: nothing to write for this row
            continue
        update_dataframe(df, details, row_index)

    elapsed = datetime.timedelta(seconds=time.time() - started_at)
    print(f"Total time taken for {task_description}: {str(elapsed)}")
    print(f"{task_description} Completed  and added to the DataFrame.")




In [None]:
# Preview the scraped incidents: as the only expression in this notebook cell,
# its value is rendered as the cell output.
satp_data

In [None]:

# Run every extraction stage over the scraped incidents, in this fixed order
if satp_data is not None:
    print("Data fetched")

    extraction_stages = [
        (infer_perpetrator, "Perpetrator Extraction"),
        (inference_action_type, "Action Type Extraction"),
        (inference_target_type, "Target Type Extraction"),
        (get_location_details, "Location Extraction"),
        (predict_counts, "Total Injuries, Arrests, Surrenders, Fatalities, Abducted Extraction"),
        (predict_damage, "Damage Extraction"),
    ]
    for stage_function, stage_label in extraction_stages:
        process_dataframe(satp_data, stage_function, task_description=stage_label)
else:
    # Nothing to process
    print("No data")
    exit(1)



Data fetched from Google Sheets.


Processing rows for Perpetrator Extraction: 100%|██████████| 15/15 [00:15<00:00,  1.04s/it]


Total time taken for Perpetrator Extraction: 0:00:15.624175
Perpetrator Extraction Completed  and added to the DataFrame.


Processing rows for Action Type Extraction: 100%|██████████| 15/15 [00:17<00:00,  1.16s/it]


Total time taken for Action Type Extraction: 0:00:17.368592
Action Type Extraction Completed  and added to the DataFrame.


Processing rows for Target Type Extraction: 100%|██████████| 15/15 [00:16<00:00,  1.13s/it]


Total time taken for Target Type Extraction: 0:00:16.960599
Target Type Extraction Completed  and added to the DataFrame.


Processing rows for Location Extraction: 100%|██████████| 15/15 [01:10<00:00,  4.68s/it]


Total time taken for Location Extraction: 0:01:10.133095
Location Extraction Completed  and added to the DataFrame.


Processing rows for Total Injuries, Arrests, Surrenders, Fatalities, Abducted Extraction: 100%|██████████| 15/15 [00:33<00:00,  2.26s/it]


Total time taken for Total Injuries, Arrests, Surrenders, Fatalities, Abducted Extraction: 0:00:33.930252
Total Injuries, Arrests, Surrenders, Fatalities, Abducted Extraction Completed  and added to the DataFrame.


Processing rows for Damage Extraction: 100%|██████████| 15/15 [01:19<00:00,  5.27s/it]

Total time taken for Damage Extraction: 0:01:19.082245
Damage Extraction Completed  and added to the DataFrame.



