PART A

In [5]:
!pip install pdfplumber
import pdfplumber
import pandas as pd
import re

# Path to your PDF file
pdf_path = "police_reports/police_crime_report_1.pdf"

# Open the PDF and extract text
with pdfplumber.open(pdf_path) as pdf:
    full_text = ""
    
    for page in pdf.pages:
        extracted_text = page.extract_text()
        if extracted_text:
            full_text += extracted_text + "\n"

# Split text into lines
lines = full_text.split("\n")

# Extract key-value pairs dynamically
data = []
pattern = r"^(.*?):\s*(.*)$"  # Regex to capture "Field: Value" pattern
current_field = None
current_value = []

for line in lines:
    line = line.strip()
    
    # Match "Field: Value" pattern
    match = re.match(pattern, line)
    if match:
        # If there's an existing field-value pair, save it
        if current_field:
            data.append([current_field, " ".join(current_value).strip()])
        
        # Start a new field
        current_field, value = match.groups()
        current_value = [value.strip()]
    else:
        # If it's a continuation of the previous field (multi-line value)
        if current_value:
            current_value.append(line.strip())

# Add the last field-value pair to the data list
if current_field:
    data.append([current_field, " ".join(current_value).strip()])

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Field", "Value"])

# **Extract Latitude & Longitude separately**
coord_index = df[df["Field"] == "Coordinates"].index

if not coord_index.empty:
    coord_index = coord_index[0]
    coords = df.at[coord_index, "Value"]
    
    # Extract latitude and longitude using regex
    coord_match = re.match(r"\((-?\d+\.\d+),\s*(-?\d+\.\d+)\)", coords)
    
    if coord_match:
        latitude, longitude = coord_match.groups()
        df.at[coord_index, "Field"] = "Latitude"
        df.at[coord_index, "Value"] = latitude
        df.loc[coord_index + 0.5] = ["Longitude", longitude]  # Insert new row
    
    # Reset index for clean output
    df = df.sort_index().reset_index(drop=True)

# Save to CSV
#df.to_csv("formatted_report.csv", index=False)

# Print table for verification
print(df)


                   Field                                              Value
0          Report Number                                        2024-001240
1            Date & Time                                2013-08-11 18:00:00
2      Reporting Officer                            Officer D. Morgan #6234
3      Incident Location                                200 Block of 5TH ST
4               Latitude                                  37.78091651016261
5              Longitude                                  -122.404100362918
6   Detailed Description  Petty theft from locked auto. Personal belongi...
7        Police District                                           Southern
8             Resolution                                               None
9    Suspect Description                             No suspect identified.
10    Victim Information                                          Anonymous


PART B

In [13]:
!pip install pymupdf
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
import re
from datetime import datetime
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# **Step 1: Extract text from PDF**
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    extracted_text = ""

    for page in doc:
        extracted_text += page.get_text("text") + "\n"

    # Print extracted text for debugging
    print("Extracted PDF Text:\n", extracted_text)
    
    return extracted_text

# **Step 2: Extract relevant data using regex**
def extract_crime_details(text):
    extracted_data = {}

    # Regex patterns to capture relevant fields
    extracted_data["Date & Time"] = re.search(r"Date & Time:\s*([\d-]+\s+\d{2}:\d{2}:\d{2})", text)
    extracted_data["Detailed Description"] = re.search(r"Detailed Description:\s*(.+?)(?=\n[A-Z])", text, re.DOTALL)
    extracted_data["Police District"] = re.search(r"Police District:\s*([A-Za-z\s]+)", text)
    extracted_data["Incident Location"] = re.search(r"Incident Location:\s*(.+)", text)
    extracted_data["Resolution"] = re.search(r"Resolution:\s*(.+)", text)
    #extracted_data["Coordinates"] = re.search(r"Coordinates:\s*\(([-+]?\d*\.\d+),\s*([-+]?\d*\.\d+)\)", text)

    # Extract and clean data
    for key, match in extracted_data.items():
        extracted_data[key] = match.group(1).strip() if match else "Unknown"

   # Extract Coordinates (Latitude & Longitude together)
    coordinates_match = re.search(r"Coordinates:\s*\(([-+]?\d*\.\d+),\s*([-+]?\d*\.\d+)\)", text)
    
    if coordinates_match:
        extracted_data["Latitude"] = coordinates_match.group(1).strip()
        extracted_data["Longitude"] = coordinates_match.group(2).strip()
    else:
        extracted_data["Latitude"] = "Unknown"
        extracted_data["Longitude"] = "Unknown"

    # Extract and clean other fields
    for key, match in extracted_data.items():
        extracted_data[key] = match.strip() if isinstance(match, str) else "Unknown"
        
    # Convert date format and extract day of the week
    if extracted_data["Date & Time"] != "Unknown":
        extracted_data["DayOfWeek"] = datetime.strptime(extracted_data["Date & Time"], "%Y-%m-%d %H:%M:%S").strftime("%A")
    else:
        extracted_data["DayOfWeek"] = "Unknown"

    print("\nExtracted Data:\n", extracted_data)  # Debugging step

    return extracted_data

###########


# Function to assign severity based on category
def assign_severity(category):
    severity_mapping = {
        "NON-CRIMINAL": 1, "SUSPICIOUS OCC": 1, "MISSING PERSON": 1, "RUNAWAY": 1, "RECOVERED VEHICLE": 1,
        "WARRANTS": 2, "OTHER OFFENSES": 2, "VANDALISM": 2, "TRESPASS": 2, "DISORDERLY CONDUCT": 2, "BAD CHECKS": 2,
        "LARCENY/THEFT": 3, "VEHICLE THEFT": 3, "FORGERY/COUNTERFEITING": 3, "DRUG/NARCOTIC": 3,
        "STOLEN PROPERTY": 3, "FRAUD": 3, "BRIBERY": 3, "EMBEZZLEMENT": 3,
        "ROBBERY": 4, "WEAPON LAWS": 4, "BURGLARY": 4, "EXTORTION": 4,
        "KIDNAPPING": 5, "ARSON": 5
    }
    return severity_mapping.get(category, -1)  # Assign -1 if unknown

# Load dataset
df = pd.read_csv("crime_data_with_severity.csv")

# Check for missing values and drop them
df = df.dropna(subset=["Descript", "Category"])

# Extract descriptions and categories
descriptions = df["Descript"].astype(str).tolist()
categories = df["Category"].astype(str).tolist()

# Fit vectorizer on crime descriptions
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(descriptions)

# Save vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")
print("Vectorizer trained and saved successfully!")

# Load trained model
model = joblib.load("crime_category_model.pkl")

# Example: Extracted description from PDF
description = "Petty theft from locked auto. Personal belongings were stolen from a parked vehicle."

# Transform input text
description_vectorized = vectorizer.transform([description])

# Predict category
predicted_category = model.predict([description])[0]

# Assign severity
predicted_severity = assign_severity(predicted_category)

# New crime report entry
def update_csv_with_extracted_data(pdf_path, csv_file):
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # Extract relevant crime details
    extracted_data = extract_crime_details(text)

    # Read the existing CSV file
    existing_df = pd.read_csv(csv_file)

    # Ensure extracted values are properly formatted
    extracted_data["Police District"] = extracted_data["Police District"].split("\n")[0].strip()
    extracted_data["Resolution"] = extracted_data["Resolution"].strip() if extracted_data["Resolution"] else None


    # Create a new row with only matching columns
    new_row = {
    "Dates": extracted_data["Date & Time"],
    "Category": predicted_category,
    "Descript": description,
    "DayOfWeek": extracted_data["DayOfWeek"],
    "PdDistrict": extracted_data["Police District"],
    "Resolution": extracted_data["Resolution"],
    "Address": extracted_data["Incident Location"],
    "Latitude (Y)": extracted_data["Latitude"],
    "Longitude (X)": extracted_data["Longitude"],
    "Severity": predicted_severity
}
    # Append the new row to the dataframe
    #updated_df = existing_df.append(new_row, ignore_index=True)
    updated_df = pd.concat([existing_df, pd.DataFrame([new_row])], ignore_index=True)

    # Save the updated CSV file
    updated_df.to_csv(csv_file, index=False)

    print("✅ Data successfully added to", csv_file)
# **Run the function**
############################## SELECT NEW PDF CRIME REPORT HERE #########################

pdf_file = "police_reports/police_crime_report_4.pdf"  # Change this to your actual PDF file
csv_file = "crime_data_with_severity.csv"  # Your existing CSV file

update_csv_with_extracted_data(pdf_file, csv_file)




Vectorizer trained and saved successfully!
Extracted PDF Text:
 City Police Department 
Official Police Crime Report 
 
Report Number: 
2024-001243 
Date & Time: 
2008-09-20 09:00:00 
Reporting Officer: 
Officer J. Anderson #9567 
Incident Location: 
200 Block of JONES ST 
Coordinates: 
(37.7834687204586, -122.412573643201) 
Detailed Description: 
Burglary of apartment house, unlawful entry. Suspect forced entry 
and stole electronics. 
Police District: 
Tenderloin 
Resolution: 
None 
Suspect Description: 
Unknown suspect, possible forced entry. 
Victim Information: 
Resident reported stolen items. 
 



Extracted Data:
 {'Date & Time': '2008-09-20 09:00:00', 'Detailed Description': 'Burglary of apartment house, unlawful entry. Suspect forced entry \nand stole electronics.', 'Police District': 'Tenderloin \nResolution', 'Incident Location': '200 Block of JONES ST', 'Resolution': 'None', 'Latitude': '37.7834687204586', 'Longitude': '-122.412573643201', 'DayOfWeek': 'Saturday'}


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


✅ Data successfully added to crime_data_with_severity.csv
