# **LEVEL 4** 💻

### 🔷 PART A : PDF Text Extraction and Data Processing

In [3]:
# Install pdfplumber for PDF text extraction
!pip install pdfplumber
import pdfplumber       # Library for PDF extraction
import pandas as pd     # Library for handling data in DataFrame
import re       # Regular expressions for pattern matching



Choose the PDF desired to get extracted in the below cell code.

In [4]:
# Define the path to the PDF file
pdf_path = "police_reports/police_crime_report_1.pdf"


In [5]:
# Open the PDF using pdfplumber and extract text from all pages
with pdfplumber.open(pdf_path) as pdf:
    full_text = ""
    
    for page in pdf.pages:      # Iterate through each page of the PDF
        extracted_text = page.extract_text()        # Extract text from the page
        if extracted_text:          # Check if text is extracted
            full_text += extracted_text + "\n"      # Append text to full_text

# Split the extractetd text into individual lines
lines = full_text.split("\n")

# Create a list to store field-value pairs
data = []
pattern = r"^(.*?):\s*(.*)$"  # Regex to capture "Field: Value" pattern
current_field = None
current_value = []

# Loop through each line to find field-value pairs
for line in lines:
    line = line.strip()            # Remove leading/trailing whitespace
    
    # Check if the line matches the "Field: Value" pattern
    match = re.match(pattern, line)
    if match:
        # If there's an existing field-value pair, save it to data
        if current_field:
            data.append([current_field, " ".join(current_value).strip()])
        
        # Start a new field-value pair
        current_field, value = match.groups()
        current_value = [value.strip()]
    else:
        # If it's a continuation of the previous field (multi-line value)
        if current_value:
            current_value.append(line.strip())

# After the loop, Add the last field-value pair to the data list
if current_field:
    data.append([current_field, " ".join(current_value).strip()])

# Convert the list of field-value pairs into a DataFrame
df = pd.DataFrame(data, columns=["Field", "Value"])

# Extract Latitude & Longitude from the "Coordinates" fields
coord_index = df[df["Field"] == "Coordinates"].index

# If coordinates are found, separate them into latitude and longitude
if not coord_index.empty:
    coord_index = coord_index[0]
    coords = df.at[coord_index, "Value"]
    
    # Extract latitude and longitude using regex
    coord_match = re.match(r"\((-?\d+\.\d+),\s*(-?\d+\.\d+)\)", coords)
    
    if coord_match:
        latitude, longitude = coord_match.groups()
        df.at[coord_index, "Field"] = "Latitude"        # Replace "Coordinates" with "Latitude"
        df.at[coord_index, "Value"] = latitude
        df.loc[coord_index + 0.5] = ["Longitude", longitude]  # Insert new row for Longitude
    
    # Reset the DataFrame and reset the index for clean output
    df = df.sort_index().reset_index(drop=True)


# Print the DataFrame for verification
print(df)


                   Field                                              Value
0          Report Number                                        2024-001240
1            Date & Time                                2013-08-11 18:00:00
2      Reporting Officer                            Officer D. Morgan #6234
3      Incident Location                                200 Block of 5TH ST
4               Latitude                                  37.78091651016261
5              Longitude                                  -122.404100362918
6   Detailed Description  Petty theft from locked auto. Personal belongi...
7        Police District                                           Southern
8             Resolution                                               None
9    Suspect Description                             No suspect identified.
10    Victim Information                                          Anonymous


### 🔷 PART B : PDF Text Extraction, Crime Categorization, and Severity Assignment

In [6]:
# Install pymupdf for PDF text extraction
!pip install pymupdf

import fitz  # PyMuPDF for PDF extraction
import pandas as pd        # Library for handling data in DataFrame
import re       # Regular expressions for pattern matching
from datetime import datetime       # For handling date/time conversion
import joblib   # For saving and loading models
from sklearn.feature_extraction.text import TfidfVectorizer         # For text vectorization

# **Step 1: Extract text from PDF**
def extract_text_from_pdf(pdf_path):        # Open the PDF file
    doc = fitz.open(pdf_path)
    extracted_text = ""

#loop through each page and extract text
    for page in doc:
        extracted_text += page.get_text("text") + "\n"

    # Print extracted text for debugging
    print("Extracted PDF Text:\n", extracted_text)
    
    return extracted_text

# **Step 2: Extract relevant crime details from the text using regex**
def extract_crime_details(text):
    extracted_data = {}

    # Regex patterns to capture relevant fields
    extracted_data["Date & Time"] = re.search(r"Date & Time:\s*([\d-]+\s+\d{2}:\d{2}:\d{2})", text)
    extracted_data["Detailed Description"] = re.search(r"Detailed Description:\s*(.+?)(?=\n[A-Z])", text, re.DOTALL)
    extracted_data["Police District"] = re.search(r"Police District:\s*([A-Za-z\s]+)", text)
    extracted_data["Incident Location"] = re.search(r"Incident Location:\s*(.+)", text)
    extracted_data["Resolution"] = re.search(r"Resolution:\s*(.+)", text)

    # Extract and clean data
    for key, match in extracted_data.items():
        extracted_data[key] = match.group(1).strip() if match else "Unknown"

   # Extract Coordinates (Latitude & Longitude together)
    coordinates_match = re.search(r"Coordinates:\s*\(([-+]?\d*\.\d+),\s*([-+]?\d*\.\d+)\)", text)
    
    if coordinates_match:
        extracted_data["Latitude"] = coordinates_match.group(1).strip()
        extracted_data["Longitude"] = coordinates_match.group(2).strip()
    else:
        extracted_data["Latitude"] = "Unknown"
        extracted_data["Longitude"] = "Unknown"

    # Extract and clean other fields
    for key, match in extracted_data.items():
        extracted_data[key] = match.strip() if isinstance(match, str) else "Unknown"
        
    # Convert date format and extract day of the week
    if extracted_data["Date & Time"] != "Unknown":
        extracted_data["DayOfWeek"] = datetime.strptime(extracted_data["Date & Time"], "%Y-%m-%d %H:%M:%S").strftime("%A")
    else:
        extracted_data["DayOfWeek"] = "Unknown"

    print("\nExtracted Data:\n", extracted_data)  # Debugging step

    return extracted_data


# Function to assign severity based on category
def assign_severity(category):
    severity_mapping = {
        "NON-CRIMINAL": 1, "SUSPICIOUS OCC": 1, "MISSING PERSON": 1, "RUNAWAY": 1, "RECOVERED VEHICLE": 1,
        "WARRANTS": 2, "OTHER OFFENSES": 2, "VANDALISM": 2, "TRESPASS": 2, "DISORDERLY CONDUCT": 2, "BAD CHECKS": 2,
        "LARCENY/THEFT": 3, "VEHICLE THEFT": 3, "FORGERY/COUNTERFEITING": 3, "DRUG/NARCOTIC": 3,
        "STOLEN PROPERTY": 3, "FRAUD": 3, "BRIBERY": 3, "EMBEZZLEMENT": 3,
        "ROBBERY": 4, "WEAPON LAWS": 4, "BURGLARY": 4, "EXTORTION": 4,
        "KIDNAPPING": 5, "ARSON": 5
    }
    return severity_mapping.get(category, -1)  # Assign -1 if unknown

# Load existing dataset of crime data with severity
df = pd.read_csv("crime_data_with_severity.csv")

# Check for missing values and drop them
df = df.dropna(subset=["Descript", "Category"])

# Extract descriptions and categories
descriptions = df["Descript"].astype(str).tolist()
categories = df["Category"].astype(str).tolist()

# Fit vectorizer on crime descriptions
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(descriptions)

# Save vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")
print("Vectorizer trained and saved successfully!")

# Load a pre-trained machine learning model for crime category prediction
model = joblib.load("crime_category_model.pkl")

# Example: Extracted description from PDF
description = "Petty theft from locked auto. Personal belongings were stolen from a parked vehicle."

# Transform input text
description_vectorized = vectorizer.transform([description])

# Predict crime category
predicted_category = model.predict([description])[0]

# Assign a severity level based on the predicted category
predicted_severity = assign_severity(predicted_category)

# New crime report entry
def update_csv_with_extracted_data(pdf_path, csv_file):
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # Extract relevant crime details
    extracted_data = extract_crime_details(text)

    # Read the existing CSV file
    existing_df = pd.read_csv(csv_file)

    # Ensure extracted values are properly formatted
    extracted_data["Police District"] = extracted_data["Police District"].split("\n")[0].strip()
    extracted_data["Resolution"] = extracted_data["Resolution"].strip() if extracted_data["Resolution"] else None


    # Create a new row with only matching columns
    new_row = {
    "Dates": extracted_data["Date & Time"],
    "Category": predicted_category,
    "Descript": description,
    "DayOfWeek": extracted_data["DayOfWeek"],
    "PdDistrict": extracted_data["Police District"],
    "Resolution": extracted_data["Resolution"],
    "Address": extracted_data["Incident Location"],
    "Latitude (Y)": extracted_data["Latitude"],
    "Longitude (X)": extracted_data["Longitude"],
    "Severity": predicted_severity
}
    # Append the new row to the dataframe
    updated_df = pd.concat([existing_df, pd.DataFrame([new_row])], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    updated_df.to_csv(csv_file, index=False)

    print("✅ Data successfully added to", csv_file)
# **Run the function with a new PDF Crime report and update the csv file


################################################ SELECT NEW PDF CRIME REPORT HERE ###################################################

pdf_file = "police_reports/police_crime_report_1.pdf"  # Update with the PDF File desired to extract
csv_file = "crime_data_with_severity.csv"  # Update with the existing CSV File

update_csv_with_extracted_data(pdf_file, csv_file)




Vectorizer trained and saved successfully!
Extracted PDF Text:
 City Police Department 
Official Police Crime Report 
 
Report Number: 
2024-001240 
Date & Time: 
2013-08-11 18:00:00 
Reporting Officer: 
Officer D. Morgan #6234 
Incident Location: 
200 Block of 5TH ST 
Coordinates: 
(37.78091651016261, -122.404100362918) 
Detailed Description: 
Petty theft from locked auto. Personal belongings were stolen from a 
parked vehicle. 
Police District: 
Southern 
Resolution: 
None 
Suspect Description: 
No suspect identified. 
Victim Information: 
Anonymous 
 



Extracted Data:
 {'Date & Time': '2013-08-11 18:00:00', 'Detailed Description': 'Petty theft from locked auto. Personal belongings were stolen from a \nparked vehicle.', 'Police District': 'Southern \nResolution', 'Incident Location': '200 Block of 5TH ST', 'Resolution': 'None', 'Latitude': '37.78091651016261', 'Longitude': '-122.404100362918', 'DayOfWeek': 'Sunday'}


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


✅ Data successfully added to crime_data_with_severity.csv


# Level 4 Part A & B: Done✅
## Next: Level 4 Part C 🖥️