In [19]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from descriptions import descriptions

3

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
category_names = {0: "Fire", 1: "Crime", 2: "Health"}

data = {
    "Description": [],
    "IncidentCategory": []
}

for key, item in descriptions.items():
    data["Description"] += item

    if key == "fire":
        category_key = 0
    elif key == "crime":
        category_key = 1
    elif key == "health":
        category_key = 2
    else:
        raise Exception("Invalid key")

    data["IncidentCategory"] += [category_key for _ in range(len(item))]

dict_items([('health', ['In a bustling corridor, I spot someone collapsing, gasping for breath. Swiftly dialing emergency services; panic sets in.', 'Heart pounding, I witness a seizure unfold in the library. Urgently seek help, feeling the seconds slip away.', 'Chaos erupts as a student falls unconscious in the cafeteria. Grabbing attention, urgently pleading for assistance.', 'A sudden commotion - a person clutching their chest in the lecture hall. Dialing 999, urgency escalating.', 'Stumbling upon a distressing scene - a student choking in the courtyard. Swiftly call for aid, fear gripping.', 'Alarming sight: a fellow student collapses, unresponsive. Dialing for help, urgency heightening.', 'Panic ensues as a health crisis unfolds in the gym. Rushing to alert authorities, time slipping away.', 'Frantically reacting to a fainting episode during class. Dialing for medical help urgently.', 'In the common area, a student clutches their abdomen, in evident pain. Urgently seeking assistan

In [105]:
df = pd.DataFrame(data)


df["Description"] = (
    df["Description"].str.replace(",", "").str.lower()
)

# Tokenization
df["Description"] = df["Description"].apply(nltk.word_tokenize)

# Stop Word Removal
stop_words = set(stopwords.words("english"))
df["Description"] = df["Description"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# # Stemming
# stemmer = PorterStemmer()
# df["Description"] = df["Description"].apply(
#     lambda tokens: [stemmer.stem(word) for word in tokens]
# )

# Join the tokens back into a single string
df["Description"] = df["Description"].apply(' '.join)

In [106]:
df

Unnamed: 0,Description,IncidentCategory
0,bustling corridor spot someone collapsing gasp...,2
1,heart pounding witness seizure unfold library ...,2
2,chaos erupts student falls unconscious cafeter...,2
3,sudden commotion - person clutching chest lect...,2
4,stumbling upon distressing scene - student cho...,2
...,...,...
237,language lab student reports receiving threate...,1
238,psychology class heated argument two students ...,1
239,cafeteria student 's wallet stolen bag 're dis...,1
240,physics lab student notices expensive equipmen...,1


In [107]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(df["Description"])

In [108]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    df["IncidentCategory"],
    test_size=0.2,
    random_state=42,
)

In [115]:
# Train the SVM Model
svm_classifier = SVC(kernel="linear", C=1.0, random_state=42)
svm_classifier.fit(X_train, y_train)

In [116]:
# Make predictions on the test set
predictions = svm_classifier.predict(X_test)

# Evaluate the model
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       1.00      1.00      1.00        15
           2       0.96      1.00      0.98        22

    accuracy                           0.98        49
   macro avg       0.99      0.97      0.98        49
weighted avg       0.98      0.98      0.98        49



In [117]:
# Test user provided response.
user_description = "There is fire happening in my hostel."

# Preprocess the user's description to match the format used during training
user_description = user_description.lower()  # Convert to lowercase
user_description = user_description.replace(r"[^\w\s]", "")  # Remove punctuation

# Vectorize the user's description using the same TF-IDF vectorizer used during training
user_description_vector = tfidf_vectorizer.transform([user_description])

# Predict the incident category using the trained model
predicted_category = svm_classifier.predict(user_description_vector)

# Map the category label to the actual category name
predicted_category_name = category_names[predicted_category[0]]

# Display the prediction
print(
    "Predicted Incident Category:",
    f"{predicted_category_name} ({predicted_category[0]})",
)

Predicted Incident Category: Health (2)
