In [19]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from descriptions import descriptions

3

In [6]:
import warnings

# Install a blanket "ignore" filter: from here on no warning of any category is shown.
warnings.simplefilter("ignore")

In [118]:
# Numeric label -> display name of the incident category.
category_names = {0: "Fire", 1: "Crime", 2: "Health"}

# Key used in `descriptions` ("fire", "crime", "health") -> numeric label.
label_by_key = {name.lower(): label for label, name in category_names.items()}

# Column-oriented accumulator: one text and one label per incident description.
data = {"Description": [], "IncidentCategory": []}

for key, item in descriptions.items():
    # The texts are appended first; the key is validated afterwards.
    data["Description"] += item

    if key not in label_by_key:
        raise Exception("Invalid key")
    category_key = label_by_key[key]

    # Every description under this key receives the same label.
    data["IncidentCategory"] += [category_key] * len(item)

In [119]:
import string

def remove_punctuation(input_string):
    """Return ``input_string`` with every ``string.punctuation`` character deleted.

    Characters are dropped outright rather than replaced by a space, so the
    text on either side of a removed character is joined together.
    """
    # A code point mapped to None in a translation table is deleted by str.translate.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return input_string.translate(deletions)

'Hello World'

In [123]:
df = pd.DataFrame(data)

# English stop-word set and Porter stemmer used by the clean-up step below.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def _normalise_description(text):
    """Strip punctuation, lowercase, tokenize, drop stop words, stem, then re-join."""
    tokens = nltk.word_tokenize(remove_punctuation(text).lower())
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # Join the stems back into a single space-separated string
    return " ".join(stems)


# One pass per description instead of one pass per pipeline stage.
df["Description"] = df["Description"].apply(_normalise_description)

In [124]:
# Display the preprocessed DataFrame as this cell's output.
df

Unnamed: 0,Description,IncidentCategory
0,bustl corridor spot someon collaps gasp breath...,2
1,heart pound wit seizur unfold librari urgent s...,2
2,chao erupt student fall unconsci cafeteria gra...,2
3,sudden commot person clutch chest lectur hall ...,2
4,stumbl upon distress scene student choke court...,2
...,...,...
237,languag lab student report receiv threaten mes...,1
238,psycholog class heat argument two student turn...,1
239,cafeteria student wallet stolen bag theyr dist...,1
240,physic lab student notic expens equip tamper a...,1


In [125]:
# TF-IDF vectorization
# The vectorizer is configured with scikit-learn's built-in English stop-word
# list and fitted on every row of df["Description"]; tfidf_matrix holds the
# resulting features, one row per description.
# NOTE(review): this fit sees all rows, including any that are later held out
# for testing, so the vocabulary and IDF weights computed here are not
# independent of the test set.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(df["Description"])

In [126]:
# Split data into training and testing sets.
# The cleaned text is split (rather than a TF-IDF matrix computed over the whole
# corpus) so that the vectorizer can be fitted on the training rows alone.
# The row partition depends only on the number of samples and random_state.
train_text, test_text, y_train, y_test = train_test_split(
    df["Description"],
    df["IncidentCategory"],
    test_size=0.4,
    random_state=42,
)

# Re-fit the vectorizer on the training rows only, replacing its earlier
# full-corpus fit, so vocabulary and IDF weights never see held-out text.
X_train = tfidf_vectorizer.fit_transform(train_text)
# Held-out rows are only transformed with the training-derived vocabulary.
X_test = tfidf_vectorizer.transform(test_text)

In [127]:
# Train the SVM Model
# Hyper-parameters are collected in one place: linear kernel, C = 1.0, fixed seed.
svm_params = {"kernel": "linear", "C": 1.0, "random_state": 42}
svm_classifier = SVC(**svm_params)
svm_classifier.fit(X_train, y_train)

In [128]:
# Predict a label for every held-out row.
predictions = svm_classifier.predict(X_test)

# Summarise per-class precision / recall / F1 against the true labels and print it.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        28
           1       1.00      1.00      1.00        27
           2       0.98      1.00      0.99        42

    accuracy                           0.99        97
   macro avg       0.99      0.99      0.99        97
weighted avg       0.99      0.99      0.99        97



In [129]:
# Test user provided response.
raw_user_description = "There is fire happening in my hostel."

# Preprocess the user's description with the same steps applied to the training
# text: strip punctuation, lowercase, tokenize, drop stop words, stem, re-join.
# (str.replace() treats its first argument as a literal substring, not a regex,
# so it cannot be used to strip punctuation by pattern.)
user_tokens = nltk.word_tokenize(remove_punctuation(raw_user_description).lower())
user_description = " ".join(
    stemmer.stem(token) for token in user_tokens if token not in stop_words
)

# Vectorize the user's description using the same TF-IDF vectorizer used during training
user_description_vector = tfidf_vectorizer.transform([user_description])

# Predict the incident category using the trained model
predicted_category = svm_classifier.predict(user_description_vector)

# Map the category label to the actual category name.
# predict() returns an array with one label per input row; only one row was passed.
predicted_label = int(predicted_category[0])
predicted_category_name = category_names[predicted_label]

# Display the prediction
print(
    "Predicted Incident Category:",
    f"{predicted_category_name} ({predicted_label})",
)

Predicted Incident Category: Health (2)
