In [34]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danielcoker/nltk_data...


True

In [49]:
def synonym_antonym_extractor(phrase):
     from nltk.corpus import wordnet
     synonyms = []
     antonyms = []

     for syn in wordnet.synsets(phrase):
          for l in syn.lemmas():
               synonyms.append(l.name())

     return set(synonyms)



In [64]:
words_dicts = {}

words_dicts["crime"] = [
    "felony",
    "misdemeanor",
    "offense",
    "violation",
    "wrongdoing",
    "transgression",
    "unlawful",
    "illegal",
    "lawlessness",
    "delinquency",
    "misconduct",
    "fraud",
    "corruption",
    "robbery",
    "burglary",
    "theft",
    "assault",
    "homicide",
    "justice",
    "punishment"
]

words_dicts["health"] = [    "Wellness",
    "Fitness",
    "Nutrition",
    "Exercise",
    "Hygiene",
    "Vitality",
    "Well-being",
    "Stamina",
    "Immunity",
    "Balance",
    "Restoration",
    "Holistic",
    "Medicine",
    "Mental health",
    "Physicality",
    "Nutrient",
    "Therapeutic",
    "Recovery",
    "Vigor",
    "Organic"]

words_dicts["fire"] = ["Blaze", "Flame", "Inferno", "Combustion", "Ember",
    "Heat", "Incinerate", "Ignite", "Arson", "Conflagration",
    "Pyre", "Scorch", "Kindle", "Spark", "Furnace",
    "Bonfire", "Char", "Ash", "Wildfire", "Hearth"]

def merge_sets(list_of_sets):
    merged_set = set()
    for s in list_of_sets:
        merged_set = merged_set.union(s)
    return merged_set


def get_related_words(word):
    related_words = []

    for word in words_dicts[word]:
        res = synonym_antonym_extractor(word)
        
        related_words.append(res)

    return list(set(merge_sets(related_words)))
    # return related_words



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Category Names
category_names = {0: "Fire", 1: "Crime", 2: "Health"}

# Sample Data
data = {
    "Description": [
        "There was a fire in the chemistry lab at the university.",
        "A theft occurred in the electronics store last night.",
        "A student had a medical emergency during a class.",
        "I witnessed a hit-and-run accident on Main Street.",
        "There was a large fight at the local bar involving multiple people.",
        "A car crashed into a tree in the park.",
        "Someone reported a suspicious package at the train station.",
        "A person was assaulted in the park in the evening.",
        "I found an injured bird in my backyard.",
        "There's a gas leak in the apartment building.",
        "A burglary took place at my neighbor's house.",
        "I saw a person who fainted on the subway platform.",
        "A dog is stuck in a tree in the park.",
        "A drunk driver was seen swerving on the highway.",
        "A building is on fire in the industrial area.",
        "I spotted a missing child at the shopping mall.",
        "A car was stolen from the parking lot of the grocery store.",
        "There's a fire outbreak in the neighborhood.",
        "A fight broke out at a soccer game.",
        "I heard gunshots in the neighborhood last night.",
        "I was heading home near the Appatapiti area by Rheoboth Lodge. I was on my phone when suddenly, a motorcyclist approached from behind, snatched my phone, and sped away.",
        
    ],
    "IncidentCategory": [0, 1, 2, 1, 1, 2, 1, 1, 2, 0, 1, 2, 2, 1, 0, 1, 1, 0, 0, 2, 1],
}

In [89]:
df = pd.DataFrame(data)

# Text preprocessing and Feature Extraction
df["Description"] = (
    df["Description"].str.replace(r"[^\w\s]", "").str.lower()
)  # Convert to lowercase
df["Description"] = df["Description"].str.replace(r"[^\w\s]", "")  # Remove punctuation

# Tokenization
df["Description"] = df["Description"].apply(nltk.word_tokenize)



# # Stop Word Removal
stop_words = set(stopwords.words("english"))
df["Description"] = df["Description"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# # Stemming
stemmer = PorterStemmer()
df["Description"] = df["Description"].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# # Join the tokens back into a single string
df["Description"] = df["Description"].apply(' '.join)

df

Unnamed: 0,Description,IncidentCategory
0,fire chemistri lab univers .,0
1,theft occur electron store last night .,1
2,student medic emerg class .,2
3,wit hit-and-run accid main street .,1
4,larg fight local bar involv multipl peopl .,1
5,car crash tree park .,2
6,someon report suspici packag train station .,1
7,person assault park even .,1
8,found injur bird backyard .,2
9,'s ga leak apart build .,0


In [96]:
# TF-IDF vectorization
print("Vectorising the text...")
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(df["Description"])

# Split data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    df["IncidentCategory"],
    test_size=0.4,
    random_state=42,
)

Vectorising the text...
Splitting data into training and testing sets...


In [97]:
# Train the SVM Model
print("Training the model...")
svm_classifier = SVC(kernel="linear", C=1.0, random_state=42)
svm_classifier.fit(X_train, y_train)
print("Training complete!")

# Make predictions on the test set
predictions = svm_classifier.predict(X_test)

# Evaluate the model
report = classification_report(y_test, predictions)
print(report)

Training the model...
Training complete!
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.33      1.00      0.50         3
           2       0.00      0.00      0.00         3

    accuracy                           0.33         9
   macro avg       0.11      0.33      0.17         9
weighted avg       0.11      0.33      0.17         9



In [99]:
# Test user provided response.
user_description = "I saw a person who fainted on the subway platform."

# Preprocess the user's description to match the format used during training
user_description = user_description.lower()  # Convert to lowercase
user_description = user_description.replace(r"[^\w\s]", "")  # Remove punctuation

# Vectorize the user's description using the same TF-IDF vectorizer used during training
print("Vectorising the user's description...")
user_description_vector = tfidf_vectorizer.transform([user_description])

# Predict the incident category using the trained model
print("Predicting the incident category...")
predicted_category = svm_classifier.predict(user_description_vector)

# Map the category label to the actual category name
predicted_category_name = category_names[predicted_category[0]]

# Display the prediction
print(
    "Predicted Incident Category:",
    f"{predicted_category_name} ({predicted_category[0]})",
)

Vectorising the user's description...
Predicting the incident category...
Predicted Incident Category: Crime (1)


In [85]:
dics = []

def is_word_in_text(word_list, text):
    for word in word_list:
        if word.lower() in text.lower():
            return True
    return False

word_sample = "There was a fire in the chemistry lab at the university."
def search_words(text):
    result = {
        "fire": 1 if is_word_in_text(fire_related_words, text) else 0,
        "health": 1 if is_word_in_text(health_related_words, text) else 0,
        "crime": 1 if is_word_in_text(crime_related_words, text) else 0
    }

    dics.append(result)
    return result, text


In [86]:
search_words(word_sample)

({'fire': 1, 'health': 0, 'crime': 0},
 'There was a fire in the chemistry lab at the university.')

In [87]:
dics

[{'fire': 1, 'health': 0, 'crime': 0}]

In [88]:
df["Description"].apply(search_words)

0     ({'fire': 1, 'health': 0, 'crime': 0}, there w...
1     ({'fire': 0, 'health': 0, 'crime': 1}, a theft...
2     ({'fire': 0, 'health': 0, 'crime': 0}, a stude...
3     ({'fire': 0, 'health': 0, 'crime': 0}, i witne...
4     ({'fire': 0, 'health': 0, 'crime': 0}, there w...
5     ({'fire': 1, 'health': 0, 'crime': 0}, a car c...
6     ({'fire': 0, 'health': 0, 'crime': 0}, someone...
7     ({'fire': 0, 'health': 0, 'crime': 1}, a perso...
8     ({'fire': 0, 'health': 0, 'crime': 0}, i found...
9     ({'fire': 0, 'health': 0, 'crime': 0}, there '...
10    ({'fire': 0, 'health': 1, 'crime': 1}, a burgl...
11    ({'fire': 0, 'health': 0, 'crime': 0}, i saw a...
12    ({'fire': 0, 'health': 1, 'crime': 0}, a dog i...
13    ({'fire': 0, 'health': 0, 'crime': 0}, a drunk...
14    ({'fire': 1, 'health': 0, 'crime': 0}, a build...
15    ({'fire': 0, 'health': 0, 'crime': 0}, i spott...
16    ({'fire': 0, 'health': 0, 'crime': 0}, a car w...
17    ({'fire': 1, 'health': 0, 'crime': 0}, the

In [70]:
dics

[{'fire': 1, 'health': 0, 'crime': 0},
 {'fire': 1, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 1},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 1, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 1},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 1, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 1, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 1, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0},
 {'fire': 0, 'health': 0, 'crime': 0}]