In [14]:
import pandas as pd

# Hand-written incident narratives, each tagged with an id and an escalation level
# ("low", "medium" or "high"). Ids are not in ascending order across the list.
data = [
    # LOW RISK – original + new (10 total)
    {"id": 1, "text": "Bicycle stolen from locked shed overnight. No previous incidents reported.", "escalation_level": "low"},
    {"id": 4, "text": "Group of teenagers seen drinking and shouting in the park. Dispersed on arrival of officers.", "escalation_level": "low"},
    {"id": 7, "text": "Single report of graffiti on garage door. No suspects identified.", "escalation_level": "low"},
    {"id": 11, "text": "Resident reports a one-off noise disturbance from nearby construction work. No further issues.", "escalation_level": "low"},
    {"id": 12, "text": "Caller reports a broken window in a communal hallway. No witnesses or previous complaints.", "escalation_level": "low"},
    {"id": 13, "text": "Parking dispute between two drivers, both left before police arrival.", "escalation_level": "low"},
    {"id": 14, "text": "Report of fireworks being set off in a local park by teenagers. Area clear on arrival.", "escalation_level": "low"},
    {"id": 15, "text": "Lost mobile phone reported at shopping centre. No suspicious activity.", "escalation_level": "low"},
    {"id": 16, "text": "Single complaint about loud television from next-door neighbour. No previous calls.", "escalation_level": "low"},
    {"id": 17, "text": "Dog found running loose in street. Owner located shortly after.", "escalation_level": "low"},

    # MEDIUM RISK – original + new (9 total)
    {"id": 2, "text": "Neighbours report loud arguing between a couple. Police attended, no injuries, first reported incident.", "escalation_level": "medium"},
    {"id": 5, "text": "Caller reports ongoing harassment from neighbour, including repeated banging on walls and shouting late at night.", "escalation_level": "medium"},
    {"id": 8, "text": "Shop employee reports a customer returning several times to argue about a refund.", "escalation_level": "medium"},
    {"id": 9, "text": "Caller reports ongoing conflict with neighbour over shared driveway access.", "escalation_level": "medium"},
    {"id": 10, "text": "Woman reports ex-friend sending passive-aggressive messages and showing up at social events uninvited.", "escalation_level": "medium"},
    {"id": 18, "text": "Neighbours report repeated shouting late at night from the same apartment. No injuries, but ongoing disturbance.", "escalation_level": "medium"},
    {"id": 19, "text": "Resident reports strong smell of cannabis from neighbour’s flat and regular visitors late at night.", "escalation_level": "medium"},
    {"id": 20, "text": "Father reports escalating arguments with teenage son, no violence recorded.", "escalation_level": "medium"},
    {"id": 21, "text": "Staff member reports a colleague raising their voice aggressively during a disagreement.", "escalation_level": "medium"},

    # HIGH RISK – original + new (8 total)
    {"id": 3, "text": "Victim states her ex-partner has been waiting outside her workplace for several days and sending threatening messages.", "escalation_level": "high"},
    {"id": 6, "text": "Victim reports ex-partner breached restraining order, entered property and threatened to kill her in front of the children.", "escalation_level": "high"},
    {"id": 22, "text": "Victim reports ex-partner has tried to contact her over 50 times this week despite block and previous warnings.", "escalation_level": "high"},
    {"id": 23, "text": "Caller states neighbour threatened to 'come back with a knife' after an argument over parking.", "escalation_level": "high"},
    {"id": 24, "text": "Woman reports partner smashed her phone during argument, previous police attendance recorded.", "escalation_level": "high"},
    {"id": 25, "text": "Victim reports being followed from work by unknown male for three consecutive days.", "escalation_level": "high"},
    {"id": 26, "text": "Caller states ex-partner breached restraining order again and attempted to enter property.", "escalation_level": "high"},
    {"id": 27, "text": "Man reports receiving explicit death threats online related to ongoing harassment case.", "escalation_level": "high"}
]

# One row per narrative; columns come from the dict keys: id, text, escalation_level.
df = pd.DataFrame(data)
df


Unnamed: 0,id,text,escalation_level
0,1,Bicycle stolen from locked shed overnight. No ...,low
1,4,Group of teenagers seen drinking and shouting ...,low
2,7,Single report of graffiti on garage door. No s...,low
3,11,Resident reports a one-off noise disturbance f...,low
4,12,Caller reports a broken window in a communal h...,low
5,13,"Parking dispute between two drivers, both left...",low
6,14,Report of fireworks being set off in a local p...,low
7,15,Lost mobile phone reported at shopping centre....,low
8,16,Single complaint about loud television from ne...,low
9,17,Dog found running loose in street. Owner locat...,low


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# End-to-end baseline: TF-IDF features + logistic regression on the incident
# narratives in `df`, followed by evaluation, sample predictions and the
# highest-weighted vocabulary terms per class.

# 1. Text and labels
X = df['text']
y = df['escalation_level']

# 2. Train / test split (stratified on the label, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 3. TF-IDF - turn text into numeric features
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english'
)

# The vocabulary is fitted on the training split only; the test split is
# transformed with that same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4. Model - Logistic Regression
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5 (FutureWarning), and with the default solver a problem with
# three classes is already fitted as a multinomial model.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

# 5. Evaluation on the held-out split
y_pred = clf.predict(X_test_vec)

print("=== MODEL PERFORMANCE ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 6. Example predictions (at most 5, never more than the test split holds)
print("\n=== EXAMPLE PREDICTIONS ===")
n_examples = min(5, len(X_test))
examples = X_test.sample(n_examples, random_state=1)
examples_vec = vectorizer.transform(examples)
preds = clf.predict(examples_vec)

for text, label in zip(examples, preds):
    print("\nTEXT:", text)
    print("Predicted risk:", label)

# 7. Most important words for each class
print("\n=== TOP WORDS PER CLASS ===")
feature_names = vectorizer.get_feature_names_out()
classes = clf.classes_

for i, cls in enumerate(classes):
    coefs = clf.coef_[i]
    # argsort is ascending, so the slice holds the 10 largest coefficients with
    # the strongest word printed last.
    top_indices = np.argsort(coefs)[-10:]
    print(f"\nTop words for class '{cls}':")
    for idx in top_indices:
        print(feature_names[idx])


=== MODEL PERFORMANCE ===
Accuracy: 0.5555555555555556

Classification report:
               precision    recall  f1-score   support

        high       0.50      0.33      0.40         3
         low       0.50      1.00      0.67         3
      medium       1.00      0.33      0.50         3

    accuracy                           0.56         9
   macro avg       0.67      0.56      0.52         9
weighted avg       0.67      0.56      0.52         9

Confusion matrix:
 [[1 2 0]
 [0 3 0]
 [1 1 1]]

=== EXAMPLE PREDICTIONS ===

TEXT: Woman reports ex-friend sending passive-aggressive messages and showing up at social events uninvited.
Predicted risk: high

TEXT: Caller reports ongoing harassment from neighbour, including repeated banging on walls and shouting late at night.
Predicted risk: medium

TEXT: Caller states neighbour threatened to 'come back with a knife' after an argument over parking.
Predicted risk: low

TEXT: Caller reports ongoing conflict with neighbour over shared 



In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Separate the narrative text (features) from the escalation labels (target).
X, y = df["text"], df["escalation_level"]

# 2. Split into train and test parts: 40% held out, stratified on the label,
#    fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# 3. Turn text into numbers (TF-IDF). The vocabulary is learned from the
#    training part only and then reused to transform the test part.
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Show the training matrix as the cell output.
X_train_vec


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 64 stored elements and shape (6, 59)>

In [6]:
from sklearn.linear_model import LogisticRegression

# Create the classifier.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5 (FutureWarning), and with the default solver a problem with
# three classes is already fitted as a multinomial model.
clf = LogisticRegression(max_iter=1000)

# Train the model on the TF-IDF features of the training split.
clf.fit(X_train_vec, y_train)

print("Model trained successfully!")


Model trained successfully!




In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test_vec)

# Each metric takes (y_true, y_pred); print them in a fixed order under their headings.
metric_sections = (
    ("Accuracy:", accuracy_score),
    ("\nClassification report:\n", classification_report),
    ("\nConfusion matrix:\n", confusion_matrix),
)
for heading, metric in metric_sections:
    print(heading, metric(y_test, y_pred))


Accuracy: 0.5

Classification report:
               precision    recall  f1-score   support

        high       0.50      1.00      0.67         1
         low       1.00      0.50      0.67         2
      medium       0.00      0.00      0.00         1

    accuracy                           0.50         4
   macro avg       0.50      0.50      0.44         4
weighted avg       0.62      0.50      0.50         4


Confusion matrix:
 [[1 0 0]
 [0 1 1]
 [1 0 0]]


In [9]:
# Randomly pick up to 3 narratives from the test split and predict their risk level.
# The sample size is capped at len(X_test) because Series.sample raises when asked
# for more rows than exist (without replacement).
examples = X_test.sample(min(3, len(X_test)), random_state=1)
examples_vec = vectorizer.transform(examples)
preds = clf.predict(examples_vec)

for text, label in zip(examples, preds):
    print("\nTEXT:", text)
    print("Predicted risk:", label)



TEXT: Shop owner reports same individual returning repeatedly to the store, making aggressive comments and refusing to leave.
Predicted risk: high

TEXT: Single report of graffiti on garage door. No suspects identified.
Predicted risk: medium

TEXT: Victim states her ex-partner has been waiting outside her workplace for several days and sending threatening messages.
Predicted risk: high


In [10]:
import numpy as np

# For every class, list the 10 vocabulary terms with the largest coefficients.
feature_names = vectorizer.get_feature_names_out()
classes = clf.classes_

for i, cls in enumerate(classes):
    coefs = clf.coef_[i]
    # argsort is ascending, so the slice holds the 10 largest coefficients with
    # the strongest word printed last.
    top_indices = np.argsort(coefs)[-10:]
    print(f"\nTop words for class '{cls}':")
    # Iterate over the selected indices directly instead of indexing by position.
    for idx in top_indices:
        print(feature_names[idx])



Top words for class 'high':
partner
physical
order
property
threatened
threats
restraining
weapons
reports
victim

Top words for class 'low':
teenagers
officers
music
address
flat
complaint
calls
previous
noise
student

Top words for class 'medium':
night
incident
couple
arguing
injuries
attended
neighbours
reported
report
police


In [16]:
!pip install transformers datasets -q


In [17]:
from transformers import pipeline

# Labels the zero-shot model chooses between for each narrative.
candidate_labels = [
    "low escalation risk",
    "medium escalation risk",
    "high escalation risk",
]

# Build the zero-shot classifier (a model that can assign text to arbitrary labels).
zero_shot_clf = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

print("Model loaded and ready!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Model loaded and ready!


In [18]:
# Spot-check the zero-shot model on a few narratives we already know from df.
test_examples = [
    "Woman reports ex-friend sending passive-aggressive messages and showing up at social events uninvited.",
    "Caller reports ongoing harassment from neighbour, including repeated banging on walls and shouting late at night.",
    "Caller states neighbour threatened to 'come back with a knife' after an argument over parking.",
    "Caller states ex-partner breached restraining order again and attempted to enter property."
]

for txt in test_examples:
    result = zero_shot_clf(txt, candidate_labels)
    ranked_labels, ranked_scores = result['labels'], result['scores']
    # The first returned label is reported as the prediction.
    print("\nTEXT:", txt)
    print("Predicted label:", ranked_labels[0])
    print("Scores:", {name: round(score, 3) for name, score in zip(ranked_labels, ranked_scores)})



TEXT: Woman reports ex-friend sending passive-aggressive messages and showing up at social events uninvited.
Predicted label: medium escalation risk
Scores: {'medium escalation risk': 0.49, 'high escalation risk': 0.466, 'low escalation risk': 0.044}

TEXT: Caller reports ongoing harassment from neighbour, including repeated banging on walls and shouting late at night.
Predicted label: medium escalation risk
Scores: {'medium escalation risk': 0.567, 'high escalation risk': 0.403, 'low escalation risk': 0.03}

TEXT: Caller states neighbour threatened to 'come back with a knife' after an argument over parking.
Predicted label: high escalation risk
Scores: {'high escalation risk': 0.557, 'medium escalation risk': 0.416, 'low escalation risk': 0.027}

TEXT: Caller states ex-partner breached restraining order again and attempted to enter property.
Predicted label: medium escalation risk
Scores: {'medium escalation risk': 0.534, 'high escalation risk': 0.42, 'low escalation risk': 0.046}


In [19]:
# Score every narrative in df with the zero-shot model.
# NOTE: the "bert_" names are kept for compatibility with the rest of the notebook;
# the pipeline in this notebook is created with model="facebook/bart-large-mnli".
bert_predictions = []

# Iterate over the text column directly: iterrows() would build a full Series
# for every row only to read this one field.
for txt in df['text']:
    res = zero_shot_clf(txt, candidate_labels)
    predicted = res['labels'][0]  # first returned label is taken as the prediction
    bert_predictions.append(predicted)

df['bert_pred'] = bert_predictions
df[['text', 'escalation_level', 'bert_pred']].head(10)


Unnamed: 0,text,escalation_level,bert_pred
0,Bicycle stolen from locked shed overnight. No ...,low,medium escalation risk
1,Group of teenagers seen drinking and shouting ...,low,medium escalation risk
2,Single report of graffiti on garage door. No s...,low,low escalation risk
3,Resident reports a one-off noise disturbance f...,low,low escalation risk
4,Caller reports a broken window in a communal h...,low,medium escalation risk
5,"Parking dispute between two drivers, both left...",low,medium escalation risk
6,Report of fireworks being set off in a local p...,low,medium escalation risk
7,Lost mobile phone reported at shopping centre....,low,medium escalation risk
8,Single complaint about loud television from ne...,low,low escalation risk
9,Dog found running loose in street. Owner locat...,low,medium escalation risk


In [20]:
# Reduce bert_pred to the plain classes: low / medium / high
def normalize_label(label):
    """Collapse a verbose label such as "low escalation risk" to "low", "medium" or "high".

    The label is searched for the level names in the order low, medium, high and
    the first one found is returned. Matching is case-insensitive, so a label
    like "High escalation risk" maps to "high" instead of silently falling
    through to the default.

    Args:
        label: Label text to normalise (a string).

    Returns:
        "low", "medium" or "high"; "medium" when no level name occurs in the label.
    """
    lowered = label.lower()
    for level in ("low", "medium", "high"):
        if level in lowered:
            return level
    # Unrecognised labels fall back to the middle class.
    return "medium"

from sklearn.metrics import accuracy_score, classification_report

# Map every raw zero-shot label onto the low / medium / high scale used by the true labels.
df['bert_class'] = df['bert_pred'].apply(normalize_label)

# Both metrics take (y_true, y_pred); print each under its heading.
for heading, metric in (
    ("Accuracy (BERT zero-shot vs true labels):", accuracy_score),
    ("\nClassification report:\n", classification_report),
):
    print(heading, metric(df['escalation_level'], df['bert_class']))


Accuracy (BERT zero-shot vs true labels): 0.4444444444444444

Classification report:
               precision    recall  f1-score   support

        high       0.60      0.38      0.46         8
         low       0.75      0.30      0.43        10
      medium       0.33      0.67      0.44         9

    accuracy                           0.44        27
   macro avg       0.56      0.45      0.44        27
weighted avg       0.57      0.44      0.44        27

