In [24]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [25]:
# Load the CSV data
csv_path = Path("chicago_crime.csv")
df = pd.read_csv(csv_path)

In [26]:
print(df.isnull().sum())

CASE#                        0
DATE  OF OCCURRENCE          0
BLOCK                        0
 IUCR                        0
 PRIMARY DESCRIPTION         0
 SECONDARY DESCRIPTION       0
 LOCATION DESCRIPTION     1053
ARREST                       0
DOMESTIC                     0
BEAT                         0
WARD                         6
FBI CD                       0
X COORDINATE              4073
Y COORDINATE              4073
LATITUDE                  4073
LONGITUDE                 4073
LOCATION                  4073
dtype: int64


In [27]:
df.dropna(subset=['WARD'], inplace=True)

In [28]:
# Preprocessing steps
df[' PRIMARY DESCRIPTION'] = df[' PRIMARY DESCRIPTION'].str.lower()
df[' PRIMARY DESCRIPTION'] = df[' PRIMARY DESCRIPTION'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
df[' PRIMARY DESCRIPTION'] = df[' PRIMARY DESCRIPTION'].apply(lambda x: word_tokenize(x))
df[' PRIMARY DESCRIPTION'] = df[' PRIMARY DESCRIPTION'].apply(lambda x: ' '.join(x))
df[' PRIMARY DESCRIPTION'] = df[' PRIMARY DESCRIPTION'].str.lower()

In [29]:
# Create TF-IDF vectors
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df[' PRIMARY DESCRIPTION'])

In [30]:
# Define features and target
X = X_tfidf  # Use TF-IDF vectors as features
y = df['WARD']

In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
# Initialize and train the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=42)

In [33]:
# Make predictions using the trained model
y_pred = random_forest.predict(X_test)

In [34]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)

In [35]:
# Generate a classification report for more detailed metrics
report = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.05938316372285257
Classification Report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      1428
         2.0       0.00      0.00      0.00      1160
         3.0       0.00      0.00      0.00      2194
         4.0       0.00      0.00      0.00      2551
         5.0       0.00      0.00      0.00      1998
         6.0       0.05      0.04      0.04      2841
         7.0       0.00      0.00      0.00      2518
         8.0       0.00      0.00      0.00      2405
         9.0       0.00      0.00      0.00      2280
        10.0       0.00      0.00      0.00      1530
        11.0       0.00      0.00      0.00       995
        12.0       0.00      0.00      0.00       867
        13.0       0.00      0.00      0.00       806
        14.0       0.00      0.00      0.00      1013
        15.0       0.00      0.00      0.00      1133
        16.0       0.00      0.00      0.00      1977
        17.0       0.00     

In [37]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [44]:
num_rows = X_tfidf.getnnz(axis=0)
xgb_model = XGBClassifier(n_estimators=100, max_depth=3, random_state=42)
xgb_model.fit(X_tfidf, y_encoded)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


In [45]:
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.028606334724009305
Classification Report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      1428
         2.0       0.00      0.00      0.00      1160
         3.0       0.00      0.00      0.00      2194
         4.0       0.00      0.00      0.00      2551
         5.0       0.00      0.00      0.00      1998
         6.0       0.05      0.04      0.04      2841
         7.0       0.00      0.00      0.00      2518
         8.0       0.00      0.00      0.00      2405
         9.0       0.00      0.00      0.00      2280
        10.0       0.00      0.00      0.00      1530
        11.0       0.00      0.00      0.00       995
        12.0       0.00      0.00      0.00       867
        13.0       0.00      0.00      0.00       806
        14.0       0.00      0.00      0.00      1013
        15.0       0.00      0.00      0.00      1133
        16.0       0.00      0.00      0.00      1977
        17.0       0.00    