# Libraries and Data

In [10]:
import pandas as pd
import os

# Data cleaning
from library import la_functions as la

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

# Export model
import pickle

ModuleNotFoundError: No module named 'library'

In [4]:
# fetch all the data from the raw_data folder
df = pd.read_csv('../raw_data/data.csv')

In [3]:
columns_keep = [
#'division_number',
#'date_reported',
'date_occurred',
#'area',
'area_name',
#'reporting_district',
#'part',
#'crime_code',
'crime_description',
#'modus_operandi',
'victim_age',
'victim_sex',
'victim_descent',
#'premise_code',
'premise_description',
#'weapon_code',
'weapon_description',
#'status',
#'status_description',
#'crime_code_1',
#'crime_code_2',
#'crime_code_3',
#'crime_code_4',
'location',
#'cross_street',
'latitude',
'longitude',
]

In [4]:
df = df[columns_keep]
df.head(3)

Unnamed: 0,date_occurred,area_name,crime_description,victim_age,victim_sex,victim_descent,premise_description,weapon_description,location,latitude,longitude
0,2020-01-08 22:30:00,Southwest,BATTERY - SIMPLE ASSAULT,36,F,B,SINGLE FAMILY DWELLING,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",1100 W 39TH PL,34.0141,-118.2978
1,2020-01-01 03:30:00,Central,BATTERY - SIMPLE ASSAULT,25,M,H,SIDEWALK,UNKNOWN WEAPON/OTHER WEAPON,700 S HILL ST,34.0459,-118.2545
2,2020-02-13 12:00:00,Central,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,0,X,X,POLICE FACILITY,,200 E 6TH ST,34.0448,-118.2474


# Exploratory Data Analysis

## Basics

In [26]:
len(df)

852950

In [27]:
# Number of duplicated values
df.duplicated().sum()

3963

In [49]:
df.describe()

Unnamed: 0,victim_age,latitude,longitude,YEAR OCC,MONTH OCC,DAY OCC
count,852950.0,852950.0,852950.0,852950.0,852950.0,852950.0
mean,29.742191,33.983232,-118.040106,2021.532406,6.419488,15.334084
std,21.79947,1.756263,6.089068,1.099147,3.380271,8.96467
min,-3.0,0.0,-118.6676,2020.0,1.0,1.0
25%,5.0,34.0141,-118.4297,2021.0,4.0,7.0
50%,31.0,34.0585,-118.3215,2022.0,6.0,15.0
75%,45.0,34.1632,-118.2739,2022.0,9.0,23.0
max,120.0,34.3343,0.0,2023.0,12.0,31.0


- Note: All crimes are unique, however when you remove a number of columns it appears that they are duplicates

In [28]:
# Missing values
df.isnull().sum().sort_values(ascending=False)

weapon_description     556202
victim_descent         112614
victim_sex             112606
premise_description       518
date_occurred               0
area_name                   0
crime_description           0
victim_age                  0
location                    0
latitude                    0
longitude                   0
dtype: int64

## Features

In [32]:
print(df.victim_descent.value_counts())
print(df.victim_sex.value_counts())

victim_descent
H    261145
W    173440
B    120896
X     83214
O     67532
A     18700
K      4579
F      3580
C      3313
J      1181
V       893
I       805
Z       426
P       229
U       170
D        66
G        63
L        56
S        46
-         2
Name: count, dtype: int64
victim_sex
M    351362
F    313468
X     75420
H        93
-         1
Name: count, dtype: int64


In [39]:
pd.set_option('display.max_rows', 10)
pd.DataFrame(df.premise_description.value_counts())

Unnamed: 0_level_0,count
premise_description,Unnamed: 1_level_1
STREET,216018
SINGLE FAMILY DWELLING,144367
"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",104461
PARKING LOT,59551
OTHER BUSINESS,40412
...,...
MTA - SILVER LINE - LAC/USC MEDICAL CENTER,2
DEPT OF DEFENSE FACILITY,2
MTA - SILVER LINE - DOWNTOWN STREET STOPS,2
HORSE RACING/SANTA ANITA PARK*,2


- What do '-' mean?
- Code drop down using dictionary mapping
- What does H mean?

### Geolocation

# Data Cleaning

In [None]:
# Remove victim_sex rows with missing data

In [None]:
# Remove victim_race rows with missing data

In [None]:
# Remove victim age = 0

In [5]:
# Split 'date_occurred' into year, month and day.
df['date_occurred'] = pd.to_datetime(df['date_occurred'])
df['year_occurred'] = df['date_occurred'].dt.year
df['month_occurred'] = df['date_occurred'].dt.month
df['day_occurred'] = df['date_occurred'].dt.day
df.head(3)

Unnamed: 0,date_occurred,area_name,crime_description,victim_age,victim_sex,victim_descent,premise_description,weapon_description,location,latitude,longitude,year_occurred,month_occurred,day_occurred
0,2020-01-08 22:30:00,Southwest,BATTERY - SIMPLE ASSAULT,36,F,B,SINGLE FAMILY DWELLING,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",1100 W 39TH PL,34.0141,-118.2978,2020,1,8
1,2020-01-01 03:30:00,Central,BATTERY - SIMPLE ASSAULT,25,M,H,SIDEWALK,UNKNOWN WEAPON/OTHER WEAPON,700 S HILL ST,34.0459,-118.2545,2020,1,1
2,2020-02-13 12:00:00,Central,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,0,X,X,POLICE FACILITY,,200 E 6TH ST,34.0448,-118.2474,2020,2,13


In [6]:
df.drop(columns=['date_occurred'], inplace=True)

# Baseline Model  - Logistic Regression

In [7]:
X = df[['victim_age','latitude','longitude','day_occurred','month_occurred','year_occurred']]
y = df['crime_description']

In [21]:
df.victim_age.value_counts()

victim_age
 0      211842
 30      19421
 35      19008
 31      18603
 29      18552
         ...  
 97         63
-1          60
-2          13
 120         1
-3           1
Name: count, Length: 103, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# DATA CLEANING PIPES
def isolate_age(X):
    return X.apply(lambda age: age if 1 <= age <= 99 else None)

def dropna(X):
    return X.dropna()

# Remove null values from Age
dropna_pipe = FunctionTransformer(dropna)

# select valid age range
age_range_pipe = FunctionTransformer(isolate_age)


In [37]:
# Preprocess numerical data
norm_scaler = MinMaxScaler()
preprocessor = ColumnTransformer(
    transformers=[
        #('dropna_pipe', dropna_pipe, ['victim_age']),
        #('age_range_pipe', age_range_pipe, ['victim_age']),
        ('num', norm_scaler, ['victim_age', 'latitude', 'longitude', 'day_occurred', 'month_occurred', 'year_occurred'])
    ])

In [38]:
# Logistic Regression model
lr_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [39]:
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
accuracy = lr_model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.1892666627586611


In [42]:
y_pred = lr_model.predict(X_test)

In [44]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [45]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                                                          precision    recall  f1-score   support

                                                   ARSON       0.00      0.00      0.00       473
            ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER       0.00      0.00      0.00       209
          ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT       0.00      0.00      0.00      9717
                                       ATTEMPTED ROBBERY       0.00      0.00      0.00       883
                                BATTERY - SIMPLE ASSAULT       0.11      0.56      0.18     13639
                                BATTERY ON A FIREFIGHTER       0.00      0.00      0.00        54
                                 BATTERY POLICE (SIMPLE)       0.00      0.00      0.00       457
                             BATTERY WITH SEXUAL CONTACT       0.00      0.00      0.00       727
BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM       0.00      0.00      0.00       

  _warn_prf(average, modifier, msg_start, len(result))
