In [None]:
import numpy as np
import pandas as pd

import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd

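Download the file `Crime_Data_from_2010_to_2019.csv` from the link below and save it in this notebook's working directory (the next cell reads it by its bare filename):  
<https://data.lacity.org/Public-Safety/Crime-Data-from-2010-to-2019/63jg-8b9z>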

In [None]:
# Load the crime export into `df`. The 2020-to-present export is kept as a
# commented-out alternative input.
# filename = 'Crime_Data_from_2020_to_Present.csv'
filename = 'Crime_Data_from_2010_to_2019.csv'

df = pd.read_csv(filename)
print(df.shape)

## remove entries with missing lat/lon
# A row is only usable when BOTH coordinates are non-zero. The earlier `|`
# form dropped a row only when LAT and LON were both 0, so a record with just
# one zero coordinate was kept.
has_coordinates = (df['LAT'] != 0) & (df['LON'] != 0)
df = df[has_coordinates]

# The area column header may carry a trailing space ("AREA "); normalise it so
# later cells can always use "AREA". Rebinding instead of inplace=True avoids
# mutating a frame that was produced by the boolean filter above.
if 'AREA ' in df.columns:
    df = df.rename(columns={"AREA ": "AREA"})

df.columns

In [None]:
# df = df[:50000]



# ## truncate lat/lon coords
# lat_prec = 100
# df['LAT'] = np.trunc(lat_prec * df['LAT']) / lat_prec
# lon_prec = 100
# df['LON'] = np.trunc(lon_prec * df['LON']) / lon_prec

# print(df.shape)
# df.head()

In [None]:
# Primary crime codes that are actually present. Missing values are dropped
# first so the count below and the code list are computed over the same set
# (a missing code is not a crime type).
crime_codes = df['Crm Cd 1'].dropna()
print(f"There are {crime_codes.nunique()} unique crimes")

# df['Crm Cd 1'].value_counts().sort_index().cumsum().iloc[:10]

## lowest crime codes are more serious

# How many of the lowest-valued codes are treated as "serious".
n_serious_codes = 10

# Distinct codes in ascending order, truncated to the lowest n_serious_codes.
# (The per-code counts are not needed for this, only the code values.)
serious_crime_codes = crime_codes.drop_duplicates().sort_values().iloc[:n_serious_codes].values
print(serious_crime_codes)

In [None]:
# Binary label: 1 when the row's primary crime code is one of the codes in
# serious_crime_codes, otherwise 0.
is_serious = df['Crm Cd 1'].isin(serious_crime_codes)
df['serious'] = is_serious.astype(int)

# Spot-check a random handful of rows.
df.sample(10)

In [None]:
# Columns carried into the modelling frame. The variant that also includes
# 'DATE OCC' is kept for reference:
# columns = ['DATE OCC', 'TIME OCC', 'AREA', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Weapon Used Cd', 'LOCATION', 'LAT', 'LON', 'serious']
columns = [
    'TIME OCC', 'AREA', 'Vict Age', 'Vict Sex', 'Vict Descent',
    'Premis Cd', 'Weapon Used Cd', 'LOCATION', 'LAT', 'LON', 'serious',
]

# Select the columns and discard every row that has a missing value in any of
# them.
crimes = df[columns].dropna()

# Premise and weapon codes are turned into string labels; the intermediate
# int cast presumably strips a float ".0" suffix (verify the dtype read from
# the CSV).
for code_column in ('Premis Cd', 'Weapon Used Cd'):
    crimes[code_column] = crimes[code_column].astype(int).astype(str)

print(crimes.dtypes)
crimes.sample(10)

In [None]:
# One-hot encode each categorical feature. The (source column, dummy prefix)
# pairs are listed in the same order as the names they are unpacked into.
encodings = [
    ('AREA', 'Area'),
    ('Vict Sex', 'Sex'),
    ('Vict Descent', 'Descent'),
    ('Premis Cd', 'Premise'),
    ('Weapon Used Cd', 'Weapon'),
]
crimes_areas, crimes_sex, crimes_descent, crimes_premise, crimes_weapon = [
    pd.get_dummies(crimes[source_column], prefix=dummy_prefix)
    for source_column, dummy_prefix in encodings
]

# Report how many indicator columns each feature expanded into.
for encoded in (crimes_areas, crimes_sex, crimes_descent, crimes_premise, crimes_weapon):
    print(encoded.shape[1])

# LOCATION is currently not encoded:
# crimes_location = pd.get_dummies(crimes['LOCATION'], prefix='Location')
# print(crimes_location.shape[1])


In [None]:
# Assemble the modelling table side by side: the 'serious' label and the time
# of occurrence come first, followed by each block of indicator columns.
model_parts = [
    crimes[['serious', 'TIME OCC']],
    crimes_areas,
    crimes_sex,
    crimes_descent,
    crimes_premise,
    crimes_weapon,
]
crimes_onehot = pd.concat(model_parts, axis=1)

# The row index is deliberately left as-is (no reset): a later cell pulls
# LAT/LON back out of `crimes` by index alignment.
# crimes_onehot.reset_index(drop=True, inplace=True)

print(crimes_onehot.shape)
crimes_onehot.head(10)

In [None]:
from sklearn.model_selection import train_test_split

# Column 0 of crimes_onehot is the 'serious' label; every column after it is
# a feature.
X = crimes_onehot.iloc[:, 1:]
y = crimes_onehot.iloc[:, 0]
print(X.shape, y.shape)

# Hold out 30% of the rows for evaluation. The seed is fixed so that the same
# rows land in the test set on every Restart & Run All; without it the split,
# the reported accuracy and the exported file change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A small forest of 10 trees. random_state pins the bootstrap samples and the
# per-split feature draws, so refitting on the same data gives the same model
# and the same predictions.
clf = RandomForestClassifier(n_estimators=10, random_state=42)

#Train the model using the training sets
clf.fit(X_train, y_train)

# Predicted labels for the held-out rows, in the same row order as X_test.
y_pred = clf.predict(X_test)

# print(y_pred)

In [None]:
from sklearn import metrics

# Fraction of held-out rows whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
## create the dataframe with predictions

# Start from the held-out true labels (their index still identifies the source
# rows in `crimes`), then attach:
#   pred     - the classifier output, assigned positionally (y_pred was
#              produced from X_test, which shares y_test's row order)
#   lat/lon  - coordinates looked up in `crimes` by index alignment
dataset = pd.DataFrame({'label': y_test}).assign(
    pred=y_pred,
    lat=crimes['LAT'],
    lon=crimes['LON'],
)

# Class balance of the held-out labels, then a preview of the frame.
print(dataset['label'].value_counts())
dataset.head()


In [None]:
# Keep only the rows whose true label is 1, drop the now-constant label
# column, and renumber the rows from 0 before writing them out.
serious_mask = dataset['label'] == 1
dataset_serious = (
    dataset.loc[serious_mask]
    .drop(columns='label')
    .reset_index(drop=True)
)
print(dataset_serious.shape)

# The CSV is written one directory above the notebook; the fresh 0..n-1 index
# is included as the first column.
dataset_serious.to_csv('../Crime.csv')

In [None]:
# dataset_notserious = dataset[dataset['label']==0]
# dataset_notserious.drop('label', axis='columns', inplace=True)
# dataset_notserious.shape
# print(dataset_notserious.shape)
# dataset_notserious.reset_index(inplace=True, drop=True)
# dataset_notserious.to_csv('../Crime_notserious.csv')