<a href="https://www.kaggle.com/code/fotimakhongulomova/policing-equity?scriptVersionId=144629954" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Notebook Imports

In [None]:
!pip install prince

In [None]:
import os
import pandas as pd
import numpy as np
from datetime import date
import seaborn as sns
from scipy import stats
from umap import UMAP

import matplotlib.pyplot as plt
import warnings
from prince import MCA

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.manifold import TSNE

warnings.filterwarnings('ignore')

# Constants

In [None]:
# Location of the 2016 incidents CSV for department 49-00035 under the Kaggle input directory.
FILE = '/kaggle/input/data-science-for-good/Dept_49-00035/49-00035_Incidents_2016.csv'

# Load Data

In [None]:
# Read the raw CSV in one go (low_memory=False) and keep the untouched result in `data`.
data = pd.read_csv(FILE, low_memory=False)

# Working frame: same content without the first data row, renumbered from zero.
df = pd.DataFrame(data).drop(index=0).reset_index(drop=True)

df.head()

# Step 1: Data Exploration and Preprocessing

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Dimensions, column dtypes and per-column missing-value counts, printed in that order.
overview = (
    f"\nThe shape of the data: {df.shape}\n",
    f"The types of the data: \n{df.dtypes}\n",
    f"The empty row in the data: \n{df.isnull().sum()}",
)
for line in overview:
    print(line)

In [None]:
# Fill missing incident timestamps with the placeholder 0 and assign the result back:
# replace(..., inplace=True) on df[...] is chained assignment and leaves df untouched
# when pandas Copy-on-Write is enabled.
# NOTE(review): the 0 placeholder is presumably parsed as the Unix epoch (1970) by the
# pd.to_datetime call in the next cell — confirm that this is the intended marker.
df['INCIDENT_DATE'] = df['INCIDENT_DATE'].replace(to_replace=np.nan, value=0)

In [None]:
# Extracting dates

# Parse once, then derive every calendar feature from the parsed timestamps.
incident_ts = pd.to_datetime(df['INCIDENT_DATE'], format="mixed")

date_part_formats = (
    ("INCIDENT_YEAR", '%Y'),
    ("INCIDENT_MONTH", '%m'),
    ("INCIDENT_WEEKDAY", '%w'),
    ("INCIDENT_TIME", "%H:%M"),
)
for column, fmt in date_part_formats:
    df[column] = incident_ts.dt.strftime(fmt)

# INCIDENT_DATE itself ends up holding only the zero-padded day of the month.
df['INCIDENT_DATE'] = incident_ts.dt.strftime('%d')

In [None]:
# Last 20 rows, including the date columns derived in the previous cell.
df.tail(20)

### Data Cleaning

In [None]:
# How many rows are exact duplicates of an earlier row (True) versus not (False).
df.duplicated().value_counts()

In [None]:
# Keep the first occurrence of every fully duplicated row; the index is left as is.
df = df.drop_duplicates()

In [None]:
# Convert the derived calendar columns to integers, filling gaps with the rounded column mean.
for feature in ['INCIDENT_DATE', "INCIDENT_YEAR", "INCIDENT_MONTH", 'INCIDENT_WEEKDAY']:
    # Numeric strings become numbers; missing entries stay NaN for now.
    numeric = pd.to_numeric(df[feature])
    # The mean is taken BEFORE filling, so missing entries cannot drag it towards zero
    # (filling with 0 first made the later mean-based fill a no-op).
    value = 0 if numeric.isna().all() else int(round(numeric.mean()))
    # Assign back instead of a chained inplace replace, which does not reach df
    # when pandas Copy-on-Write is enabled.
    df[feature] = numeric.fillna(value).astype(int)

In [None]:
df['INCIDENT_YEAR'] = df['INCIDENT_YEAR'].astype(int)

# Years outside 2015-2018 are treated as invalid and replaced by the rounded mean of the
# valid years. Both bounds must hold at once: joining them with "or" is true for every
# year, which let the invalid values distort the mean.
valid_year = df['INCIDENT_YEAR'].between(2015, 2018)
if valid_year.any() and not valid_year.all():
    value = int(round(df.loc[valid_year, 'INCIDENT_YEAR'].mean()))
    df.loc[~valid_year, 'INCIDENT_YEAR'] = value

In [None]:
# Mark every remaining missing value as "UNKNOWN".
# The result is assigned back column by column: replace(..., inplace=True) on df[feature]
# is chained assignment and does not reach df when pandas Copy-on-Write is enabled.
for feature in df.columns:
    df[feature] = df[feature].replace(to_replace=np.nan, value="UNKNOWN")

In [None]:
# Distinct values currently present in INCIDENT_TIME.
responses = df['INCIDENT_TIME'].unique().tolist()

# Full-hour label -> "HH:" prefix for each of the 24 hours ('00:00' -> '00:', ..., '23:00' -> '23:').
times = {f"{hour:02d}:00": f"{hour:02d}:" for hour in range(24)}

In [None]:
def _floor_to_hour(time_value):
    """Return the full-hour label whose "HH:" prefix occurs in time_value.

    Values that contain none of the prefixes in `times` are returned unchanged.
    """
    for hour_label, hour_prefix in times.items():
        if hour_prefix in time_value:
            return hour_label
    return time_value


# One pass over the column instead of one full-column replace per distinct value.
# The result is assigned back because a chained inplace replace on df[...] does not
# reach df when pandas Copy-on-Write is enabled.
df['INCIDENT_TIME'] = df['INCIDENT_TIME'].map(_floor_to_hour)

In [None]:
responses = df['INCIDENT_TIME'].unique().tolist()

# Part of day -> the full-hour labels it covers.
categories = {
    'MORNING': ['05:00', '06:00', '07:00', '08:00', '09:00', '10:00', '11:00'],
    'AFTERNOON': ['12:00', '13:00','14:00', '15:00', '16:00', '17:00'],
    'EVENING': ['18:00', '19:00', '20:00', '21:00', '22:00'],
    'NIGHT': ['23:00','00:00', '01:00', '02:00', '03:00', '04:00'],
}


def _part_of_day(time_value):
    """Return the first part of day with an hour label occurring in time_value.

    Values matching no hour label are returned unchanged.
    """
    for part, hours in categories.items():
        if any(hour in time_value for hour in hours):
            return part
    return time_value


# Assigned back in a single pass: a chained inplace replace on df[...] does not reach df
# when pandas Copy-on-Write is enabled.
df['INCIDENT_TIME'] = df['INCIDENT_TIME'].map(_part_of_day)

In [None]:
responses = df['INCIDENT_REASON'].unique().tolist()

# Reason group -> keyword fragments. Groups are tried in this insertion order and the
# first group with a fragment occurring in the raw reason wins, so order matters
# (e.g. 'INJURY' under DOMESTIC VIOLENCE is checked before 'INJ' under ASSAULT).
category = {
    'STOLEN VEHICLE': ['CARJACKING', 'STOLEN VEHICLE', 'VEH/VES/ETC', 'VEHICLE'],
    'DOMESTIC VIOLENCE': ['INJURY', 'SPOUSE', 'COHABITANT', 'SP/COHAB'],
    'ASSAULT': ['ASSAULT', 'ADW', 'INJ', 'BATTERY'],
    'PROSTITUTION': ['PROSTITUTION'],
    'DISORDERLY CONDUCT': ['DISORDERLY', 'CONDUCT', 'DISTURB', 'PEACE', 'PUBLIC', 'DISRUPT'],
    'WEAPON': ['WEAPON', 'WPN', 'WPN:ILUSE'],
    'DUI': ['DUI'],
    'THREATS': ['THREAT', 'THRET'],
    'NARCOTICS': ['DRUGS', 'NARCOTIC/CONTROLLED', 'MARIJUANA/HASHISH', 'SUBSTANCE'],
    'VANDALISM': ['VANDALISM'],
    'ROBBERY': ['ROBBERY', 'STEAL'],
    'BURGLARY': ['BURGLARY'],
    'THEFT': ['THEFT', 'SHOPLIFTING-COMMERCIAL', 'SHOPLIFT', 'THFT'],
    'SHOOTING': ['SHOOT'],
    'POSSESSION': ['POSSESS'],
    'MURDER': ['MURDER', 'HARM/DEATH:ELDER/DEP', 'ADLT:HARM/DEATH'],
    'FIREARM': ['FIREARM'],
    'CRUELTY': ['CRUELTY'],
    'ARSON': ['ARSON:INHABITED', 'ARSON:', 'ARSON', 'FIRE'],
    "FRAUD": ['CREDIT', "USE ANOTHER'S PERSONAL ID"],
    'LOST PROPERTY': ['LOST', 'PROPERTY'],
}


def _reason_group(reason):
    """Return the first group in `category` with a keyword occurring in reason.

    Reasons that match no keyword are returned unchanged.
    """
    for group, keywords in category.items():
        if any(keyword in reason for keyword in keywords):
            return group
    return reason


# Assigned back in a single pass: a chained inplace replace on df[...] does not reach df
# when pandas Copy-on-Write is enabled, and it rescanned the column per distinct value.
df['INCIDENT_REASON'] = df['INCIDENT_REASON'].map(_reason_group)

In [None]:
# Reasons occurring at most twice are merged into a single 'OTHER' bucket.
values = df['INCIDENT_REASON'].value_counts()
rare_reasons = values.index[values <= 2]
df.loc[df['INCIDENT_REASON'].isin(rare_reasons), 'INCIDENT_REASON'] = 'OTHER'

In [None]:
# Row count per reason group after grouping and the 'OTHER' merge.
df['INCIDENT_REASON'].value_counts()

In [None]:
# Distinct values currently present in CRIME_TYPE.
responses = df['CRIME_TYPE'].unique().tolist()

# Target label -> keyword fragments. The matching loop in the next cell assigns the first
# label (in this insertion order) that has a fragment occurring as a substring of the value.
category = {
    'BURGLARY': ['BURG'],
    'FRAUD': ['FORGERY'],
    'THEFT': ['THEFT'],
    'STOLEN VEHICLE': ['VEHICLE', 'CAR', 'RECOVERED', 'STOLEN'],
    'ASSAULT': ['ASSAULT']
}

In [None]:
def _crime_group(crime_type):
    """Return the first label in `category` with a keyword occurring in crime_type.

    Values that match no keyword are returned unchanged.
    """
    for group, keywords in category.items():
        if any(keyword in crime_type for keyword in keywords):
            return group
    return crime_type


# Assigned back in a single pass: a chained inplace replace on df[...] does not reach df
# when pandas Copy-on-Write is enabled.
df['CRIME_TYPE'] = df['CRIME_TYPE'].map(_crime_group)

In [None]:
# Crime types occurring at most twice are merged into a single 'OTHER' bucket.
values = df['CRIME_TYPE'].value_counts()
rare_crime_types = values.index[values <= 2]
df.loc[df['CRIME_TYPE'].isin(rare_crime_types), 'CRIME_TYPE'] = 'OTHER'

In [None]:
# Row count per crime-type label after grouping and the 'OTHER' merge.
df['CRIME_TYPE'].value_counts()

In [None]:
# Per-column count of missing values, re-checked after the cleaning cells above.
print(f"The empty row in the data: \n{df.isnull().sum()}")

In [None]:
# (rows, columns) of the frame at this point of the cleaning.
df.shape

In [None]:
# Store the year as text, then show its distribution. The count is the cell's last
# expression so that the notebook actually displays it; placed before the cast its
# result was computed and thrown away.
df['INCIDENT_YEAR'] = df['INCIDENT_YEAR'].astype(str)
df['INCIDENT_YEAR'].value_counts()

### Data Reduction

In [None]:
# Remove the per-incident identifier and the street-address column from the working frame.
df = df.drop(columns=['INCIDENT_UNIQUE_IDENTIFIER', 'LOCATION_FULL_STREET_ADDRESS_OR_INTERSECTION'])

In [None]:
# Rows at positions 20-39 of the reduced frame.
df[20:40]

In [None]:
# Last 20 rows of the reduced frame.
df.tail(20)

### Data Encoding

In [None]:
# Column dtypes right before label encoding.
df.dtypes

In [None]:
# Label encoding
# The single encoder is re-fitted on each column in turn, so the integer codes are
# only meaningful within their own column.
le = LabelEncoder()
label_df = pd.DataFrame(
    {col: le.fit_transform(df[col]) for col in df.columns},
    index=df.index,
)

label_df.head()

In [None]:
# Spearman rank correlation between the label-encoded columns.
cor_mat = label_df.corr(method='spearman')

plt.figure(figsize=(15, 7))
sns.set(font_scale=0.6)
# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(cor_mat, dtype=bool))
ax = sns.heatmap(cor_mat, annot=True, fmt=".2g", vmin=-1, vmax=1,
                annot_kws={'size': 'medium'}, linewidths=0.8, mask=mask)
plt.show()

In [None]:
# CRIME_TYPE is removed from the working frame (label_df, created earlier, keeps it).
df = df.drop(columns=['CRIME_TYPE'])

# Step 2: Data Visualization

### Number of offenses distributed by YEAR and MONTH

In [None]:
# Pie Chart

# Slice sizes and labels: the (up to) two most frequent incident years, counted once.
year_counts = df['INCIDENT_YEAR'].value_counts().iloc[:2]
data = year_counts.tolist()
labels = year_counts.index

# One colour and one explode offset per slice, so the pie also renders when the
# data contains a single year (plt.pie rejects an explode of the wrong length).
colors = ['#A0D568', '#FFCE54'][:len(data)]
explode = (0.02,) * len(data)

# Creating autopct arguments
def func(pct, allvalues):
    """Build a pie-slice label: the percentage, a line break, then the absolute count.

    Parameters
    ----------
    pct : float
        Share of the slice in percent, as passed by matplotlib's autopct hook.
    allvalues : sequence of numbers
        All slice sizes; their sum is the total the percentage refers to.

    Returns
    -------
    str
        The percentage with one decimal, followed on the next line by the count in parentheses.
    """
    # Round instead of truncating: the percentage is a float, so pct/100*total can land
    # just below the true integer count and truncation would then report one too few.
    absolute = int(round(pct / 100. * np.sum(allvalues)))
    return "{:.1f}%\n({:d})".format(pct, absolute)

# Creating plot
plt.figure(figsize=(18, 8))
plt.rcParams.update({'font.size': 12})
plt.subplot(1, 2, 1)
plt.pie(data, labels=labels, explode=explode, colors=colors,
        autopct=lambda pct: func(pct, data), startangle=90, textprops=dict(color="#3F1D38"))

# Adding legend
plt.legend(labels, fontsize=12, title="Years", loc='best')

plt.title("Number of offenses distributed by YEAR")

# Bar chart
color = ['#B5F1CC']
plt.subplot(1, 2, 2)
order = df['INCIDENT_MONTH'].value_counts().sort_index().index
# A single colour goes through `color`; `palette` without `hue` is deprecated in newer
# seaborn releases.
ax = sns.countplot(x='INCIDENT_MONTH', data=df, order=order, color=color[0])
# Label every bar with its count, however many bar containers seaborn created.
for container in ax.containers:
    ax.bar_label(container, padding=3)

plt.title('Number of offenses distributed by MONTH')
plt.xlabel('Months')

plt.tight_layout()
# plt.savefig('year_month.png')
plt.show()

### Number of offenses distributed by DATES, WEEKDAYS and TIME

In [None]:
# Day-of-month counts, drawn in the left slot of a 1x2 grid on a wide canvas.
plt.figure(figsize=(40, 5))
plt.subplot(1, 2, 1)

# Bar chart for dates
sns.set(font_scale=1)
order = df['INCIDENT_DATE'].value_counts().sort_index().index
ax = sns.countplot(x='INCIDENT_DATE', data=df, order=order, color='#7C9D96')

# Write each bar's integer height slightly above its top edge.
for bar in ax.containers[0]:
    bar_top = (bar.get_x() + bar.get_width() / 2., bar.get_height())
    ax.annotate(str(int(bar.get_height())), bar_top,
                ha='center', va='center', xytext=(0, 9), textcoords='offset points')

plt.title('Number of Offences Distributed by Dates')
plt.xlabel('Dates')
# Font scale for whatever is plotted after this cell.
sns.set(font_scale=1.25)

# plt.savefig('dates.png')
plt.show()

In [None]:
def _annotate_counts(axes):
    """Write the integer height of every bar in the first bar container just above the bar."""
    for bar in axes.containers[0]:
        axes.annotate(format(int(bar.get_height())),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center', va='center', xytext=(0, 9), textcoords='offset points')


# Weekday counts (left panel) and time-of-day counts (right panel) side by side.
plt.figure(figsize=(25, 5))

# Bar chart for weekdays, in index order
plt.subplot(1, 2, 1)
order = df['INCIDENT_WEEKDAY'].value_counts().sort_index().index
ax = sns.countplot(x='INCIDENT_WEEKDAY', data=df, order=order, color='#9D76C1')
_annotate_counts(ax)

plt.title('Number of Offences Distributed by Weekday')
plt.xlabel('Weekday')

# Plot the count of offenses by time, most frequent first
plt.subplot(1, 2, 2)
order = df['INCIDENT_TIME'].value_counts().index
ax = sns.countplot(x='INCIDENT_TIME', data=df, color='#64CCC5', order=order)
_annotate_counts(ax)

plt.title('Number of Offenses Distributed by Time')
plt.xlabel('Time')
plt.ylabel('Count')

plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

# plt.savefig('weekday_time.png')
plt.show()

### Number of offenses distributed by INCIDENT_REASON

In [None]:
# Horizontal bar chart of incident counts per reason group, most frequent first.
plt.figure(figsize=(25, 10))
order = df['INCIDENT_REASON'].value_counts().index
ax = sns.countplot(y=df['INCIDENT_REASON'], order=order, color='#6499E9')

for container in ax.containers:
    ax.bar_label(container)

plt.title('Number of Offenses Distributed by Incident Reason')
plt.xlabel('Count')
# The y axis lists incident reasons, not location districts.
plt.ylabel('Incident Reason')

# plt.savefig('INCIDENT_REASON.png')
plt.show()

### Number of offenses distributed by Location District

In [None]:
# Bar chart: Location district
# One horizontal bar per district, listed in sorted index order, each labelled with its count.
plt.figure(figsize=(25, 25))
order = df['LOCATION_DISTRICT'].value_counts().sort_index().index
ax = sns.countplot(y=df['LOCATION_DISTRICT'], order=order, color='#FFB000')

for container in ax.containers:
    ax.bar_label(container)

plt.ylabel('Location District')

# plt.savefig('LOCATION_DISTRICT.png')
plt.show()

In [None]:
# Ensure INCIDENT_YEAR is stored as text (the same cast is already applied at the end of the cleaning step).
df['INCIDENT_YEAR'] = df['INCIDENT_YEAR'].astype(str)

# Step 3: Dimensionality Reduction

In [None]:
# Fit a multiple correspondence analysis with 130 components on the label-encoded frame.
mca = MCA(
    n_components=130,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    one_hot=True,
).fit(label_df)

In [None]:
# Display the fitted MCA's eigenvalues_summary attribute.
mca.eigenvalues_summary

In [None]:
# Row-wise sum of the MCA column contributions; keep the labels of the 90 largest sums.
contribution_totals = mca.column_contributions_.sum(axis=1)
features = contribution_totals.sort_values(ascending=False).iloc[:90].index

In [None]:
# Map the selected MCA labels back to the source columns whose name they contain.
# Walking df.columns yields each column once and in the frame's own order; building the
# list from a set gave an order that can change from one kernel start to the next.
selected_features = [
    column for column in df.columns
    if any(column in feature for feature in features)
]
selected_features

In [None]:
# Independent copy of the label-encoded frame restricted to the selected columns.
reduced_df = label_df.loc[:, selected_features].copy()

In [None]:
# First rows of the reduced, label-encoded feature frame.
reduced_df.head()

# Step 4: K-Means Clustering

### Choosing the Number of Clusters k

In [None]:
# create a k-Means model and an Elbow-Visualizer
# The seed makes the elbow curve repeatable between runs.
model = KMeans(random_state=0)
visualizer = KElbowVisualizer(model, k=(1, 10), timings=False)

# fit the visualizer and show the plot
# Fitted on the selected label-encoded columns directly, so re-running this cell after
# the CLUSTER column has been added to reduced_df does not feed that label back in.
visualizer.fit(label_df[selected_features])
visualizer.show()

### K-Means clustering

In [None]:
# clustering
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(label_df[selected_features])

# centroid coordinates (one row per cluster) and the cluster id assigned to each row
centers = kmeans.cluster_centers_
labels = kmeans.labels_

In [None]:
# creating dataframe for reduced data
# Each row's k-means assignment is attached as a text column.
reduced_df['CLUSTER'] = pd.Series(labels, index=reduced_df.index).astype(str)
reduced_df.head()

# Step 5: Cluster Analysis

In [None]:
# Analyzing cluster characteristics
def _describe_cluster(cluster_id):
    """Return the size of one cluster and its members' mean Euclidean distance to the centroid."""
    members = label_df[selected_features][labels == cluster_id]
    distances = np.linalg.norm(members - centers[cluster_id], axis=1)
    return {
        "Cluster": cluster_id,
        "Size": len(members),
        "Avg Distance": np.mean(distances),
    }


cluster_characteristics = [_describe_cluster(cluster_id) for cluster_id in range(n_clusters)]

cluster_characteristics

In [None]:
# Calculate mean and median for each cluster
rows_by_cluster = reduced_df.groupby('CLUSTER')
cluster_means = rows_by_cluster.mean()
cluster_medians = rows_by_cluster.median()

# Print the results
for heading, table in (("Cluster Means:", cluster_means), ("\nCluster Medians:", cluster_medians)):
    print(heading)
    print(table)

In [None]:
# Print the cluster centroids
# `centers` holds the fitted k-means cluster_centers_, one row per cluster.
print("Cluster Centroids:")
print(centers)

### Silhouette Score & Davies-Bouldin Index & Variance Ratio Criterion & Within-Cluster Sum of Squares (WCSS)

In [None]:
# Internal validation metrics of the fitted k-means partition, all computed on the
# selected label-encoded columns.
clustered_features = label_df[selected_features]
assignments = kmeans.labels_

silhouette_avg = silhouette_score(clustered_features, assignments)
db_index = davies_bouldin_score(clustered_features, assignments)
ch_score = calinski_harabasz_score(clustered_features, assignments)
wcss = kmeans.inertia_

for caption, score in (
    ('Silhouette Score', silhouette_avg),
    ('Davies-Bouldin Index:', db_index),
    ('Calinski-Harabasz Index (Variance Ratio Criterion):', ch_score),
    ('Within-Cluster Sum of Squares (WCSS):', wcss),
):
    print(caption, score)

# Step 6: Visualization of Clusters

In [None]:
# Silhouette score of k-means for k = 2..18.
# Only the clustering features are used: reduced_df also carries the CLUSTER label
# column, which must not be fed back into the clustering being evaluated.
scan_features = label_df[selected_features]
k_values = range(2, 19)

silhouette_scores = []
for n_cluster in k_values:
    # A separate, seeded model per k; the fitted 2-cluster `kmeans` stays untouched.
    candidate = KMeans(n_clusters=n_cluster, random_state=0).fit(scan_features)
    silhouette_scores.append(silhouette_score(scan_features, candidate.labels_, metric='euclidean'))

# Plot against the actual k values so the x axis matches its label.
plt.plot(k_values, silhouette_scores)
plt.ylabel('Silhouette score')
plt.xlabel('k')
plt.title("Silhouette score for K-means cell's behaviour")
sns.despine()

In [None]:
# Davies-Bouldin index of k-means for k = 2..18.
# Only the clustering features are used: reduced_df also carries the CLUSTER label
# column, which must not be fed back into the clustering being evaluated.
scan_features = label_df[selected_features]
k_values = range(2, 19)

db_index = []
for n_cluster in k_values:
    # A separate, seeded model per k; the fitted 2-cluster `kmeans` stays untouched.
    candidate = KMeans(n_clusters=n_cluster, random_state=0).fit(scan_features)
    db_index.append(davies_bouldin_score(scan_features, candidate.labels_))

# Plot against the actual k values so the x axis matches its label.
plt.plot(k_values, db_index)
plt.ylabel('Davies-Bouldin Index')
plt.xlabel('k')
plt.title("Davies-Bouldin Index for K-means cell's behaviour")
sns.despine()

In [None]:
# Visualization of kmeans with TSNE

# Embed only the clustering features: reduced_df also carries the CLUSTER label column,
# and including it would push the k-means assignment itself into the projection.
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(label_df[selected_features])

plt.figure(figsize=(8, 6))
for i in range(n_clusters):
    in_cluster = labels == i
    plt.scatter(projections[in_cluster, 0],
                projections[in_cluster, 1], label=f'Cluster {i + 1}')

plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('K-means Clustering with t-SNE')
plt.legend(loc='best')
plt.show()

In [None]:
# Visualization of kmeans with UMAP

# The embedding is two-dimensional because it is drawn as a 2-D scatter plot; that is
# independent of how many clusters k-means produced.
# Only the clustering features are embedded: reduced_df also carries the CLUSTER label.
umap = UMAP(n_components=2, random_state=0, init='random')
umap_projections = umap.fit_transform(label_df[selected_features])

plt.figure(figsize=(8, 6))
for i in range(n_clusters):
    in_cluster = labels == i
    plt.scatter(umap_projections[in_cluster, 0],
                umap_projections[in_cluster, 1], label=f'Cluster {i + 1}')

plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title('K-means Clustering with UMAP')
plt.legend(loc='best')
plt.show()

In [None]:
# Column dtypes of the reduced frame, including the text CLUSTER column added earlier.
reduced_df.dtypes

In [None]:
# Spearman rank correlation between the columns of the reduced frame.
cor_mat = reduced_df.corr(method='spearman')

plt.figure(figsize=(15, 7))
sns.set(font_scale=0.6)
# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(cor_mat, dtype=bool))
ax = sns.heatmap(cor_mat, annot=True, fmt=".2g", vmin=-1, vmax=1,
                annot_kws={'size': 'medium'}, linewidths=0.8, mask=mask)
plt.show()

In [None]:
# Pairwise scatter plots of the reduced features, coloured by cluster; lower triangle only.
sns.pairplot(reduced_df, hue="CLUSTER", corner=True)
plt.show()