# Arrest Data

## By Zilin Wang

## Source Data Info

### Data Name
    Pittsburgh Police Arrest Data
    
### Data URL
    https://data.wprdc.org/dataset/arrest-data

### Source Organization
    City of Pittsburgh

## Data Overview

### Key Data
* Neighborhood: Neighborhood where the incident occurred.
* Incident Frequency of each neighborhood: The total number of arrests in each neighborhood.
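* Arrest Proportion: The number of arrests per 1,000 residents in each neighborhood (arrest count divided by the 2020 total population, times 1,000).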

# Data Analysis

In [50]:
import pandas as pd
from difflib import get_close_matches
import numpy as np  

# Display settings: show all rows/columns and prevent truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

# Manual mapping for the most common mismatches between the arrest data's
# neighborhood names and the population table's names. A value of None means
# "exclude this entry entirely" (it must NOT be fuzzy-matched afterwards).
# NOTE(review): every non-None target must exist in the population table's
# cleaned names, otherwise the left merge below yields a NaN population and
# the neighborhood is silently dropped by the population threshold. In
# particular 'spring hill-city' looks truncated -- confirm against the CSV.
manual_map = {
    'arlington': 'arlington - arlington heights (combined)',
    'arlington heights': 'arlington - arlington heights (combined)',
    'central business district': 'central business district (downtown)',
    'golden triangle/civic arena': 'central business district (downtown)',
    'mount oliver': 'mt. oliver',
    'mt. oliver boro': 'mt. oliver',
    'mt. oliver neighborhood': 'mt. oliver',
    'spring hill-city view': 'spring hill-city',
    'central north side': 'central northside',
    'troy hill-herrs island': 'troy hill',
    # Exclude entries outside the city/county/state
    'outside city': None,
    'outside county': None,
    'outside state': None
}


def fuzzy_match(x, candidates=None, cutoff=0.8):
    """Return the candidate name closest to ``x``, or None if none is close.

    Args:
        x: Cleaned (stripped, lower-cased) neighborhood name to look up.
        candidates: Names to match against. Defaults to the module-level
            ``pop_names`` list, which keeps the original one-argument call
            ``fuzzy_match(x)`` working once the pipeline below has run.
        cutoff: Minimum difflib similarity ratio (0-1) to accept a match.

    Returns:
        The single best match from ``candidates``, or None.
    """
    if candidates is None:
        candidates = pop_names
    matches = get_close_matches(x, candidates, n=1, cutoff=cutoff)
    return matches[0] if matches else None


def map_neighborhoods(names, candidates, overrides, cutoff=0.8):
    """Map raw neighborhood names onto the population table's names.

    Names listed in ``overrides`` take the override value; an override of
    None excludes the name outright. Every other name is fuzzy-matched
    against ``candidates``.

    Args:
        names: Series of cleaned neighborhood names (may contain NaN).
        candidates: List of valid target names.
        overrides: Dict of name -> target name, or name -> None to exclude.
        cutoff: Minimum similarity ratio passed on to ``fuzzy_match``.

    Returns:
        Series aligned with ``names`` holding the mapped name, or a missing
        value where the name is missing, excluded, or has no close match.
    """
    # Resolve each DISTINCT name once; difflib matching is far too slow to
    # repeat for every arrest row.
    lookup = {}
    for name in names.dropna().unique():
        if name in overrides:
            # Explicit decision (possibly None = exclude): never fuzzy-match.
            lookup[name] = overrides[name]
        else:
            lookup[name] = fuzzy_match(name, candidates, cutoff)
    return names.map(lookup)


def curved_scores(rates):
    """Convert arrest rates into a reversed, log-scaled 0-10 score.

    Takes log1p of each rate, applies reverse min-max normalization (lowest
    rate -> 10, highest rate -> 0) and rounds to two decimals.

    Args:
        rates: Series of arrests per 1,000 residents.

    Returns:
        Series of scores with the same index as ``rates``. When the rates do
        not span a positive range (empty input, a single neighborhood, or all
        rates equal) every non-missing entry scores 10.0 instead of the NaN
        that a 0/0 normalization would produce.
    """
    log_rates = np.log1p(rates)
    highest = log_rates.max()
    span = highest - log_rates.min()
    if not span > 0:  # also true when span is NaN (empty input)
        tied = pd.Series(10.0, index=rates.index, dtype=float)
        return tied.where(log_rates.notna())
    return ((highest - log_rates) / span * 10).round(2)


def rank_neighborhoods(frame):
    """Return ``frame`` ordered from lowest to highest arrest rate, with Rank.

    Sorting uses the unrounded ``Arrests_per_1000`` (equivalent to sorting the
    curved score descending, since the score is a decreasing function of the
    rate) so that neighborhoods whose scores tie after rounding are still
    ordered correctly; ``nb_manual`` breaks exact ties deterministically.

    Args:
        frame: DataFrame with ``Arrests_per_1000`` and ``nb_manual`` columns.

    Returns:
        A sorted copy with an added 1-based ``Rank`` column.
    """
    ordered = frame.sort_values(['Arrests_per_1000', 'nb_manual']).copy()
    ordered['Rank'] = range(1, len(ordered) + 1)
    return ordered


# The pipeline is guarded so the helpers above can be imported without the CSV
# files present. In a notebook cell or a script run, __name__ is '__main__', so
# it still executes and its variables remain module-level globals.
if __name__ == "__main__":
    # 1. Read in the two CSV files
    arrests = pd.read_csv('arrest-data.csv')
    pop = pd.read_csv('population_by_neighborhood.csv')

    # 2. Basic cleaning: strip leading/trailing whitespace and convert to lowercase
    arrests['nb'] = arrests['INCIDENTNEIGHBORHOOD'].str.strip().str.lower()
    pop['nb'] = pop['Neighborhood'].str.strip().str.lower()

    # 3-4. Apply the manual mapping, then fuzzy-match the remaining names
    pop_names = pop['nb'].dropna().unique().tolist()
    arrests['nb_manual'] = map_neighborhoods(arrests['nb'], pop_names, manual_map)

    # 5. Filter out records that remain unmapped or are the 'chateau' anomaly
    cleaned = arrests[
        arrests['nb_manual'].notna()
        & (arrests['nb_manual'] != 'chateau')
    ].copy()

    # 6. Aggregate counts, merge population, and compute arrests per 1,000 residents
    arrest_counts = (
        cleaned
        .groupby('nb_manual')
        .size()
        .rename('CrimeCount')
        .reset_index()
    )
    crime_rate = arrest_counts.merge(
        pop[['nb', '2020_Total_Population']],
        left_on='nb_manual', right_on='nb', how='left',
    )
    crime_rate['Arrests_per_1000'] = (
        crime_rate['CrimeCount']
        / crime_rate['2020_Total_Population']
        * 1000
    )

    # 7. Filter out neighborhoods with fewer than 500 residents (this also
    #    drops rows whose population is missing after the left merge)
    threshold = 500
    crime_rate = crime_rate[crime_rate['2020_Total_Population'] >= threshold].copy()

    # 8. Apply the "curved" transformation: log1p of the arrest rate, then
    #    reverse min-max normalization scaled to the 0-10 range
    crime_rate['LogRate'] = np.log1p(crime_rate['Arrests_per_1000'])
    crime_rate['CurvedScore_010'] = curved_scores(crime_rate['Arrests_per_1000'])

    # 9. Order from best (lowest rate / highest score) to worst and assign rank
    ranked = rank_neighborhoods(crime_rate)

    # 10. Print the full results
    print(ranked[[
        'Rank',
        'nb_manual',
        '2020_Total_Population',
        'Arrests_per_1000',
        'CurvedScore_010'
    ]].to_string(index=False))

 Rank                                nb_manual  2020_Total_Population  Arrests_per_1000  CurvedScore_010
    1                      squirrel hill north                  11879         24.412829            10.00
    2                           swisshelm park                   1339         32.113518             9.31
    3                            regent square                    971         38.105046             8.88
    4                            new homestead                    917         42.529989             8.60
    5                            north oakland                  10691         42.559162             8.60
    6                          stanton heights                   4427         50.372713             8.17
    7                      squirrel hill south                  15317         53.012992             8.04
    8                             point breeze                   5348         56.282723             7.89
    9                              morningside         