In [806]:
import pandas as pd
from datetime import datetime, timedelta
import datetime as dt
import holidays
import re

# Group cleaned_crime_df by unique locations, summing the number of crimes per type of crime

crime_df = pd.read_csv("cleaned_crime.csv")

# Group by location and count unique crimes
crime_counts = crime_df.groupby(['Location', 'Crime']).size().reset_index(name='Count')
location_coords = crime_df[['Location', 'Latitude', 'Longitude']].drop_duplicates()

crime_counts = crime_counts.merge(location_coords, on='Location', how='left')
crime_counts = crime_counts.dropna().reset_index()

crime_df


Unnamed: 0,File Number,Date of Report,Crime Date Time,Crime,Reporting Area,Neighborhood,Location,Latitude,Longitude,Year,Month
0,2009-01323,2009-02-21 09:53:00,02/21/2009 09:20 - 09:30,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",42.370427,-71.082645,2009,2
1,2009-01324,2009-02-21 09:59:00,02/20/2009 22:30 - 02/21/2009 10:00,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",42.393764,-71.139481,2009,2
2,2009-01327,2009-02-21 12:32:00,02/19/2009 21:00 - 02/21/2009 12:00,Hit and Run,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",42.393764,-71.139481,2009,2
3,2009-01331,2009-02-21 15:05:00,02/21/2009 15:00 - 15:10,Larceny (Misc),1303.0,Strawberry Hill,"0 NORUMBEGA ST, Cambridge, MA",42.375145,-71.153088,2009,2
4,2009-01346,2009-02-22 05:02:00,02/22/2009 05:02,OUI,105.0,East Cambridge,"FIFTH ST & GORE ST, Cambridge, MA",,,2009,2
...,...,...,...,...,...,...,...,...,...,...,...
92750,2024-03755,2024-05-07 13:13:00,05/04/2024 12:00 - 18:00,Larceny from MV,411.0,Area 4,"100 BISHOP ALLEN DR, Cambridge, MA",42.365860,-71.102518,2024,5
92751,2024-03756,2024-05-07 14:41:00,05/07/2024 14:40 - 14:41,Accident,611.0,Mid-Cambridge,"MASSACHUSETTS AVE & PEABODY ST, Cambridge, MA",42.374898,-71.118521,2024,5
92752,2024-03777,2024-05-07 20:13:00,05/07/2024 15:00 - 19:15,Larceny of Bicycle,411.0,Area 4,"0 COLUMBIA ST, Cambridge, MA",42.370400,-71.095941,2024,5
92753,2024-03806,2024-05-08 16:09:00,05/07/2024 04:00 - 04:05,Larceny from MV,1005.0,West Cambridge,"0 FOSTER PL, Cambridge, MA",42.375982,-71.129503,2024,5


In [807]:
# Getting time/month/year from 'Crime Date Time' using date time

def extract_date_time(crime_date_time):
    match = re.search(r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}', crime_date_time)
    if match: 
        return match.group(0)
    
    return None

def extract_time(crime_date_time):
    match = re.search(r'\d{2}:\d{2}', crime_date_time)
    if match:
        return match.group(0)
    return None

def round_hour(crime_date_time):
    time_str = extract_time(crime_date_time)

    time_obj = dt.datetime.strptime(time_str, '%H:%M')
# Round to the nearest hour
    total_seconds = time_obj.hour * 3600 + time_obj.minute * 60
    rounded_hours = round(total_seconds / 3600)
    rounded_hours = rounded_hours % 24
    
    # Create a new datetime object for the rounded hour
    rounded_time = dt.datetime(year=1000, month=1, day=1, hour=rounded_hours)
    
    # convert back to string
    return rounded_time.strftime('%H:%M')

data = '02/21/2009 22:50'	
rounded = round_hour(data)

rounded

# crime_df

'23:00'

In [808]:
crime_df['Crime Date Time'] = crime_df['Crime Date Time'].apply(extract_date_time)
crime_df = crime_df.dropna().reset_index()
crime_df['Crime Date Time'] = crime_df['Crime Date Time'].astype(str)
crime_df['Time Rounded'] = crime_df['Crime Date Time'].apply(round_hour)
crime_df['Exact Time'] = pd.to_datetime(crime_df['Crime Date Time']).dt.time


In [811]:
# Generate Month and Year
crime_df['Crime Date Time'] = pd.to_datetime(crime_df['Crime Date Time'])
crime_df['Date of Report'] = pd.to_datetime(crime_df['Date of Report'])
crime_df['Date'] = crime_df['Crime Date Time'].dt.date
crime_df['Year'] = crime_df['Crime Date Time'].dt.year
crime_df['Year Reported'] = crime_df['Date of Report'].dt.year
crime_df['Month'] = crime_df['Crime Date Time'].dt.month
crime_df['Day of Week'] = pd.to_datetime(crime_df['Date']).dt.day_name()




In [812]:
crime_df.head(10)

Unnamed: 0,index,File Number,Date of Report,Crime Date Time,Crime,Reporting Area,Neighborhood,Location,Latitude,Longitude,Year,Month,Time Rounded,Exact Time,Date,Date of Report\t,Year Reported,Day of Week
0,0,2009-01323,2009-02-21 09:53:00,2009-02-21 09:20:00,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",42.370427,-71.082645,2009,2,09:00,09:20:00,2009-02-21,2009-02-21 09:53:00,2009,Saturday
1,1,2009-01324,2009-02-21 09:59:00,2009-02-20 22:30:00,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",42.393764,-71.139481,2009,2,22:00,22:30:00,2009-02-20,2009-02-21 09:59:00,2009,Friday
2,2,2009-01327,2009-02-21 12:32:00,2009-02-19 21:00:00,Hit and Run,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",42.393764,-71.139481,2009,2,21:00,21:00:00,2009-02-19,2009-02-21 12:32:00,2009,Thursday
3,3,2009-01331,2009-02-21 15:05:00,2009-02-21 15:00:00,Larceny (Misc),1303.0,Strawberry Hill,"0 NORUMBEGA ST, Cambridge, MA",42.375145,-71.153088,2009,2,15:00,15:00:00,2009-02-21,2009-02-21 15:05:00,2009,Saturday
4,5,2009-01357,2009-02-22 21:39:00,2009-02-22 21:39:00,Aggravated Assault,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",42.393764,-71.139481,2009,2,22:00,21:39:00,2009-02-22,2009-02-22 21:39:00,2009,Sunday
5,6,2009-01363,2009-02-23 10:19:00,2009-02-20 20:00:00,Commercial Break,501.0,Cambridgeport,"600 Massachusetts Ave, Cambridge, MA",42.364916,-71.103308,2009,2,20:00,20:00:00,2009-02-20,2009-02-23 10:19:00,2009,Friday
6,7,2009-01365,2009-02-23 11:24:00,2009-02-23 11:00:00,Street Robbery,501.0,Cambridgeport,"600 Massachusetts Ave, Cambridge, MA",42.364916,-71.103308,2009,2,11:00,11:00:00,2009-02-23,2009-02-23 11:24:00,2009,Monday
7,8,2009-01385,2009-02-23 20:16:00,2009-02-23 20:16:00,Housebreak,1108.0,North Cambridge,"100 CLIFTON ST, Cambridge, MA",42.396425,-71.136653,2009,2,20:00,20:16:00,2009-02-23,2009-02-23 20:16:00,2009,Monday
8,9,2009-01391,2009-02-24 09:02:00,2009-02-22 19:30:00,Hit and Run,105.0,East Cambridge,"100 SEVENTH ST, Cambridge, MA",42.370239,-71.085774,2009,2,20:00,19:30:00,2009-02-22,2009-02-24 09:02:00,2009,Sunday
9,10,2009-01395,2009-02-24 12:30:00,2009-02-24 12:15:00,Shoplifting,501.0,Cambridgeport,"500 Massachusetts Ave, Cambridge, MA",42.364157,-71.10177,2009,2,12:00,12:15:00,2009-02-24,2009-02-24 12:30:00,2009,Tuesday


In [813]:
# Adding holidays

us_holidays = holidays.US()
def us_holiday_check(date):
    holiday_name = us_holidays.get(date)
    if holiday_name:
        return holiday_name
    else:
        return 'Average Nonholiday'
    
crime_df['Holiday'] = crime_df['Date'].apply(us_holiday_check)
crime_df
num_holiday_rows = crime_df['Holiday'].notna().sum()
print(num_holiday_rows)
print(crime_df.shape)

80396
(80396, 19)


In [814]:
# Combine observed holidays with actual holidays

holiday_mapping = {
    "New Year's Day (observed)": "New Year's Day",
    "Veterans Day (observed)": "Veterans Day",
    "Independence Day (observed)": "Independence Day",
    "Christmas Day (observed)": "Christmas Day",
    "Juneteenth National Independence Day": "Juneteenth",
    "Juneteenth National Independence Day (observed)": "Juneteenth"
}

crime_df['Holiday'] = crime_df['Holiday'].replace(holiday_mapping)

In [815]:
crime_df.to_csv('curated_crime.csv', index=False)

In [816]:
# Check for unique crime types - these will be the possible variables
unique_crime_types = crime_df['Crime'].unique()

print("Unique crime types:")
print(unique_crime_types)
unique_crime_types.shape

Unique crime types:
['Threats' 'Auto Theft' 'Hit and Run' 'Larceny (Misc)'
 'Aggravated Assault' 'Commercial Break' 'Street Robbery' 'Housebreak'
 'Shoplifting' 'Forgery' 'Simple Assault' 'Warrant Arrest' 'Disorderly'
 'Larceny from Building' 'Mal. Dest. Property' 'Trespassing'
 'Larceny from MV' 'Larceny from Person' 'Missing Person'
 'Larceny from Residence' 'Harassment' 'Liquor Possession/Sale'
 'Flim Flam' 'Phone Calls' 'Larceny of Bicycle' 'Drugs'
 'Indecent Exposure' 'OUI' 'Larceny of Plate' 'Annoying & Accosting'
 'Sex Offender Violation' 'Rec. Stol. Property' 'Commercial Robbery'
 'Kidnapping' 'Drinking in Public' 'Larceny of Services' 'Counterfeiting'
 'Peeping & Spying' 'Homicide' 'Extortion/Blackmail' 'Stalking'
 'Weapon Violations' 'Arson' 'Embezzlement' 'Prostitution'
 'Violation of R.O.' 'Violation of H.O.' 'Domestic Dispute' 'Gambling'
 'Accident' 'Noise Complaint' 'Suspicious Package' 'Taxi Violation']


(53,)

In [None]:
toggle_csv_crime_types = False # keep this false!!!!

# Larceny crimes

In [817]:
# Compiling larceny_df

larceny_crime_types = [
    'Larceny (Misc)', 'Larceny from Building', 'Larceny from MV',
    'Larceny from Person', 'Larceny from Residence', 'Larceny of Bicycle',
    'Larceny of Plate', 'Larceny of Services'
]

larceny_df = crime_df[crime_df['Crime'].isin(larceny_crime_types)]
if toggle_csv_crime_types is True:
    larceny_df.to_csv('Larceny_Crimes.csv', index=False)

# Fraud and Financial crimes

In [818]:
fraud_and_financial_crime_types = [
    'Forgery', 'Counterfeiting', 'Embezzlement', 'Flim Flam', 
    'Larceny of Services', 'Gambling'
]

fraud_and_financial_df = crime_df[crime_df['Crime'].isin(fraud_and_financial_crime_types)]
if toggle_csv_crime_types is True:
    fraud_and_financial_df.to_csv('Fraud_and_Financial_Crimes.csv', index=False)

# Violent Crimes

In [819]:
# Violent Crimes
violent_crime_types = [
    'Aggravated Assault', 'Simple Assault', 'Homicide', 'Kidnapping', 
    'Street Robbery', 'Commercial Robbery', 'Domestic Dispute', 'Weapon Violations'
]

# Property Crimes

In [820]:
property_crime_types = [
    'Commercial Break', 'Housebreak', 'Mal. Dest. Property', 'Arson', 
    'Rec. Stol. Property', 'Extortion/Blackmail', 'Burglary'
]

property_df = crime_df[crime_df['Crime'].isin(property_crime_types)]
if toggle_csv_crime_types is True:
    property_df.to_csv('Property_Crimes.csv', index=False)

# Traffic related crimes

In [821]:
crime_df = pd.read_csv('merged.csv')

traffic_crime_types = [
    'Auto Theft', 'Hit and Run', 'OUI', 'Larceny of Bicycle', 'Larceny of Plate', 
    'Accident', 'Taxi Violation'
]

traffic_df = crime_df[crime_df['Crime'].isin(traffic_crime_types)]
if toggle_csv_crime_types is True:
    traffic_df.to_csv('Traffic_Crimes.csv', index=False)

# Harassment Crimes

In [822]:
crime_df = pd.read_csv('merged.csv')

harassment_crime_types = [
    'Harassment', 'Indecent Exposure', 'Violation of R.O.', 
    'Violation of H.O.', 'Threats', 'Peeping & Spying', 
    'Sex Offender Violation', 'Stalking', 'Annoying & Accosting'
]

harassment_df = crime_df[crime_df['Crime'].isin(harassment_crime_types)]
if toggle_csv_crime_types is True:
    harassment_df.to_csv('Harassment_Crimes.csv', index=False)

# Public disorder

In [823]:
public_disorder_crime_types = [
    'Disorderly', 'Drinking in Public', 'Noise Complaint', 'Annoying & Accosting'
]

public_disorder_df = crime_df[crime_df['Crime'].isin(public_disorder_crime_types)]
if toggle_csv_crime_types is True:
    public_disorder_df.to_csv('Public_Disorder_Crimes.csv', index=False)

# Drinking/drugs related crimes

In [824]:
drugs_and_related_crime_types = [
    'Drugs', 'Liquor Possession/Sale', 'Drinking in Public', 'OUI'
]

drugs_and_related_df = crime_df[crime_df['Crime'].isin(drugs_and_related_crime_types)]
if toggle_csv_crime_types is True:
    drugs_and_related_df.to_csv('Drugs_and_Related_Crimes.csv', index=False)


# Prostitution related crimes

In [825]:
prostitution_related_crime_types = [
    'Prostitution', 'Sex Offender Violation', 'Indecent Exposure'
]

prostitution_related_df = crime_df[crime_df['Crime'].isin(prostitution_related_crime_types)]
if toggle_csv_crime_types is True:
    prostitution_related_df.to_csv('Prostitution_Related_Crimes.csv', index=False)

# Misc crimes

In [826]:

miscellaneous_crime_types = [
    'Phone Calls', 'Missing Person', 'Suspicious Package'
]

miscellaneous_df = crime_df[crime_df['Crime'].isin(miscellaneous_crime_types)]
if toggle_csv_crime_types is True:
    miscellaneous_df.to_csv('Miscellaneous_Crimes.csv', index=False)

# Warrant related crimes

In [827]:
warrant_related_crime_types = [
    'Warrant Arrest', 'Disorderly', 'Violation of R.O.', 
    'Violation of H.O.', 'Stalking', 'Domestic Dispute'
]

warrant_related_df = crime_df[crime_df['Crime'].isin(warrant_related_crime_types)]
if toggle_csv_crime_types is True:
    warrant_related_df.to_csv('Warrant_Related_Crimes.csv', index=False)