# Seattle Crime

In [24]:
import pandas as pd
import numpy as np
import arff
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## Data Exploration and Data Preprocessing

In [2]:
# Parse the ARFF export, then build the frame from its 'data' rows, naming the
# columns after the first element of each 'attributes' entry.
with open("Seattle_Crime_Data_06-23-2019-4.arff") as f:
    data = arff.load(f)

column_names = [attribute[0] for attribute in data['attributes']]
df = pd.DataFrame(data['data'], columns=column_names)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Report_Number,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood
0,1975000000000.0,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK
1,1976000000000.0,1.0,2359.0,SEX OFFENSE-OTHER,SEXOFF-INDECENT LIBERTIES,UNKNOWN,,,UNKNOWN
2,1979000000000.0,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK
3,19810000000000.0,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP
4,1981000000000.0,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS


In [4]:

# Treat the literal placeholder 'UNKNOWN' as a missing value. The result is
# assigned back instead of using replace(..., inplace=True), so no DataFrame
# object is mutated behind the reader's back.
df = df.replace('UNKNOWN', np.nan)

# Count missing entries per column now that 'UNKNOWN' is counted as missing.
missing_values_with_unknown = df.isna().sum()

print("Missing Values with 'UNKNOWN' treated as NaN:")
print(missing_values_with_unknown)

Missing Values with 'UNKNOWN' treated as NaN:
Report_Number                     0
Occurred_Time                     2
Reported_Time                     2
Crime_Subcategory               262
Primary_Offense_Description       0
Precinct                       3352
Sector                         3346
Beat                           3298
Neighborhood                   3366
dtype: int64


In [5]:
# (rows, columns) of the frame.
df.shape

(523590, 9)

In [6]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

Report_Number                  float64
Occurred_Time                  float64
Reported_Time                  float64
Crime_Subcategory               object
Primary_Offense_Description     object
Precinct                        object
Sector                          object
Beat                            object
Neighborhood                    object
dtype: object

In [7]:
# Number of distinct values per column (nunique() skips NaN by default).
number_of_unique_values_in_each_column = df.nunique()

print("Number of Unique Values in Each Column:")
print(number_of_unique_values_in_each_column)



Number of Unique Values in Each Column:
Report_Number                  523590
Occurred_Time                    1440
Reported_Time                    1440
Crime_Subcategory                  30
Primary_Offense_Description       144
Precinct                            5
Sector                             23
Beat                               64
Neighborhood                       58
dtype: int64


In [8]:
# Same check as the earlier df.shape cell; no cell in between modifies df.
df.shape

(523590, 9)

In [9]:
# describe() with default arguments summarises only the numeric columns here.
# NOTE(review): Occurred_Time / Reported_Time look like HHMM clock readings
# (max 2359, 1440 distinct values), so their mean/std are not time averages — confirm.
numerical_stats = df.describe()

print("Descriptive Statistics for Numerical Features:")
numerical_stats


Descriptive Statistics for Numerical Features:


Unnamed: 0,Report_Number,Occurred_Time,Reported_Time
count,523590.0,523588.0,523588.0
mean,16346330000000.0,1358.650429,1353.362726
std,7421096000000.0,688.348689,589.368521
min,200800000.0,0.0,0.0
25%,20080000000000.0,900.0,950.0
50%,20120000000000.0,1500.0,1407.0
75%,20160000000000.0,1920.0,1817.0
max,20190000000000.0,2359.0,2359.0


### Missing Values

In [10]:
# Share of missing cells over the whole frame.
total_values = df.size
missing_values = df.isnull().sum().sum()
percentage_missing = (missing_values / total_values) * 100

print("Percentage of missing values in the DataFrame: {:.2f}%".format(percentage_missing))

# The same ratio for every column, taken from a single per-column null count.
missing_values_by_column = df.isnull().sum()
for column, missing_values_column in missing_values_by_column.items():
    total_values_column = len(df[column])
    percentage_missing_column = (missing_values_column / total_values_column) * 100

    print("Percentage of missing values in column '{}': {:.2f}%".format(column, percentage_missing_column))


Percentage of missing values in the DataFrame: 0.29%
Percentage of missing values in column 'Report_Number': 0.00%
Percentage of missing values in column 'Occurred_Time': 0.00%
Percentage of missing values in column 'Reported_Time': 0.00%
Percentage of missing values in column 'Crime_Subcategory': 0.05%
Percentage of missing values in column 'Primary_Offense_Description': 0.00%
Percentage of missing values in column 'Precinct': 0.64%
Percentage of missing values in column 'Sector': 0.64%
Percentage of missing values in column 'Beat': 0.63%
Percentage of missing values in column 'Neighborhood': 0.64%


In [11]:
# Rows in which at least two fields are missing.
# (The `missing_3_*` names are kept unchanged; the cut-off applied here is 2.)
mask = df.isna().sum(axis=1).ge(2)
missing_3_col = df.loc[mask]
missing_3_count = len(missing_3_col)
percentage_3_missing = (missing_3_count / len(df)) * 100  # relative to all rows in df
print('There are {} row which is {:.2f}% with missing 2 or more columns from the dataset'.format(missing_3_count, percentage_3_missing))
missing_3_col


There are 3355 row which is 0.64% with missing 2 or more columns from the dataset


Unnamed: 0,Report_Number,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood
1,1.976000e+12,1.0,2359.0,SEX OFFENSE-OTHER,SEXOFF-INDECENT LIBERTIES,,,,
9,1.999000e+13,,,THEFT-SHOPLIFT,THEFT-SHOPLIFT,,,,
19,2.004000e+13,0.0,1650.0,SEX OFFENSE-OTHER,SEXOFF-OTHER,,,,
60,2.007000e+13,1824.0,0.0,SEX OFFENSE-OTHER,SEXOFF-OTHER,,,,
86,2.007000e+13,430.0,2145.0,SEX OFFENSE-OTHER,SEXOFF-OTHER,,,,
...,...,...,...,...,...,...,...,...,...
522729,2.019000e+13,1200.0,845.0,THEFT-ALL OTHER,THEFT-OTH,,,,
522732,2.019000e+13,1310.0,2316.0,THEFT-ALL OTHER,THEFT-OTH,,,,
522916,2.019000e+12,1539.0,1539.0,TRESPASS,TRESPASS,,,,
523222,2.019000e+12,2300.0,1407.0,THEFT-ALL OTHER,THEFT-PKPOCKET,,,,


In [12]:
def _most_frequent(values):
    """Return the most common non-null entry of ``values``, or NaN if there is none."""
    counts = values.value_counts()  # NaN entries are excluded by default
    # An all-NaN group gives an empty result; indexing it with [0] would raise IndexError.
    return counts.index[0] if len(counts) else np.nan


def _fill_missing_from_mode(frame, target, source):
    """Fill NaNs in ``frame[target]`` in place with the most common ``target`` value
    seen among rows sharing the same ``source`` value."""
    lookup = frame.groupby(source)[target].agg(_most_frequent)
    missing = frame[target].isna()
    if missing.any():
        # Rows whose `source` is NaN, or whose group has no usable value, map to
        # NaN and therefore stay missing.
        frame.loc[missing, target] = frame.loc[missing, source].map(lookup).to_numpy()


def preprocess_dataset(df):
    """
    Clean the crime DataFrame: relabel two offense types, impute location
    fields from each other, then drop every row that still has a missing value.

    Steps, in order (each imputation sees the result of the previous one):
      1. Rows whose Primary_Offense_Description is 'BURGLARY-OTHER' /
         'ROBBERY-OTHER' get Crime_Subcategory 'OTHER BURGLARYS' /
         'OTHER ROBBERY' (applied whether or not the subcategory was missing).
      2. Missing Neighborhood <- most common Neighborhood of the row's Precinct.
      3. Missing Precinct     <- most common Precinct of the row's Neighborhood.
      4. Missing Beat         <- most common Beat of the row's Precinct.
      5. Any row with at least one remaining NaN is removed.

    Note: Sector is never imputed, so a row missing Sector is dropped in
    step 5 even if its other location fields were filled.

    Parameters:
    df (DataFrame): The input DataFrame to be preprocessed. It is not modified.

    Returns:
    DataFrame: Preprocessed copy with no missing values; the original index
    labels of the surviving rows are kept.
    """
    df_copy = df.copy()

    df_copy.loc[df_copy['Primary_Offense_Description'] == 'BURGLARY-OTHER', 'Crime_Subcategory'] = 'OTHER BURGLARYS'
    df_copy.loc[df_copy['Primary_Offense_Description'] == 'ROBBERY-OTHER', 'Crime_Subcategory'] = 'OTHER ROBBERY'

    # Order matters: Precinct is filled using the already-completed Neighborhood,
    # and Beat is filled using the already-completed Precinct.
    _fill_missing_from_mode(df_copy, 'Neighborhood', 'Precinct')
    _fill_missing_from_mode(df_copy, 'Precinct', 'Neighborhood')
    _fill_missing_from_mode(df_copy, 'Beat', 'Precinct')

    # Drop every row that still contains at least one missing value.
    return df_copy.dropna()




In [13]:
# Build the cleaned frame; `df` itself is left as loaded (the function works on a copy).
df_copy= preprocess_dataset(df)
# Per-column count of remaining missing values in the cleaned frame.
df_copy.isna().sum()

Report_Number                  0
Occurred_Time                  0
Reported_Time                  0
Crime_Subcategory              0
Primary_Offense_Description    0
Precinct                       0
Sector                         0
Beat                           0
Neighborhood                   0
dtype: int64

In [14]:

# Preview the cleaned frame; surviving rows keep their original index labels.
df_copy.head()

Unnamed: 0,Report_Number,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood
0,1975000000000.0,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK
2,1979000000000.0,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK
3,19810000000000.0,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP
4,1981000000000.0,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS
5,19880000000000.0,155.0,155.0,MOTOR VEHICLE THEFT,VEH-THEFT-AUTO,WEST,M,M2,SLU/CASCADE


In [15]:
# Shape after cleaning; compare with df.shape to see how many rows were dropped.
df_copy.shape


(520240, 9)

#### Using feature engineering to derive the severity of a crime

In [17]:
# List the distinct subcategories present in the cleaned frame.
df_copy['Crime_Subcategory'].unique()

array(['BURGLARY-RESIDENTIAL', 'CAR PROWL', 'HOMICIDE',
       'MOTOR VEHICLE THEFT', 'THEFT-ALL OTHER', 'DUI',
       'AGGRAVATED ASSAULT-DV', 'RAPE', 'SEX OFFENSE-OTHER',
       'AGGRAVATED ASSAULT', 'ROBBERY-STREET', 'BURGLARY-COMMERCIAL',
       'THEFT-BICYCLE', 'NARCOTIC', 'PROSTITUTION', 'ARSON',
       'PORNOGRAPHY', 'ROBBERY-COMMERCIAL', 'FAMILY OFFENSE-NONVIOLENT',
       'THEFT-SHOPLIFT', 'WEAPON', 'THEFT-BUILDING', 'TRESPASS',
       'LIQUOR LAW VIOLATION', 'ROBBERY-RESIDENTIAL',
       'DISORDERLY CONDUCT', 'OTHER BURGLARYS', 'OTHER ROBBERY', 'GAMBLE',
       'LOITERING', 'BURGLARY-COMMERCIAL-SECURE PARKING',
       'BURGLARY-RESIDENTIAL-SECURE PARKING'], dtype=object)

In [22]:
# Severity tier for each known Crime_Subcategory value. Built once when the
# cell runs, rather than rebuilding three lists on every call.
_SEVERITY_BY_CRIME = {
    **dict.fromkeys(
        ['CAR PROWL', 'THEFT-ALL OTHER', 'THEFT-BICYCLE', 'PROSTITUTION', 'PORNOGRAPHY',
         'TRESPASS', 'LIQUOR LAW VIOLATION', 'DISORDERLY CONDUCT', 'GAMBLE', 'LOITERING'],
        'low'),
    **dict.fromkeys(
        ['BURGLARY-RESIDENTIAL', 'MOTOR VEHICLE THEFT', 'DUI', 'AGGRAVATED ASSAULT-DV',
         'NARCOTIC', 'THEFT-SHOPLIFT', 'WEAPON', 'THEFT-BUILDING', 'ROBBERY-RESIDENTIAL',
         'BURGLARY-COMMERCIAL', 'OTHER BURGLARYS', 'BURGLARY-COMMERCIAL-SECURE PARKING',
         'BURGLARY-RESIDENTIAL-SECURE PARKING'],
        'medium'),
    **dict.fromkeys(
        ['HOMICIDE', 'RAPE', 'SEX OFFENSE-OTHER', 'AGGRAVATED ASSAULT', 'ROBBERY-STREET',
         'ARSON', 'ROBBERY-COMMERCIAL', 'FAMILY OFFENSE-NONVIOLENT', 'OTHER ROBBERY'],
        'high'),
}


def severity_category(Crime):
    """
    Map a crime subcategory to a severity tier.

    Parameters:
    Crime: A Crime_Subcategory value (must be hashable, e.g. a string or NaN).

    Returns:
    str: 'low', 'medium' or 'high'. Any value that is not listed in the
    mapping above (including NaN) is reported as 'high'.
    """
    # The 'high' default keeps unlisted values in the top tier.
    return _SEVERITY_BY_CRIME.get(Crime, 'high')
    
# Derive the Severity column by applying severity_category to each row's subcategory.
df_copy['Severity'] = df_copy['Crime_Subcategory'].apply(severity_category)

In [23]:
# Confirm the new Severity column sits alongside the original fields.
df_copy.head()

Unnamed: 0,Report_Number,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood,Severity
0,1975000000000.0,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK,medium
2,1979000000000.0,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK,low
3,19810000000000.0,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP,high
4,1981000000000.0,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS,medium
5,19880000000000.0,155.0,155.0,MOTOR VEHICLE THEFT,VEH-THEFT-AUTO,WEST,M,M2,SLU/CASCADE,medium


#### Label Encoding