In [702]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import json

## TEST-VAL-TRAIN SPLIT


In [1146]:
# Read the three source CSVs (first column is the index) and stack them
# into one frame.
train, val, test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("final_train.csv", "validation.csv", "test.csv")
)
merged = pd.concat([train, val, test])


In [659]:
# Shuffle every row of `merged`; the fixed random_state makes the permutation repeatable.
merged = merged.sample(frac=1, random_state = 42)

In [678]:
# Positional split of `merged`: the first 70% of rows become the training
# set, one third of the remaining rows the validation set, the rest the test set.
TRAIN_FRAC = 0.7
VAL_SHARE_OF_REMAINDER = 1/3

n = merged.shape[0]
ti = int(TRAIN_FRAC*n)                       # first row index after the training slice
vi = int((n-ti)*VAL_SHARE_OF_REMAINDER+ti)   # first row index after the validation slice

train = merged.iloc[:ti]
val = merged.iloc[ti:vi]
test = merged.iloc[vi:]

# NOTE: "validtion_final.csv" (sic) is the exact name the later cells read
# back, so the spelling is intentionally left unchanged here.
train.to_csv("train_final.csv")
val.to_csv("validtion_final.csv")
test.to_csv("test_final.csv")

In [945]:
train = pd.read_csv("train_final.csv", index_col=0)
# Drop rows whose Ward is missing.
train = train[train["Ward"].notna()]
# Fill the remaining gaps in numeric columns with the column means of `train`
# itself. The previous version used `df.mean()`, but no `df` is defined at
# this point, so the cell failed on a fresh kernel.
train = train.fillna(train.mean(numeric_only=True))

## PreProcessing

In [1315]:
def run_preprocess(df):
    """Apply the preprocessing stages to ``df`` in order and return the result.

    Each stage is invoked through ``DataFrame.pipe``; the output of one stage
    is the input of the next. ``change_cols_to_binary`` additionally receives
    the list of columns to cast (``"Arrest"`` and ``"Domestic"``).
    """
    stages = (
        (start_pipe, ()),
        (fillna, ()),
        (extract_fetures_from_date, ()),
        (set_ward_dist, ()),
        (create_description_bins, ()),
        (change_cols_to_binary, (["Arrest", "Domestic"],)),
        (drop_columns, ()),
    )
    processed = df
    for stage, extra_args in stages:
        processed = processed.pipe(stage, *extra_args)
    return processed

In [1316]:
def start_pipe(df):
    """Return a copy of ``df`` so that later steps work on their own frame."""
    working_copy = df.copy()
    return working_copy

In [1317]:

from sklearn.cluster import KMeans
def create_corr_clusters(df, n_clusters=30, random_state=42):
    """Cluster rows by coordinates and append one-hot cluster indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'X Coordinate'`` and ``'Y Coordinate'``.
        As before, a ``'cor_clusters'`` column is written onto this frame.
    n_clusters : int, default 30
        Number of KMeans clusters (previously hard-coded to 30).
    random_state : int or None, default 42
        Seed passed to KMeans so repeated runs produce the same clusters.
        Pass ``None`` to restore the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with one ``clus_<label>`` indicator column per cluster
        label that occurs.
    """
    coords = df[['X Coordinate', 'Y Coordinate']]
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # fit_predict already returns the label of every row, so the separate
    # predict() pass over the same data that used to follow was redundant.
    df['cor_clusters'] = kmeans.fit_predict(coords)
    df = df.join(pd.get_dummies(df["cor_clusters"], prefix="clus"))
    return df

In [1318]:
def extract_fetures_from_date(df):
    """Derive time-of-day and weekday features from the ``Date`` column.

    Adds, in this order: ``hour``, ``minute``, ``weekday`` (Monday = 0),
    ``hourfloat`` (hour plus minute / 60), ``x`` / ``y`` (sine / cosine of the
    time of day mapped onto a 24-hour circle), then the indicator columns
    ``wd_1`` .. ``wd_6`` and ``h_1`` .. ``h_23``. ``Date`` itself is replaced
    by its parsed datetime values.

    The indicator columns are built from a fixed category set (weekdays 0-6,
    hours 0-23, first category dropped), so every frame gets the same columns
    regardless of which weekdays or hours happen to occur in it. Previously
    the set of columns depended on the data, so separately processed frames
    could end up with different feature sets.

    The input frame is not modified; a new frame is returned.

    (The misspelling in the function name is kept because ``run_preprocess``
    refers to it by this name.)
    """
    timestamps = pd.to_datetime(df["Date"])
    hour = timestamps.dt.hour
    minute = timestamps.dt.minute
    weekday = timestamps.dt.weekday
    hourfloat = hour + minute / 60

    df = df.assign(
        Date=timestamps,
        hour=hour,
        minute=minute,
        weekday=weekday,
        hourfloat=hourfloat,
        x=np.sin(2. * np.pi * hourfloat / 24.),
        y=np.cos(2. * np.pi * hourfloat / 24.),
    )

    # Fixed category sets keep the dummy columns stable across frames.
    weekday_cat = df["weekday"].astype(pd.CategoricalDtype(categories=list(range(7))))
    hour_cat = df["hour"].astype(pd.CategoricalDtype(categories=list(range(24))))
    df = df.join(pd.get_dummies(weekday_cat, drop_first=True, prefix="wd"))
    df = df.join(pd.get_dummies(hour_cat, drop_first=True, prefix="h"))
    return df

In [1319]:
def create_description_bins(df): # TODO: change to most correlated?
    """Collapse rare location descriptions to ``"OTHER"`` and one-hot encode.

    Values of ``"Location Description"`` that are not in the fixed list below
    (including missing values) are replaced by ``"OTHER"``. One indicator
    column ``_type_<value>`` is then added for every kept value plus
    ``"OTHER"``, in sorted order.

    The indicator columns come from a fixed category set, so every frame gets
    the same 27 columns even if some location never occurs in it. The
    replacement is done column-wise instead of with a row-wise ``apply``.

    The input frame is not modified; a new frame is returned.
    """
    value_list = ['APARTMENT',
                  'RESIDENCE',
                  'STREET',
                  'SIDEWALK',
                  'PARKING LOT / GARAGE (NON RESIDENTIAL)',
                  'SMALL RETAIL STORE',
                  'RESIDENCE - PORCH / HALLWAY',
                  'DEPARTMENT STORE',
                  'GROCERY FOOD STORE',
                  'OTHER (SPECIFY)',
                  'ALLEY',
                  'COMMERCIAL / BUSINESS OFFICE',
                  'RESTAURANT',
                  'CHURCH / SYNAGOGUE / PLACE OF WORSHIP',
                  'VEHICLE NON-COMMERCIAL',
                  'GAS STATION',
                  'RESIDENCE - YARD (FRONT / BACK)',
                  'RESIDENCE - GARAGE',
                  'HOTEL / MOTEL',
                  'DRUG STORE',
                  'CONVENIENCE STORE',
                  'CTA TRAIN',
                  'HOSPITAL BUILDING / GROUNDS',
                  'NURSING / RETIREMENT HOME',
                  'CHA APARTMENT',
                  'CTA BUS']
    column = "Location Description"
    # Sorted so the column order matches what get_dummies produces for a
    # plain string column in which every value occurs.
    categories = sorted(value_list + ["OTHER"])

    binned = df[column].where(df[column].isin(value_list), "OTHER")
    df = df.assign(**{column: binned})
    indicators = pd.get_dummies(
        binned.astype(pd.CategoricalDtype(categories=categories)),
        prefix="_type",
    )
    return df.join(indicators)

In [1320]:
def create_beat_bins(df):
    """Append indicator columns for 30 quantile bins of the ``Beat`` column.

    ``Beat`` is cut into 30 quantile bins labelled 1..30; the bins are one-hot
    encoded with prefix ``_beat`` (first bin dropped) and joined onto ``df``.
    """
    bin_labels = range(1, 31)
    beat_bin = pd.qcut(df['Beat'], 30, labels=bin_labels)
    indicators = pd.get_dummies(beat_bin, drop_first=True, prefix='_beat')
    return df.join(indicators)

In [1321]:
def change_cols_to_binary(df, cols):
    """Cast each column named in ``cols`` to ``int``, in place, and return ``df``."""
    for name in cols:
        as_int = df[name].astype(int)
        df[name] = as_int
    return df

In [1322]:
def set_ward_dist(df, dist_path='ward_dist.json'):
    """Add one column per crime type holding that type's share in the row's ward.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"Ward"`` column whose values convert to ``int``.
        The new columns are written onto this frame.
    dist_path : str, default 'ward_dist.json'
        JSON file mapping a ward id (as a string key) to a list of five
        values, read by position in the order of ``primary_type_lst`` below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns ``BATTERY``, ``THEFT``,
        ``ASSAULT``, ``CRIMINAL DAMAGE`` and ``DECEPTIVE PRACTICE`` added.

    Raises
    ------
    ValueError
        If a ward value is missing (NaN cannot be converted to ``int``).
    KeyError
        If a ward id is not present in the JSON file.
    """
    primary_type_lst = ['BATTERY', 'THEFT', 'ASSAULT', 'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE']
    # Context manager so the file handle is closed (it was left open before).
    with open(dist_path, "r") as fp:
        dest_dict = json.load(fp)
    # JSON object keys are strings, so 48.0 -> "48"; computed once for all types.
    ward_keys = df["Ward"].map(lambda ward: str(int(ward)))
    for position, crime_type in enumerate(primary_type_lst):
        df[crime_type] = ward_keys.map(lambda key: dest_dict[key][position])
    return df

In [1323]:
def drop_columns(df):
    """Remove the listed raw and helper columns from ``df`` in place and return it.

    Raises ``KeyError`` if any of the listed columns is absent from ``df``.
    """
    unwanted = [
        "ID", "hour", "minute", "Beat", "hourfloat", "weekday",
        "Location Description", "Date",
        "Year", "Updated On", "District",
        "Ward", "Community Area", "X Coordinate",
        "Y Coordinate", "Block",
        "Case Number", "IUCR", "FBI Code",
        "Description", "Location",
    ]
    df.drop(columns=unwanted, inplace=True)
    return df

In [1324]:
def fillna(df): # TODO: change to closest?
    """Fill missing values and return a new frame.

    Numeric columns are filled with their column mean; whatever is still
    missing afterwards (e.g. in text columns) is forward-filled from the
    previous row. A missing value in the very first row of a non-numeric
    column has nothing to copy from and stays missing.
    """
    # numeric_only=True: pandas >= 2 raises TypeError when asked for the mean
    # of a frame that contains non-numeric columns.
    df = df.fillna(df.mean(numeric_only=True))
    # .ffill() replaces the deprecated fillna(method="ffill") spelling.
    df = df.ffill()
    return df

In [1325]:

TYPE_TO_INDEX = {
    "BATTERY" : 0,
    "THEFT": 1,
    "CRIMINAL DAMAGE":2,
    "DECEPTIVE PRACTICE": 3,
    "ASSAULT": 4
}

INDEX_TO_TYPE = {
    0:"BATTERY",
    1:"THEFT",
    2:"CRIMINAL DAMAGE",
    3:"DECEPTIVE PRACTICE" ,
    4:"ASSAULT"
}

## BASELINE


In [1158]:
# Read the saved train / validation / test splits back from disk.
train, val, test = (
    pd.read_csv(split_path, index_col=0)
    for split_path in ("train_final.csv", "validtion_final.csv", "test_final.csv")
)

In [1280]:
# Read the saved splits, then run each one through the preprocessing pipeline.
train, val, test = [
    pd.read_csv(split_path, index_col=0)
    for split_path in ("train_final.csv", "validtion_final.csv", "test_final.csv")
]
train_, val_, test_ = [run_preprocess(split) for split in (train, val, test)]


In [1256]:
def split_x_y(df):
    """Split ``df`` into features and integer-encoded target.

    Returns ``(x, y)``: ``x`` is ``df`` without the ``"Primary Type"`` column,
    ``y`` is that column mapped through ``TYPE_TO_INDEX`` (a value missing
    from the mapping raises ``KeyError``).
    """
    target_col = "Primary Type"
    labels = df[target_col].apply(lambda crime_type: TYPE_TO_INDEX[crime_type])
    features = df.drop(columns=[target_col])
    return features, labels

In [1281]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


In [1282]:
# Features / target for each of the three preprocessed splits.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = [
    split_x_y(part) for part in (train_, val_, test_)
]

In [1326]:
train = pd.read_csv("train_final.csv", index_col=0)
val = pd.read_csv("validtion_final.csv", index_col=0)
test = pd.read_csv("test_final.csv", index_col=0)

# Recombine the three splits and preprocess the combined frame.
merged = pd.concat([train, val, test])
merged_ = run_preprocess(merged)
# Split the preprocessed frame. The previous version passed the raw `merged`
# here, so the resulting features still contained every unprocessed column.
x_train_final, y_train_final = split_x_y(merged_)

In [1328]:
# Run the preprocessing pipeline on the combined frame.
merged_ = run_preprocess(merged)


In [1333]:
# Separate the preprocessed frame into feature columns and the encoded target.
x_train_final_, y_train_final_ = split_x_y(merged_)

12617    3
43493    2
35904    1
29404    0
62229    0
Name: Primary Type, dtype: int64

In [1334]:

# Gradient-boosted trees fitted on the features / target split out of `merged_`.
clf = GradientBoostingClassifier(
    n_estimators=120,
    learning_rate=0.2,
    max_depth=6,
    min_samples_split=400,
    min_samples_leaf=50,
    max_features='sqrt',
    random_state=10,
)
clf = clf.fit(x_train_final_, y_train_final_)


In [1336]:
import pickle

In [1337]:
filename = "boosting_model.pkl"
# Serialize the fitted classifier to disk.
with open(filename, "wb") as model_file:
    pickle.dump(clf, model_file)

In [1340]:
# Read the pickled classifier back in. Unpickling can execute arbitrary code,
# so only load pickle files that come from a trusted source.
with open(filename, "rb") as model_file:
    model = pickle.load(model_file)

In [1339]:
# NOTE(review): this scores the reloaded model on the same features / target
# the classifier was fitted on, so the number is not a held-out estimate.
model.score(x_train_final_, y_train_final_)

0.5650771388499298

In [1305]:
# Read the saved splits and stack them back into one frame.
split_files = ("train_final.csv", "validtion_final.csv", "test_final.csv")
train, val, test = (pd.read_csv(split_file, index_col=0) for split_file in split_files)
merged = pd.concat([train, val, test])

In [1288]:
# Preview the first rows of the combined frame.
merged.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
12617,12281132,JE126440,01/26/2021 05:30:00 PM,052XX N WINTHROP AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,False,False,2023,20,48.0,77,11,1167925.0,1934985.0,2021,02/02/2021 03:48:32 PM,41.977143,-87.657836,"(41.977142995, -87.657835757)"
43493,12329533,JE186139,04/01/2021 11:30:00 AM,011XX N CENTRAL AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,True,1524,15,37.0,25,14,1138804.0,1907004.0,2021,04/08/2021 05:05:01 PM,41.90094,-87.765609,"(41.900940461, -87.76560933)"
35904,12321604,JE176756,03/10/2021 06:30:00 PM,030XX N DAMEN AVE,890,THEFT,FROM BUILDING,APARTMENT,False,False,1931,19,32.0,5,06,1162478.0,1920363.0,2021,03/24/2021 05:02:44 PM,41.937136,-87.678278,"(41.937135628, -87.67827782)"
29404,12306775,JE158718,03/04/2021 08:35:00 AM,009XX N ORLEANS ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,ALLEY,True,True,1823,18,27.0,8,08B,1173737.0,1906916.0,2021,03/11/2021 03:47:23 PM,41.899993,-87.637301,"(41.899992933, -87.637300682)"
62229,12364411,JE228799,02/03/2021 02:47:00 AM,053XX W ROSCOE ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,1634,16,30.0,15,08B,,,2021,05/14/2021 04:55:48 PM,,,


In [1289]:
# For every ward 1..50: the share of each crime type among that ward's rows.
# The order of this list must stay in sync with the list in set_ward_dist,
# which reads the stored values by position.
primary_type_lst = ['BATTERY', 'THEFT', 'ASSAULT', 'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE']

# NOTE(review): the shares are computed from `merged`, which is built from the
# train, validation and test files, and are derived from the target column
# itself. If they are used as model features this leaks label information;
# consider computing them from the training rows only.
dest_dict = {}

for ward in range(1, 51):
    ward_counts = merged.loc[merged['Ward'] == ward, 'Primary Type'].value_counts()
    # .get(..., 0): a ward with no rows of some type used to raise KeyError.
    per_type = [int(ward_counts.get(crime_type, 0)) for crime_type in primary_type_lst]
    total = sum(per_type)
    # A ward without any matching rows gets all-zero shares instead of a
    # division by zero.
    dest_dict[ward] = [count / total if total else 0.0 for count in per_type]

dest_dict

{1: [0.1875,
  0.38109756097560976,
  0.11280487804878049,
  0.15396341463414634,
  0.16463414634146342],
 2: [0.16666666666666666,
  0.4546783625730994,
  0.07163742690058479,
  0.10818713450292397,
  0.19883040935672514],
 3: [0.3184498736310025,
  0.22072451558550968,
  0.15669755686604886,
  0.16933445661331087,
  0.13479359730412804],
 4: [0.27393617021276595,
  0.32092198581560283,
  0.14184397163120568,
  0.12677304964539007,
  0.13652482269503546],
 5: [0.29486023444544635,
  0.24526600541027954,
  0.151487826871055,
  0.1740306582506763,
  0.13435527502254282],
 6: [0.36341161928306553,
  0.17676143386897405,
  0.18479604449938195,
  0.19901112484548825,
  0.07601977750309023],
 7: [0.3264490339773484,
  0.18321119253830778,
  0.17455029980013326,
  0.1938707528314457,
  0.12191872085276483],
 8: [0.33847231955150664,
  0.18149964961457604,
  0.18500350385423966,
  0.18990889978976874,
  0.1051156271899089],
 9: [0.2884902840059791,
  0.19357249626307924,
  0.16517189835575485

In [1291]:
# Persist the ward shares. JSON object keys are always strings, so the integer
# ward ids are written as "1", "2", ...
with open('ward_dist.json', 'w') as out_file:
    json.dump(dest_dict, out_file)

In [1292]:
# Read the file back to check the round trip; the context manager closes the
# handle, which the bare open() call left open.
with open('ward_dist.json', "r") as fp:
    b = json.load(fp)