In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

In [253]:
# Lift the display cap so rendered frames show every column.
pd.set_option("display.max_columns", None)

# Read the training data; the first CSV column becomes the row index.
train = pd.read_csv("train.csv", index_col=0)

## PreProcessing

In [260]:
def run_preprocess(df):
    """Run every preprocessing step on ``df`` and return the resulting frame.

    The steps run in a fixed order: copy the input, add weekday, hour-of-day,
    location-description and beat one-hot columns, cast "Arrest" and
    "Domestic" to integers, then drop the columns listed in ``drop_columns``.
    """
    frame = start_pipe(df)
    frame = create_weekday_column(frame)
    frame = create_time_bins(frame)
    frame = create_description_bins(frame)
    frame = create_beat_bins(frame)
    frame = change_cols_to_binary(frame, ["Arrest", "Domestic"])
    return drop_columns(frame)

In [222]:
def start_pipe(df):
    """Return a deep copy of ``df`` to use as the pipeline's working frame."""
    working_copy = df.copy(deep=True)
    return working_copy

In [236]:
def create_weekday_column(df):
    """Add a "Weekday" name column and one-hot weekday columns.

    The weekday is parsed from the part of "Date" before the first space,
    read as ``%m/%d/%Y``. "Weekday" is written onto the frame that was passed
    in; the returned frame is that frame joined with the dummy columns (the
    first category is dropped).
    """
    def _weekday_name(stamp):
        # Only the calendar date matters here; ignore the time-of-day part.
        date_part = stamp.split(" ")[0]
        return datetime.strptime(date_part, "%m/%d/%Y").strftime("%A")

    df["Weekday"] = df["Date"].map(_weekday_name)
    weekday_dummies = pd.get_dummies(df["Weekday"], drop_first=True)
    return df.join(weekday_dummies)

In [224]:
def create_time_bins(dfDt):
    """Add hour-of-day features derived from the "Date" column.

    "Date" values are split on single spaces: the second token supplies the
    hour (text before the first ":") and the third token is compared with
    "AM". Rows whose third token is anything other than "AM" are treated as PM.

    Parameters
    ----------
    dfDt : pandas.DataFrame
        Frame with a string "Date" column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with three helper columns -- "am_pm" (1 for AM, 0
        otherwise), "hour" (the hour token as text) and "24_h" (hour on a
        0-23 clock) -- plus one-hot columns of "24_h" prefixed with "_h"
        (first category dropped).
    """
    # The original body referenced a global `df` instead of the argument;
    # everything below works only on the frame that was passed in.
    tokens = dfDt["Date"].str.split(" ")
    am_pm = (tokens.str[2].str.strip(" ") == "AM").astype(int)
    hour = tokens.str[1].str.split(":").str[0]

    # 12-hour -> 24-hour: "12" wraps to 0 before the PM offset is added, so
    # 12 AM becomes 0 and 12 PM becomes 12 (not 12 and 24).
    hour_24 = hour.astype(int) % 12 + 12 * (1 - am_pm)

    df = dfDt.assign(**{"am_pm": am_pm, "hour": hour, "24_h": hour_24})
    return df.join(pd.get_dummies(df["24_h"], drop_first=True, prefix="_h"))

In [233]:
def create_description_bins(df, top_n=25):
    """Collapse rare location descriptions into "OTHER" and one-hot encode them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Location Description" column. It is not modified.
    top_n : int, default 25
        Number of most frequent descriptions (by ``value_counts``) to keep as
        their own category; every other value, including missing ones,
        becomes "OTHER".

    Returns
    -------
    pandas.DataFrame
        A new frame whose "Location Description" holds the binned values,
        joined with one-hot columns prefixed with "_type" (first category
        dropped).
    """
    location = df["Location Description"]
    frequent = location.value_counts().head(top_n).index

    # Bin using the frame that was passed in (not a global), and overwrite the
    # correctly spelled column so no stray text column is left behind.
    binned = location.where(location.isin(frequent), "OTHER")
    df = df.assign(**{"Location Description": binned})

    # Encode the binned values; encoding the raw column would ignore the bins.
    return df.join(pd.get_dummies(binned, drop_first=True, prefix="_type"))

In [229]:
def create_beat_bins(df):
    """Bucket "Beat" into 30 quantile bins and join their one-hot columns.

    Bins are labelled 1 through 30; the dummy columns are prefixed with
    "_beat" and the first category is dropped. Returns the joined frame.
    """
    bin_labels = range(1, 31)
    beat_bucket = pd.qcut(df["Beat"], 30, labels=bin_labels)
    beat_dummies = pd.get_dummies(beat_bucket, drop_first=True, prefix="_beat")
    return df.join(beat_dummies)

In [237]:
def change_cols_to_binary(df, cols):
    """Cast each column named in ``cols`` to ``int`` and return ``df``.

    The conversion is written back onto ``df`` itself, one column at a time,
    in the order given.
    """
    for name in cols:
        as_int = df[name].astype(int)
        df[name] = as_int
    return df

In [256]:
def drop_columns(df):
    """Remove the listed raw and helper columns from ``df`` and return it.

    The drop happens in place on the frame that was passed in. Every listed
    column must be present, otherwise ``DataFrame.drop`` raises ``KeyError``.
    """
    unwanted = [
        "ID",
        "am_pm",
        "hour",
        "24_h",
        "Beat",
        "Location Description",
        "Weekday",
        "Date",
        "Year",
        "Updated On",
        "Block",
        "District",
        "Ward",
        "Community Area",
        "X Coordinate",
        "Y Coordinate",
        "Latitude",
        "Longitude",
        "Case Number",
        "IUCR",
        "FBI Code",
        "Description",
        "Location",
    ]
    df.drop(columns=unwanted, inplace=True)
    return df

In [261]:
# Apply the preprocessing pipeline to the training frame; `train` itself is
# left as loaded because the pipeline starts by copying its input.
train_1 = run_preprocess(train)


## BASELINE


In [265]:
# Load the train / validation / test splits for the baseline.
# NOTE(review): this rebinds `train`, and unlike the earlier load it does not
# pass index_col=0, so the first CSV column is kept as a regular column here.
# Confirm that is intended before reusing `train` with run_preprocess.
train = pd.read_csv("train.csv")
val = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")