In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import datetime
import numpy as np

In [3]:
df= pd.read_csv('../input/crimesdistrict/crime_with_coordinates.csv', low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,incident_id,case_number,incident_datetime,incident_type_primary,incident_description,clearance_type,address_1,address_2,city,...,parent_incident_type,Census Tract 1,Census Block 1,Census Block Group 1,Neighborhood 1,Police District 1,Council District 1,neighborhood,police_dt,council_dt
0,59596,55236827.0,11-0050386,1910-12-22 00:00:00,LARCENY/THEFT,Buffalo Police are investigating this report o...,,200 Block CRESTWOOD AV,,BUFFALO,...,Theft,,,,,,,North Park,District D,DELAWARE
1,121509,710086379.0,15-0760407,1914-12-01 22:00:00,LARCENY/THEFT,Buffalo Police are investigating this report o...,,100 Block LINCOLN PW,,BUFFALO,...,Theft,,,,,,,Elmwood Bidwell,District D,DELAWARE
2,204751,942663542.0,06-1840984,1951-07-03 19:51:00,UUV,Buffalo Police are investigating this report o...,,200 Block W FERRY ST,,BUFFALO,...,Theft of Vehicle,,,,,,,Upper West Side,District B,NIAGARA
3,146812,146621859.0,13-0730379,1951-12-05 02:20:21,ASSAULT,Buffalo Police are investigating this report o...,,200 Block CAMBRIDGE AV,,BUFFALO,...,Assault,,,,,,,Genesee-Moselle,District E,MASTEN
4,233717,942606470.0,08-3530528,1952-08-30 16:00:00,LARCENY/THEFT,Buffalo Police are investigating this report o...,,200 Block JEFFERSON AV,,BUFFALO,...,Theft,,,,,,,Broadway Fillmore,District C,ELLICOTT


In [4]:
df.shape

(267997, 30)

In [5]:
import datetime
df_police = df[['incident_datetime', 'police_dt']]

In [6]:
df_police.incident_datetime = pd.to_datetime(df_police.incident_datetime)
df_police = pd.DataFrame({'no_of_incidents' : df_police.groupby(
   [ df_police.police_dt, df_police.incident_datetime.dt.date] ).size()}).reset_index()

In [7]:
df_police.tail()

Unnamed: 0,police_dt,incident_datetime,no_of_incidents
27380,District E,2021-02-15,6
27381,District E,2021-02-16,3
27382,District E,2021-02-17,2
27383,District E,2021-02-18,3
27384,District E,2021-02-19,2


In [8]:
df_police.isna().sum()


police_dt            0
incident_datetime    0
no_of_incidents      0
dtype: int64

In [9]:
#Data Series Start Date	1/2009 (unreliable data before this date)
df_police['year'] = pd.DatetimeIndex(df_police['incident_datetime']).year
df_police= df_police[df_police.year>=2009]

df_police.head()

Unnamed: 0,police_dt,incident_datetime,no_of_incidents,year
995,District A,2009-01-01,17,2009
996,District A,2009-01-02,8,2009
997,District A,2009-01-03,8,2009
998,District A,2009-01-04,6,2009
999,District A,2009-01-05,9,2009


In [10]:
df_police['year'].unique()


array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
       2020, 2021])

In [11]:
df_police.nunique()

police_dt               5
incident_datetime    4433
no_of_incidents        34
year                   13
dtype: int64

#### create dummies variables district

In [12]:
# dummy_district = pd.get_dummies(df_police['police_dt'])

# dummy_district.columns = df_police['police_dt'].unique()

# df_police = pd.concat([df_police,dummy_district], axis=1)


In [13]:
df_police.drop(['incident_datetime'] ,axis= 1,  inplace=True)

In [14]:
df_police.head()

Unnamed: 0,police_dt,no_of_incidents,year
995,District A,17,2009
996,District A,8,2009
997,District A,8,2009
998,District A,6,2009
999,District A,9,2009


# Pipeline

In [15]:
X=  df_police.drop('no_of_incidents', axis = 1)
y = df_police.no_of_incidents

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y , test_size= 0.2, random_state= 42)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#SimpleImputer fill any missing values 
#Scaler numeric transformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#One hot encoder to transform categorial values into integers.

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [18]:
#Select les columns numeric 
#Select les columns categoric 
integer_features = list(X.columns[X.dtypes == 'int64'])
continuous_features = list(X.columns[X.dtypes == 'float64'])
categorical_features = list(X.columns[X.dtypes == 'object'])
numeric_features = integer_features + continuous_features 


from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Model selection

In [19]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBRegressor
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(), 
    LinearDiscriminantAnalysis(), 
    XGBRegressor()
    ]
pipes= []
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)  
    pipes.append(pipe) 
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))


KNeighborsClassifier(n_neighbors=3)
model score: 0.088
SVC(C=0.025, probability=True)
model score: 0.098
DecisionTreeClassifier()
model score: 0.107
RandomForestClassifier()
model score: 0.107
AdaBoostClassifier()
model score: 0.086
GradientBoostingClassifier()
model score: 0.107
LinearDiscriminantAnalysis()
model score: 0.099
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
model score: 0.376
