In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.utils.np_utils import to_categorical

In [3]:
# Locations of the raw Kaggle crime CSVs (absolute paths on the author's machine).
TRAIN_CSV = '/Users/dominicdebiaso/Development/datasets/kaggle_crime_train.csv'
TEST_CSV = '/Users/dominicdebiaso/Development/datasets/kaggle_crime_test.csv'

# 'Dates' is parsed to datetime on load so the `.dt` accessor can be used later.
df_train = pd.read_csv(TRAIN_CSV, parse_dates=['Dates'])

# The test file's 'Id' column becomes the row index of df_test.
df_test = pd.read_csv(TEST_CSV, index_col='Id', parse_dates=['Dates'])

### Data Processing

In [4]:
def process_data(df):
    """Derive the model feature frame from a raw crime DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime ``Dates`` column plus ``Address``,
        ``DayOfWeek``, ``PdDistrict``, ``X`` and ``Y``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: hour, day, dayofyear, weekofyear, one indicator
        column per distinct month, one per distinct year, corner_crime,
        DayOfWeek_num, PdDistrict_num, X, Y.

    Notes
    -----
    * The derived columns are also written onto ``df`` itself, i.e. the
      caller's frame is modified in place.
    * The label codes and the month/year indicator columns are computed from
      the frame passed in, so two calls only produce matching encodings and
      columns when both frames contain the same set of values.
    """
    # Create additional variables from the 'Dates' field
    dates = df['Dates'].dt
    df['hour'] = dates.hour
    df['day'] = dates.day
    df['dayofyear'] = dates.dayofyear
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # Prefer the ISO-calendar accessor and fall back only on older pandas.
    if hasattr(dates, 'isocalendar'):
        iso_week = dates.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert it to a plain
        # numpy dtype (float only when missing dates force NaN values).
        df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['weekofyear'] = dates.weekofyear
    df['month'] = dates.month
    df['year'] = dates.year

    # If the 'Address' field does not contain "block" then the crime occurred
    # on a street corner (flag = 1); block addresses get 0.
    df['corner_crime'] = np.where(df['Address'].str.contains('Block', case=False), 0, 1)

    # Integer-encode the categorical columns into '<name>_num' columns.
    le = LabelEncoder()
    for col in ['DayOfWeek', 'PdDistrict']:
        df[col + '_num'] = le.fit_transform(df[col])

    return pd.concat(
        [
            df[['hour', 'day', 'dayofyear', 'weekofyear']],
            pd.get_dummies(df['month']),
            pd.get_dummies(df['year']),
            df[['corner_crime', 'DayOfWeek_num', 'PdDistrict_num', 'X', 'Y']],
        ],
        axis=1,
    )

## Build the feature frames for the training and test sets
# (process_data is applied to df_train first, then to df_test.)
train, test = map(process_data, (df_train, df_test))

In [9]:
## Scale data
# The month/year indicator columns are derived separately for each frame, so
# line the test columns up with the training columns before scaling: a column
# missing from the test frame is filled with 0 and any extra column is dropped.
test_aligned = test.reindex(columns=train.columns, fill_value=0)

# Hand the scaler plain float arrays. The feature frames mix string and integer
# column labels, which recent scikit-learn releases refuse to validate as
# feature names when given a DataFrame.
scaler = StandardScaler()
X_train = scaler.fit_transform(train.astype('float64').values)
X_test = scaler.transform(test_aligned.astype('float64').values)
print(X_train.shape)
print(X_test.shape)

## Convert target variable to numerical format and then dummy encode (ie. OHE)
# `le` is kept around afterwards: its `classes_` map the one-hot columns back
# to the category names.
le = LabelEncoder()
y_labels = le.fit_transform(df_train['Category'])
print(y_labels.shape)
y_train = to_categorical(y_labels)
print(y_train.shape)

(878049, 34)
(884262, 34)
(878049,)
(878049, 39)


### Neural Network Development

In [3]:
# Initialize model
# The input and output widths are read from the prepared arrays rather than
# hard-coded, so the network stays consistent with the data if the feature set
# or the number of target classes changes.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
model.add(Dense(34, input_dim=n_features, init='glorot_uniform', activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(17, init='glorot_uniform', activation='relu'))
# Batch normalization is applied here to the output of the ReLU layer above,
# i.e. after the activation, before dropout and the softmax output layer.
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(n_classes, init='glorot_uniform', activation='softmax'))

# Multi-class setup: softmax output paired with categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# verbose=0 silences the per-epoch progress output.
model.fit(X_train, y_train, batch_size=1000, nb_epoch=10, verbose=0)

In [14]:
# Predict values
SUBMISSION_CSV = '/Users/dominicdebiaso/Desktop/kaggle_san_fran_crime_neural_net.csv'

# One probability column per class, labelled with the target encoder's class
# names and indexed like the test set.
y_pred_proba = model.predict_proba(X_test)
df_nn = pd.DataFrame(data=y_pred_proba, columns=le.classes_, index=df_test.index)
df_nn.to_csv(SUBMISSION_CSV)

