In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

%matplotlib inline

In [3]:
# Load the Kaggle crime CSVs. 'Dates' is parsed into datetimes in both files;
# the test file's 'Id' column becomes its index.
# NOTE(review): these are absolute, machine-specific paths — consider a
# configurable data directory so the notebook runs on other machines.
df_train = pd.read_csv(
    '/Users/dominicdebiaso/Development/datasets/kaggle_crime_train.csv',
    parse_dates=['Dates'],
)
df_test = pd.read_csv(
    '/Users/dominicdebiaso/Development/datasets/kaggle_crime_test.csv',
    index_col='Id',
    parse_dates=['Dates'],
)

### Data Munging

In [4]:
# Preview the first rows of the training frame as a quick sanity check of the load.
df_train.head()

In [6]:
# Show each column's dtype (e.g. to confirm 'Dates' was parsed as a datetime).
df_train.dtypes

In [7]:
# Count missing values per column of the training frame.
df_train.isnull().sum()

In [8]:
def process_data(df):
    """Derive the model feature matrix from a raw crime-incident frame.

    Calendar features are extracted from ``Dates``, a street-corner flag is
    derived from ``Address``, and ``DayOfWeek`` / ``PdDistrict`` are
    label-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Dates`` (datetime64), ``Address``, ``DayOfWeek``,
        ``PdDistrict``, ``X`` and ``Y``. The derived columns (``hour``,
        ``day``, ``dayofyear``, ``weekofyear``, ``month``, ``year``,
        ``corner_crime``, ``DayOfWeek_num``, ``PdDistrict_num``) are also
        written onto ``df`` itself, i.e. the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with columns in this order: ``hour``, ``day``,
        ``dayofyear``, ``weekofyear``, one ``month_<m>`` indicator per month
        present, one ``year_<y>`` indicator per year present,
        ``corner_crime``, ``DayOfWeek_num``, ``PdDistrict_num``, ``X``, ``Y``.

    Notes
    -----
    The indicator columns and the label codes are derived from the values
    present in ``df`` alone, so two frames only get matching columns/codes
    when they contain the same months, years, weekdays and districts.
    """
    dates = df['Dates'].dt

    # Create additional variables from the 'Dates' field
    df['hour'] = dates.hour
    df['day'] = dates.day
    df['dayofyear'] = dates.dayofyear
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` yields the same ISO week number. The cast to int64
    # assumes 'Dates' has no missing values (the week column is nullable UInt32).
    if hasattr(dates, 'isocalendar'):
        df['weekofyear'] = dates.isocalendar().week.astype('int64')
    else:
        df['weekofyear'] = dates.weekofyear
    df['month'] = dates.month
    df['year'] = dates.year

    # If the 'Address' field does not contain "block" then the crime occurred
    # on a street corner
    df['corner_crime'] = np.where(df['Address'].str.contains('Block', case=False), 0, 1)

    # Integer codes follow the sorted order of the labels present in `df`.
    for col in ('DayOfWeek', 'PdDistrict'):
        df[col + '_num'] = LabelEncoder().fit_transform(df[col])

    # Prefix the indicator columns so every column label is a string: bare
    # integer labels (1..12, 2003..) mixed with string labels are rejected by
    # recent scikit-learn estimators when fitting on a DataFrame.
    return pd.concat(
        [
            df[['hour', 'day', 'dayofyear', 'weekofyear']],
            pd.get_dummies(df['month'], prefix='month'),
            pd.get_dummies(df['year'], prefix='year'),
            df[['corner_crime', 'DayOfWeek_num', 'PdDistrict_num', 'X', 'Y']],
        ],
        axis=1,
    )

In [9]:
# Create new dfs holding the engineered features for each data set
train = process_data(df_train)
test = process_data(df_test)

# The month/year indicator columns are derived from the values present in each
# file separately. Align the test matrix to the training columns (same set,
# same order; indicators absent from the test data are filled with 0) so the
# fitted models always see the layout they were trained on.
test = test.reindex(columns=train.columns, fill_value=0)

### Random Forest 

In [None]:
# Baseline model: a random forest with default hyper-parameters. The seed is
# fixed so that re-running the notebook reproduces the same forest and the
# same predicted probabilities.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train, df_train['Category'])

# One probability column per crime category, labelled in the fitted model's
# own class order, one row per test Id.
df_rfc_results = pd.DataFrame(rfc.predict_proba(test), index=test.index, columns=rfc.classes_)

### XGBoost 

In [18]:
# Base estimator; every tuned hyper-parameter is supplied by the grid below.
xgbc = xgb.XGBClassifier()

# Candidate values for the exhaustive search.
# NOTE: this grid has 2*4*2*2*2*2*2*3*3 = 2304 combinations, i.e. 11520 fits
# with cv=5 — trim it before running on the full data set.
param_grid = {
    ## Booster parameters
    # Also 'eta'. Scales how much each tree changes the overall prediction:
    # small values take smaller steps along the gradient (less overfitting,
    # but more trees needed), large values converge faster but may miss the
    # best optimum. 0 is excluded: with a zero step size boosting learns nothing.
    'learning_rate': [0.1, 0.5],
    # Want lots of trees, though too many can lead to overfitting.
    'n_estimators': [50, 250, 450, 650],
    # Large impact on the outcome: deep enough to learn, not so deep it overfits.
    'max_depth': [3, 7],
    # Controls model complexity; larger = more conservative.
    'min_child_weight': [1, 4],
    # Controls model complexity; larger = more conservative.
    'gamma': [0.0, 0.5],
    # Row subsampling adds randomness to make training robust to noise.
    # Valid range is (0, 1]; 0 would leave no rows to train on.
    'subsample': [0.5, 1.0],
    # Column subsampling adds randomness to make training robust to noise.
    'colsample_bytree': [0.1, 0.6],

    ## Regularization parameters
    # L1 penalty, reduces overfitting (gamma mostly handles this).
    'reg_alpha': [1e-2, 1, 100],
    # L2 penalty, reduces overfitting (gamma mostly handles this).
    'reg_lambda': [1e-2, 1, 100],

    ## Learning Task parameters
    # Multiclass classification with predicted probabilities.
    'objective': ['multi:softprob'],
}

# 'neg_log_loss' evaluates the predicted class probabilities; scikit-learn
# negates the log loss so that a higher score is better.
grid = GridSearchCV(xgbc, param_grid, cv=5, n_jobs=1, scoring='neg_log_loss')

In [19]:
# Run the cross-validated grid search on the engineered training features,
# using the crime category as the target.
# NOTE(review): the saved output below lists a different param_grid (e.g.
# n_estimators=[25]) than the one defined in the previous cell — it appears
# to come from an earlier run and should be regenerated.
grid.fit(train, df_train['Category'])

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'colsample_bytree': [0.1, 0.6], 'learning_rate': [0.0, 0.5], 'n_estimators': [25], 'objective': ['multi:softprob'], 'max_depth': [3, 7], 'gamma': [0.0, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [41]:
# Predict class probabilities with the refit best estimator. The columns are
# labelled from the fitted model's own `classes_` (as in the random-forest
# cell) so each label is guaranteed to match the column order returned by
# `predict_proba`, rather than relying on an independently sorted category list.
df_xgbc_results = pd.DataFrame(
    grid.predict_proba(test),
    index=test.index,
    columns=grid.best_estimator_.classes_,
)

# Write the submission file: one row per test Id, one column per category.
df_xgbc_results.to_csv('/Users/dominicdebiaso/Desktop/kaggle_sanfran_crime_submission.csv')

### EDA

In [None]:
# Number of training incidents per police district. The frame is `df_train`:
# no variable named `df` exists at notebook scope, so the original cell raised
# a NameError on a fresh kernel.
ax = df_train['PdDistrict'].value_counts().plot(kind='bar')
ax.set(title='Incidents per police district', xlabel='PdDistrict', ylabel='Number of incidents');