In [None]:
%matplotlib inline

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

In [None]:
# Directory holding the CSV export. Set the DATASF_DATA_DIR environment variable
# to run this notebook on another machine; the fallback is the location the
# notebook was originally written against.
DATA_DIR = Path(os.environ.get('DATASF_DATA_DIR', '/Users/daviderickson/projects/datasf/data'))
INCIDENTS_CSV = DATA_DIR / 'Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv'

df = pd.read_csv(INCIDENTS_CSV)

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# List every column label in the table.
print(df.columns)

In [None]:
# Peek at the first five rows.
df.head(5)

In [None]:
# First five rows of a hand-picked subset of columns (the same subset is
# inspected for distinct-value counts further down).
df[['Address', 'X', 'Y', 'Location', 'PdId',
       'SF Find Neighborhoods', 'Current Police Districts',
       'Current Supervisor Districts', 'Analysis Neighborhoods']].head(5)

In [None]:
# Descriptive statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Pairwise correlations between the numeric (and boolean) columns.
# The columns are selected explicitly because DataFrame.corr() stopped silently
# skipping text columns in pandas 2.0 and raises on them instead; select_dtypes
# behaves the same on old and new pandas.
corr = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Draw the correlation matrix held in `corr` as a heatmap.
sns.heatmap(corr)

In [None]:
# Number of distinct values in each of a hand-picked subset of columns.
cols_list = ['Address', 'X', 'Y', 'Location', 'PdId',
       'SF Find Neighborhoods', 'Current Police Districts',
       'Current Supervisor Districts', 'Analysis Neighborhoods']
for col in cols_list:
    # nunique(dropna=False) equals len(series.unique()): NaN counts as one value.
    # Computing it once replaces a unique() call whose result was thrown away.
    print(col, '- unique entries - ', df[col].nunique(dropna=False))

In [None]:
# Distinct-value count for every column, printed as "<count> <column>".
# dropna=False keeps NaN counted as a value, exactly like len(series.unique()).
for col, n_unique in df.nunique(dropna=False).items():
    print(n_unique, col)

In [None]:
# Incidents per resolution. Selecting 'IncidntNum' before aggregating counts
# just that column instead of counting every column and discarding the rest.
df.groupby('Resolution')['IncidntNum'].count()

In [None]:
dow_list = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Share of each resolution's incidents that fall on each weekday.
# Counts per (resolution, weekday) are divided row-wise by the row total, so
# every row sums to 1; columns are then put in calendar order.
# The vectorised division needs no temporary 'total' column and does not depend
# on the order in which columns are visited.
df_res_dow = (
    df.pivot_table('IncidntNum', index='Resolution', columns='DayOfWeek', aggfunc='count', margins=False)
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))
    .loc[:, dow_list]
)
df_res_dow

In [None]:
dow_list = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Share of each category's incidents that fall on each weekday.
# Counts per (category, weekday) are divided row-wise by the row total, so
# every row sums to 1; columns are then put in calendar order.
df_cat_dow = (
    df.pivot_table('IncidntNum', index='Category', columns='DayOfWeek', aggfunc='count', margins=False)
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))
    .loc[:, dow_list]
)

# Tall figure so every category label fits on the y axis.
fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(df_cat_dow, ax=ax)
ax.set_title("Fraction of each category's incidents by weekday");

In [None]:
# Number of rows recorded under each Category value.
df.groupby('Category')['Category'].count()

In [None]:
# Distinct Category labels present in the data.
df['Category'].unique()

In [None]:
# Incidents per weekday. groupby sorts its keys alphabetically (Friday, Monday,
# ...), which makes a line plot join the days in a meaningless order, so the
# counts are reindexed to calendar order (dow_list, defined above) first.
df.groupby('DayOfWeek')['DayOfWeek'].count().reindex(dow_list).plot()

# Random Forest

In [None]:
# Column labels, as a reminder of what is available as model input.
df.columns

In [None]:
# Names of the object-dtype columns, i.e. the ones make_Xy_RF would one-hot encode.
df.select_dtypes(include='object').columns.to_list()

In [None]:
# Three shapes, one per line: the whole table, the 'Resolution' column with
# missing values removed, and the 'Resolution' column as-is.
print(
    df.shape,
    df['Resolution'].dropna(axis=0).shape,
    df['Resolution'].shape,
    sep='\n',
)

In [None]:
# `X` is only assigned further down, by the first make_Xy_RF call. Guard the
# lookup so that Restart & Run All does not stop here with a NameError; once
# `X` exists, re-running this cell shows its shape as before.
X.shape if 'X' in globals() else None

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def make_Xy_RF(df, ylabel='Resolution'):
    """Build a feature matrix and target vector for a random-forest classifier.

    Rows with a missing value in any column are discarded, the target column
    and a fixed list of other columns ('IncidntNum', 'Location', 'Address',
    'Descript', 'Date', 'Time') are removed when present, and every remaining
    object-dtype column is one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain the column named by ``ylabel``.
    ylabel : str, default 'Resolution'
        Name of the column to predict.

    Returns
    -------
    X : numpy.ndarray
        Feature values, one row per retained row of ``df``.
    y : pandas.Series
        Target values, row-aligned with ``X``.
    X_cols : pandas.Index
        Column labels of ``X``.

    Notes
    -----
    Prints the columns kept as features (before one-hot encoding).
    """
    df1 = df.dropna(axis=0)  # drop rows with a missing value in any column
    y = df1[ylabel]

    # May also want to drop ':@computed_region_' cols, which have lots of NaNs
    drop_cols = {ylabel, 'IncidntNum', 'Location', 'Address', 'Descript', 'Date', 'Time'}
    # Only drop what is actually there; a list in column order gives `drop`
    # a deterministic, duplicate-free argument.
    present = [c for c in df1.columns if c in drop_cols]
    df1 = df1.drop(columns=present)
    print(df1.columns)

    dummies_cols = df1.columns[df1.dtypes == 'object'].to_list()
    # Pin the indicator dtype. pandas >= 2.0 defaults to bool, and a frame that
    # mixes bool with int/float columns turns into an object array under
    # `.values`; uint8 (the pre-2.0 default) keeps the matrix numeric.
    df1 = pd.get_dummies(df1, columns=dummies_cols, dtype=np.uint8)

    return df1.values, y, df1.columns

In [None]:
# Validation curve: accuracy of predicting 'Resolution' as the forest grows.
X, y, X_cols = make_Xy_RF(df, ylabel='Resolution')

# n_estimators=100 is only a starting value; validation_curve overrides it with
# each entry of param_range. The fixed random_state makes the scores
# reproducible from one run of the notebook to the next.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

param_range = [4, 5, 6, 8, 10, 13, 20, 30, 40, 50, 60, 100]
train_scores_vc, test_scores_vc = validation_curve(
    estimator=model, X=X, y=y,
    param_name="n_estimators", param_range=param_range,
    cv=10, n_jobs=-1, scoring='accuracy',
)

In [None]:
# validation_curve returns one row per n_estimators value and one column per
# CV fold, so axis=1 summarises across folds.
train_scores_mean = np.mean(train_scores_vc, axis=1)
train_scores_std = np.std(train_scores_vc, axis=1)
test_scores_mean = np.mean(test_scores_vc, axis=1)
test_scores_std = np.std(test_scores_vc, axis=1)

# plt.subplots() hands back the Figure and the Axes separately, so `ax` really
# is an Axes (plt.figure() returns a Figure).
fig, ax = plt.subplots()
ax.plot(param_range, train_scores_mean, 'o', label="Train")
ax.plot(param_range, test_scores_mean, 'o', label="Test")
# Shaded bands: mean +/- one standard deviation across folds.
ax.fill_between(param_range, train_scores_mean + train_scores_std, train_scores_mean - train_scores_std,
                facecolor='blue', alpha=0.5)
ax.fill_between(param_range, test_scores_mean + test_scores_std, test_scores_mean - test_scores_std,
                facecolor='orange', alpha=0.5)
ax.set_xscale('log')
ax.set_xlabel("n_estimators")
ax.set_ylabel("accuracy")
ax.set_title("Validation Curves")
ax.legend(loc="best");


In [None]:
# Validation curve: accuracy of predicting 'Category' as the forest grows.
X, y, X_cols = make_Xy_RF(df, ylabel='Category')

# n_estimators=100 is only a starting value; validation_curve overrides it with
# each entry of param_range. The fixed random_state makes the scores
# reproducible from one run of the notebook to the next.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

param_range = [4, 5, 6, 8, 10, 13, 20, 30, 40, 50, 60, 100]
train_scores_vc, test_scores_vc = validation_curve(
    estimator=model, X=X, y=y,
    param_name="n_estimators", param_range=param_range,
    cv=10, n_jobs=-1, scoring='accuracy',
)

In [None]:
# validation_curve returns one row per n_estimators value and one column per
# CV fold, so axis=1 summarises across folds.
train_scores_mean = np.mean(train_scores_vc, axis=1)
train_scores_std = np.std(train_scores_vc, axis=1)
test_scores_mean = np.mean(test_scores_vc, axis=1)
test_scores_std = np.std(test_scores_vc, axis=1)

# plt.subplots() hands back the Figure and the Axes separately, so `ax` really
# is an Axes (plt.figure() returns a Figure).
fig, ax = plt.subplots()
ax.plot(param_range, train_scores_mean, 'o', label="Train")
ax.plot(param_range, test_scores_mean, 'o', label="Test")
# Shaded bands: mean +/- one standard deviation across folds.
ax.fill_between(param_range, train_scores_mean + train_scores_std, train_scores_mean - train_scores_std,
                facecolor='blue', alpha=0.5)
ax.fill_between(param_range, test_scores_mean + test_scores_std, test_scores_mean - test_scores_std,
                facecolor='orange', alpha=0.5)
ax.set_xscale('log')
ax.set_xlabel("n_estimators")
ax.set_ylabel("accuracy")
ax.set_title("Validation Curves")
ax.legend(loc="best");


In [None]:
# Inspect the target Series as last assigned above (the 'Category' labels
# returned by the preceding make_Xy_RF call).
y

In [None]:
# Validation curve restricted to one category: predict 'Resolution' for
# WARRANTS incidents only.
# NOTE(review): the name `df_ass` does not match the 'WARRANTS' filter; it is
# kept unchanged in case other cells refer to it.
df_ass = df[df['Category'] == 'WARRANTS']
X, y, X_cols = make_Xy_RF(df_ass, ylabel='Resolution')

# n_estimators=100 is what the later model.fit() call uses; validation_curve
# overrides it with each entry of param_range. The fixed random_state makes
# the scores and the fitted forest reproducible.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

param_range = [5, 10, 30, 100, 300]
train_scores_vc, test_scores_vc = validation_curve(
    estimator=model, X=X, y=y,
    param_name="n_estimators", param_range=param_range,
    cv=10, n_jobs=-1, scoring='accuracy',
)

In [None]:
# validation_curve returns one row per n_estimators value and one column per
# CV fold, so axis=1 summarises across folds.
train_scores_mean = np.mean(train_scores_vc, axis=1)
train_scores_std = np.std(train_scores_vc, axis=1)
test_scores_mean = np.mean(test_scores_vc, axis=1)
test_scores_std = np.std(test_scores_vc, axis=1)

# plt.subplots() hands back the Figure and the Axes separately, so `ax` really
# is an Axes (plt.figure() returns a Figure).
fig, ax = plt.subplots()
ax.plot(param_range, train_scores_mean, 'o', label="Train")
ax.plot(param_range, test_scores_mean, 'o', label="Test")
# Shaded bands: mean +/- one standard deviation across folds.
ax.fill_between(param_range, train_scores_mean + train_scores_std, train_scores_mean - train_scores_std,
                facecolor='blue', alpha=0.5)
ax.fill_between(param_range, test_scores_mean + test_scores_std, test_scores_mean - test_scores_std,
                facecolor='orange', alpha=0.5)
ax.set_xscale('log')
ax.set_xlabel("n_estimators")
ax.set_ylabel("accuracy")
ax.set_title("Validation Curves")
ax.legend(loc="best");

In [None]:
# Fit the forest on the current X / y and rank the features by importance.
model.fit(X, y)
feature_importances_rf = (
    pd.DataFrame({'importance': model.feature_importances_}, index=X_cols)
    .sort_values(by='importance', ascending=False)
)
# Ten most important features.
feature_importances_rf.head(10)

# RF Classifier
## Predict Category Given Description

In [None]:
keep_cols = ['Category', 'Descript']
# dropna() returns a new frame rather than modifying in place; keep its result
# so rows with a missing category or description are actually excluded.
df_desc = df[keep_cols].dropna()

y = df_desc['Category']
df_desc = df_desc.drop(columns='Category')

print(df_desc.columns)
# One indicator column per distinct description.
df_desc = pd.get_dummies(df_desc)
X = df_desc.values
X_cols = df_desc.columns

# n_estimators=100 is only a starting value; validation_curve overrides it with
# each entry of param_range. The fixed random_state makes the scores
# reproducible from one run of the notebook to the next.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

param_range = [5, 10, 30, 100, 300]
train_scores_vc, test_scores_vc = validation_curve(
    estimator=model, X=X, y=y,
    param_name="n_estimators", param_range=param_range,
    cv=10, n_jobs=-1, scoring='accuracy',
)

In [None]:
# validation_curve returns one row per n_estimators value and one column per
# CV fold, so axis=1 summarises across folds.
train_scores_mean = np.mean(train_scores_vc, axis=1)
train_scores_std = np.std(train_scores_vc, axis=1)
test_scores_mean = np.mean(test_scores_vc, axis=1)
test_scores_std = np.std(test_scores_vc, axis=1)

# plt.subplots() hands back the Figure and the Axes separately, so `ax` really
# is an Axes (plt.figure() returns a Figure).
fig, ax = plt.subplots()
ax.plot(param_range, train_scores_mean, 'o', label="Train")
ax.plot(param_range, test_scores_mean, 'o', label="Test")
# Shaded bands: mean +/- one standard deviation across folds.
ax.fill_between(param_range, train_scores_mean + train_scores_std, train_scores_mean - train_scores_std,
                facecolor='blue', alpha=0.5)
ax.fill_between(param_range, test_scores_mean + test_scores_std, test_scores_mean - test_scores_std,
                facecolor='orange', alpha=0.5)
ax.set_xscale('log')
ax.set_xlabel("n_estimators")
ax.set_ylabel("accuracy")
ax.set_title("Validation Curves")
ax.legend(loc="best");