In [None]:
%matplotlib inline

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Apply seaborn's default theme so every matplotlib figure below shares one look.
sns.set()

In [None]:
# Location of the SF Police Department incident-report export (2003 to May 2018).
# Set the DATASF_INCIDENTS_CSV environment variable to point at your own copy;
# the fallback is the path this notebook was originally developed against.
DATA_PATH = os.environ.get(
    'DATASF_INCIDENTS_CSV',
    '/Users/daviderickson/projects/datasf/data/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# (number of rows, number of columns) of the loaded incident table.
df.shape

In [None]:
# List every column name so later cells can refer to them by their exact spelling.
print(df.columns)

In [None]:
# Peek at the first five rows (five is the default for head()).
df.head()

In [None]:
# Preview only the location / district columns, which scroll off-screen in the full head().
location_cols = [
    'Address', 'X', 'Y', 'Location', 'PdId',
    'SF Find Neighborhoods', 'Current Police Districts',
    'Current Supervisor Districts', 'Analysis Neighborhoods',
]
df.loc[:, location_cols].head()

In [None]:
# Summary statistics of the table's columns (describe() defaults).
df.describe()

In [None]:
# Pairwise correlation between the numeric columns only.
# The table also holds text columns (Category, Address, ...); pandas >= 2.0 raises
# if they are passed to corr(), so they are filtered out explicitly.
corr = df.select_dtypes(include='number').corr()

In [None]:
# Visualise the correlation matrix computed in the previous cell.
sns.heatmap(corr)

In [None]:
# Count distinct values (NaN counted as one value) in each location-related column.
cols_list = [
    'Address', 'X', 'Y', 'Location', 'PdId',
    'SF Find Neighborhoods', 'Current Police Districts',
    'Current Supervisor Districts', 'Analysis Neighborhoods',
]
for col in cols_list:
    n_unique = df[col].nunique(dropna=False)
    print(col, '- unique entries - ', n_unique)

In [None]:
# Same distinct-value count for every column, printed as "<count> <column>".
for col, n_unique in df.nunique(dropna=False).items():
    print(n_unique, col)

In [None]:
# Summarise the DayOfWeek column within each crime category.
df.groupby('Category')['DayOfWeek'].describe()

In [None]:
# Number of incidents (non-null IncidntNum) per resolution.
# Selecting the column before counting avoids counting every other column first.
df.groupby('Resolution')['IncidntNum'].count()

In [None]:
# Incident counts with one row per resolution and one column per day of the week.
df_res_dow = df.pivot_table(
    values='IncidntNum',
    index='Resolution',
    columns='DayOfWeek',
    aggfunc='count',
)
df_res_dow

In [None]:
dow_list = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Turn counts into per-resolution shares: divide every row by its own total,
# then put the day columns in calendar order.
row_totals = df_res_dow.sum(axis=1)
df_res_dow = df_res_dow.div(row_totals, axis=0)[dow_list]
df_res_dow

In [None]:
# Heatmap of the per-resolution weekday shares on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(df_res_dow, ax=ax);

In [None]:
# Incident counts with one row per crime category and one column per day of the week.
df_cat_dow = df.pivot_table(
    values='IncidntNum',
    index='Category',
    columns='DayOfWeek',
    aggfunc='count',
)
df_cat_dow

In [None]:
dow_list = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Turn counts into per-category shares: divide every row by its own total,
# then put the day columns in calendar order.
row_totals = df_cat_dow.sum(axis=1)
df_cat_dow = df_cat_dow.div(row_totals, axis=0)[dow_list]
df_cat_dow

In [None]:
# Heatmap of the per-category weekday shares; taller figure because there are many categories.
fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(df_cat_dow, ax=ax);

In [None]:
# Resolution x category incident counts, with an "All" margin row and column.
df.pivot_table(
    values='IncidntNum',
    index='Resolution',
    columns='Category',
    aggfunc='count',
    margins=True,
)

In [None]:
# Distinct crime category labels, in order of first appearance.
df['Category'].unique()

In [None]:
# Incidents per weekday. groupby sorts its keys alphabetically (Friday, Monday, ...),
# which makes a connected line plot misleading, so reorder to calendar order first.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df.groupby('DayOfWeek')['DayOfWeek'].count().reindex(weekday_order).plot()

In [None]:
# Non-null entries per column; columns below the row count contain missing values.
df.count()

In [None]:
# Disabled experiment: the code is wrapped in a string literal so it does not execute.
# It would parse the 'Time' column with pd.to_datetime and store it as a new 'times' column.
'''times = pd.to_datetime(df['Time'])
df['times'] = times'''

In [None]:
# Disabled experiment (string literal, not executed): weekday x category incident counts.
'''df.pivot_table('IncidntNum', index='DayOfWeek', columns='Category', aggfunc='count')'''

In [None]:
# Disabled experiment (string literal, not executed): histogram of the DayOfWeek column.
'''vals, bins, patches = plt.hist(df['DayOfWeek'])'''

# PCA

In [None]:
# PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# n_components is left unset, so every component is kept.
pca = PCA()
print(pca)

print(df.shape)

# One-hot encode the categorical columns and discard the text / temporal ones.
dummies_cols = ['Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution']
drop_cols = ['Date', 'Time', 'Address', 'Location']  # Date and Time can be handled better
df1 = pd.get_dummies(df, columns=dummies_cols)
df1 = df1.drop(drop_cols, axis=1)
df1 = df1.dropna(axis=0)
# Ask for floats explicitly: recent pandas emits boolean dummy columns, and mixing
# them with numeric columns would otherwise produce an object-dtype array.
X = df1.to_numpy(dtype=float)
X_cols = df1.columns

# Standardise each feature so no single column dominates the components.
scaler = StandardScaler()
X_transformed = scaler.fit_transform(X)

# Fit once and project in the same call (the model was previously fitted twice).
X_pca = pca.fit_transform(X_transformed)

In [None]:
# Explained variance: share of total variance carried by each leading component.
variance_share = pca.explained_variance_ / pca.explained_variance_.sum()
# Show at most 100 bars; capping avoids an x/y length mismatch when the
# fitted PCA has fewer than 100 components.
n_shown = min(100, len(variance_share))

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=np.arange(n_shown), y=variance_share[:n_shown], ax=ax)
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance');

In [None]:
# Histogram of explained variance (share of total variance per component, log-scaled counts).
variance_share = pca.explained_variance_ / pca.explained_variance_.sum()
vals, bins, patches = plt.hist(variance_share, log=True, bins=50)
plt.title('Histogram of explained variance')
plt.xlabel('Explained Variance')
plt.ylabel('Number of components')

# Observations
# Heuristic: components that fall in the lowest histogram bin are treated as
# carrying negligible variance; everything else is counted as informative.
n_components = np.sum(vals)
num_exp_var = n_components - vals[0]
print(num_exp_var, 'PCA components explain all variance.')
print(n_components, 'total features')
# Scale the fraction by 100 so the number matches its '%' label.
print(100 * num_exp_var / n_components, '% of all features')

# Random Forest

In [None]:
# Re-display the column names before choosing features for the model below.
df.columns

In [None]:
# Category values of every row labelled 'LARCENY/THEFT' (the only label containing a slash
# that make_Xy_RF rewrites).
df.loc[df['Category'] == 'LARCENY/THEFT', 'Category']

In [None]:
# Names of the object-dtype columns, i.e. the ones that need one-hot encoding.
df.columns[df.dtypes == 'object'].to_list()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def make_Xy_RF(df, y='Resolution'):
    """Build a one-hot encoded feature matrix and a target vector for the random forest.

    Parameters
    ----------
    df : pandas.DataFrame
        Incident table. Must contain the ``IncidntNum`` and ``Location`` columns
        (both are discarded) and the column named by ``y``.
    y : str, default 'Resolution'
        Name of the column to predict.

    Returns
    -------
    X : numpy.ndarray
        Feature values, one row per row of ``df``.
    y : pandas.Series
        The target column ``df[y]``, unmodified.
    X_cols : pandas.Index
        Column labels matching the columns of ``X``.
    """
    drop_cols = ['IncidntNum', 'Location']
    target = df[y]
    # The target must not be part of the features: leaving it in would one-hot
    # encode the answer into X and make every accuracy score meaningless.
    df1 = df.drop([c for c in drop_cols if c != y] + [y], axis=1)
    if 'Category' in df1.columns:
        # Replace the slash so the generated dummy column name is a plain identifier.
        df1.loc[df1['Category'] == 'LARCENY/THEFT', 'Category'] = 'Larceny_Theft'
    # NOTE: rows with missing values are kept. Dropping them here would also
    # require dropping the same rows from the target to keep X and y aligned.
    # df1 = df1.dropna(axis=0)
    dummies_cols = df1.columns[df1.dtypes == 'object'].to_list()
    df1 = pd.get_dummies(df1, columns=dummies_cols)
    X = df1.to_numpy()
    X_cols = df1.columns
    return X, target, X_cols

X, y, X_cols = make_Xy_RF(df, y='Resolution')
# Fixed seed so the forests, and therefore the validation curve, are
# reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Forest sizes to evaluate; each one is scored with 10-fold cross-validation.
param_range = [4, 5, 6, 8, 10, 13, 20, 30, 40, 50, 60, 100]
train_scores_vc, test_scores_vc = validation_curve(
    estimator=model,
    X=X,
    y=y,
    param_name="n_estimators",
    param_range=param_range,
    cv=10,
    n_jobs=-1,
    scoring='accuracy',
)

In [None]:
# Mean and spread of the scores across the cross-validation folds (axis 1)
# for every value of n_estimators.
train_scores_mean = np.mean(train_scores_vc, axis=1)
train_scores_std = np.std(train_scores_vc, axis=1)
test_scores_mean = np.mean(test_scores_vc, axis=1)
test_scores_std = np.std(test_scores_vc, axis=1)

# fill_between is an Axes method, so create the axes explicitly
# (plt.figure() returns a Figure, which has no fill_between).
fig, ax = plt.subplots()
train_line, = ax.plot(param_range, train_scores_mean, 'o', label="Train")
test_line, = ax.plot(param_range, test_scores_mean, 'o', label="Test")

# Shade mean +/- one standard deviation, coloured to match each series.
ax.fill_between(param_range,
                train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std,
                facecolor=train_line.get_color(), alpha=0.5)
ax.fill_between(param_range,
                test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std,
                facecolor=test_line.get_color(), alpha=0.5)

ax.set_xscale('log')
ax.set_xlabel("n_estimators")
ax.set_ylabel("accuracy")
ax.set_title("Validation Curves")
ax.legend(loc="best");


In [None]:
from sklearn.linear_model import LogisticRegressionCV