In [108]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
sns.set_style('white')
from matplotlib import pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression 

In [109]:
# read in data
df = pd.read_csv(
    '../data-201-resources-master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013 - 13tbl8ny_clean.csv',
    header=0)

In [110]:
print(df.columns)

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3', 'HighLow-Crime'],
      dtype='object')


In [111]:
# clean data
#df_data = df.drop(df.columns[4], axis=1)
df_data = df.copy()
df_data.rename(columns = {'Violent\ncrime': 'Violent-Crime', 
                                    'Murder and\nnonnegligent\nmanslaughter': 'Murder',
                                    'Rape\n(legacy\ndefinition)2': 'Rape',
                                    'Aggravated\nassault': 'Assault',
                                    'Property\ncrime': 'Property-Crime',
                                    'Larceny-\ntheft': 'Theft',
                                    'Motor\nvehicle\ntheft': 'MV-theft'}, inplace=True)
# remove outliner
#df_data.drop(index='New York', level=1, inplace=True)
df_data['HighLow-Crime'] = df_data['HighLow-Crime'].apply(lambda x: 1 if x == 'h' else 0)

In [112]:
print(df_data.columns)

Index(['City', 'Population', 'Violent-Crime', 'Murder', 'Rape', 'Robbery',
       'Assault', 'Property-Crime', 'Burglary', 'Theft', 'MV-theft', 'Arson3',
       'HighLow-Crime'],
      dtype='object')


In [113]:
df_data.head()

Unnamed: 0,City,Population,Violent-Crime,Murder,Rape,Robbery,Assault,Property-Crime,Burglary,Theft,MV-theft,Arson3,HighLow-Crime
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0.0,0
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0,0
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0,0
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,,1
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,,1


In [114]:
# Vanilla Binary logistic regression
logReg = LogisticRegression(C=1e9)

X_statsmod = df_data.drop('HighLow-Crime', axis=1)
X_statsmod.drop(['City', 'Population'], axis=1, inplace=True)
X_statsmod.fillna(value=0, inplace=True)
Y_statsmod = df_data['HighLow-Crime']
# fit the model
logit = logReg.fit(X_statsmod, Y_statsmod)
print('Coefficiencies: ')
print(logit.coef_)
print(logit.intercept_)

Y_predict = logReg.predict(X_statsmod)
print('\n Accuracy by admission status')
print(pd.crosstab(Y_predict, Y_statsmod))

print('\n Percentage accuracy')
print(logReg.score(X_statsmod, Y_statsmod))


Coefficiencies: 
[[ 0.50807485  1.68862295 -0.33569832 -0.44443187 -0.4004179   0.06936086
   0.01916634  0.00628777  0.04390675  0.905041  ]]
[-10.73541829]

 Accuracy by admission status
HighLow-Crime    0    1
row_0                  
0              179    0
1                0  168

 Percentage accuracy
1.0




In [123]:
# Ridge logistic regression
# Define the training and test sets
trainsize = int(df_data.shape[0] / 2)
df_train = df_data.iloc[:trainsize, :].copy()
df_test = df_data.iloc[trainsize:, :].copy()

linReg_Ridge = linear_model.Ridge(alpha=0.5, fit_intercept=False)
X_train = df_train.drop('HighLow-Crime', axis=1)
X_train.drop(['City', 'Population'], axis=1, inplace=True)
X_train.fillna(value=0, inplace=True)
Y_train = df_train['HighLow-Crime']
linReg_Ridge.fit(X_train, Y_train)
print('\nR-squared simple model:')
print(linReg_Ridge.score(X_train, Y_train))



R-squared simple model:
0.10756790841182762


In [124]:
X_test = df_test.drop('HighLow-Crime', axis=1)
X_test.drop(['City', 'Population'], axis=1, inplace=True)
X_test.fillna(value=0, inplace=True)
Y_test = df_test['HighLow-Crime']
linReg_Ridge.fit(X_test, Y_test)
print('\nR-squared simple test model:')
print(linReg_Ridge.score(X_test, Y_test))


R-squared simple test model:
0.18383274429054086


In [125]:
# Lasso logistic regression
linReg_Lasso = linear_model.Lasso(alpha=0.25)
linReg_Lasso.fit(X_train, Y_train)
print('R² for the Lasso model :')
print(linReg_Lasso.score(X_train, Y_train))


R² for the Lasso model :
0.2672766894548967




In [126]:
linReg_Lasso.fit(X_test, Y_test)
print('R² for the Lasso model test sample:')
print(linReg_Lasso.score(X_test, Y_test))

R² for the Lasso model test sample:
0.3498159073646857
