In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import math
import warnings

from IPython.display import display
from sklearn import linear_model
import statsmodels.formula.api as smf

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)


In [2]:
df = pd.read_csv('~/New_York.csv', header=4, skipfooter=3)
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,3,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,4,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.0
1,5,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.0
2,6,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.0
3,7,Albany,97956,791,8,,30,227,526,4090,705,3243,142,
4,8,Albion Village,6388,23,0,,3,4,16,223,53,165,5,


In [3]:
df.columns = ['List', 'City', 'Population', 'Violent crime', 'Murder', 'Rape1', 'Rape2', 'Robbery', 'Assault', 'Property crime', 'Burglary', 'Larceny', 'Car theft', 'Arson']

In [4]:
df.head()

Unnamed: 0,List,City,Population,Violent crime,Murder,Rape1,Rape2,Robbery,Assault,Property crime,Burglary,Larceny,Car theft,Arson
0,4,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.0
1,5,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.0
2,6,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.0
3,7,Albany,97956,791,8,,30,227,526,4090,705,3243,142,
4,8,Albion Village,6388,23,0,,3,4,16,223,53,165,5,


In [5]:
df.drop(['List'], axis=1)

Unnamed: 0,City,Population,Violent crime,Murder,Rape1,Rape2,Robbery,Assault,Property crime,Burglary,Larceny,Car theft,Arson
0,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.000
1,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.000
2,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.000
3,Albany,97956,791,8,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0,,3,4,16,223,53,165,5,
5,Alfred Village,4089,5,0,,0,3,2,46,10,36,0,
6,Allegany Village,1781,3,0,,0,0,3,10,0,10,0,0.000
7,Amherst Town,118296,107,1,,7,31,68,2118,204,1882,32,3.000
8,Amityville Village,9519,9,0,,2,4,3,210,16,188,6,1.000
9,Amsterdam,18182,30,0,,0,12,18,405,99,291,15,0.000


In [6]:
NY = pd.DataFrame(df)

In [7]:
NY = NY.drop("Rape1",axis=1)

In [8]:
NY = NY.drop("Arson",axis=1)

In [9]:
#Remove New York City from the data - NYC is an outlier
NY = NY[NY.City != "New York"]

In [10]:
NY['Murder1'] = np.where(NY['Murder'] > NY['Murder'].mean(), 1,0)

In [11]:
NY['Robbery1'] = np.where(NY['Robbery'] >0, 1, 0)

In [12]:
NY['Violent crime1'] = np.where(NY['Violent crime'] >0, 1, 0)

In [13]:
NY._get_numeric_data().corr()

Unnamed: 0,List,Population,Violent crime,Murder,Rape2,Robbery,Assault,Property crime,Burglary,Larceny,Car theft,Murder1,Robbery1,Violent crime1
List,1.0,0.047,0.022,0.032,0.019,0.016,0.025,0.017,0.021,0.015,0.021,0.126,0.035,0.06
Population,0.047,1.0,0.829,0.756,0.797,0.816,0.835,0.887,0.82,0.901,0.828,0.503,0.344,0.201
Violent crime,0.022,0.829,1.0,0.955,0.962,0.994,0.997,0.944,0.971,0.913,0.987,0.406,0.168,0.09
Murder,0.032,0.756,0.955,1.0,0.946,0.963,0.938,0.917,0.956,0.881,0.959,0.408,0.132,0.068
Rape2,0.019,0.797,0.962,0.946,1.0,0.949,0.958,0.949,0.974,0.921,0.96,0.403,0.189,0.104
Robbery,0.016,0.816,0.994,0.963,0.949,1.0,0.983,0.933,0.964,0.898,0.989,0.378,0.154,0.078
Assault,0.025,0.835,0.997,0.938,0.958,0.983,1.0,0.942,0.964,0.913,0.977,0.422,0.176,0.097
Property crime,0.017,0.887,0.944,0.917,0.949,0.933,0.942,1.0,0.977,0.995,0.949,0.443,0.267,0.147
Burglary,0.021,0.82,0.971,0.956,0.974,0.964,0.964,0.977,1.0,0.95,0.976,0.402,0.197,0.108
Larceny,0.015,0.901,0.913,0.881,0.921,0.898,0.913,0.995,0.95,1.0,0.916,0.458,0.297,0.164


## Logistic Regression

In [14]:
# Declare a logistic regression classifier.
# Parameter regularization coefficient C.
lr = LogisticRegression(C=1e9)
X = NY[['Murder1', 'Robbery1']]
y = NY['Violent crime1']


# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Accuracy')
print(pd.crosstab(pred_y_sklearn, y))

print('\n Percentage accuracy')
print(lr.score(X, y))

Coefficients
[[ 7.00336739 11.19727277]]
[0.53408257]

 Accuracy
Violent crime1   0    1
row_0                  
1               51  296

 Percentage accuracy
0.8530259365994236


## Ridge Regression

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
X_train.shape

(232, 2)

In [29]:
print(X_train.shape) 
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(232, 2)
(115, 2)
(232,)
(115,)


In [30]:
ridgeregr.fit(X_train, y_train)
print('R²:')
print(ridgeregr.score(X_test, y_test))

R²:
-1.976635763606006


## Lasso Regression

In [31]:
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train, y_train)
print('R²:')
print(lass.score(X_test, y_test))

R²:
-0.02568726315185343


The best fit model is the vanilla regression. This might be becasue ridge regression works best on models that are complex.  This model is not very complex.  Ridge also works best when features become mulitcolinear.  In this example, features are correlated with one another but using ridge regression did not work at all and produced a terrible R2.  Additionally there is no indication that the vanilla regression is overfit.  Lasso Regression works best when dealing with thousands or hundred of thousands of predictors and need to optimize processor time or when you want to arrive at a simplier solution that is easier to interpret.  In this case the data set has a small amount of features therefore Lasso produced a poor result.  