In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

Now that you have two new regression methods at your fingertips, it's time to give them a spin. In fact, for this challenge, let's put them together! Pick a dataset of your choice with a binary outcome and the potential for at least 15 features. If you're drawing a blank, the crime rates in 2013 dataset has a lot of variables that could be made into a modelable binary outcome.

Engineer your features, then create three models. Each model will be run on a training set and a test-set (or multiple test-sets, if you take a folds approach). The models should be:

Vanilla logistic regression
Ridge logistic regression
Lasso logistic regression
If you're stuck on how to begin combining your two new modeling skills, here's a hint: the SKlearn LogisticRegression method has a "penalty" argument that takes either 'l1' or 'l2' as a value.

In your report, evaluate all three models and decide on your best. Be clear about the decisions you made that led to these models (feature selection, regularization parameter selection, model evaluation criteria) and why you think that particular model is the best of the three. Also reflect on the strengths and limitations of regression as a modeling approach. Were there things you couldn't do but you wish you could have done


Propertycrime = α + Population + Population2 + Murder + Robbery

The 'population' variable is already set for you, but you will need to create the last three features. Robbery and Murder are currently continuous variables. For this model, please use these variables to create categorical features where values greater than 0 are coded 1, and values equal to 0 are coded 0. You'll use this data and model in a later assignment- for now, just write the code you need to get the data ready. Don't forget basic data cleaning procedures, either! Do some graphing to see if there are any anomalous cases, and decide how you want to deal with them.

In [2]:


df = pd.read_csv('C:\\Users\\david\Desktop\\thinkful datasets\\table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv', header=4,)

df
display(df.columns)

data= pd.DataFrame(df) 

needed_data = data.loc[:, ['City','Population', 'Property\ncrime','Murder and\nnonnegligent\nmanslaughter', 'Larceny-\ntheft', 'Robbery','Burglary']]
needed_data.columns = ['City','Population','Property_Crime', 'Murder', 'Theft', 'Robbery', 'Burglary']

needed_data = needed_data.dropna()

#needed_data.Robbery = needed_data.Robbery.where(needed_data.Robbery == 0, 1)
#needed_data.Murder = needed_data.Murder.where(needed_data.Murder == 0, 1)

#needed_data["population2"] = needed_data["Population"]**2

needed_data = needed_data[np.abs(needed_data.Population-needed_data.Population.mean())<=(2*needed_data.Population.std())] #keep only the ones that are within +2 to -2 standard deviations in the column 'Data'.

#dropping city column
needed_data = needed_data.drop(['City'], axis=1)

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3'],
      dtype='object')

In [3]:
needed_data.head(5)

Unnamed: 0,Population,Property_Crime,Murder,Theft,Robbery,Burglary
0,1861.0,12.0,0.0,10.0,0.0,2.0
1,2577.0,24.0,0.0,20.0,0.0,3.0
2,2846.0,16.0,0.0,15.0,0.0,1.0
3,97956.0,4090.0,8.0,3243.0,227.0,705.0
4,6388.0,223.0,0.0,165.0,4.0,53.0


In [4]:
needed_data['Property_Crime'].describe()

count      347.000000
mean       385.752161
std       1034.369072
min          0.000000
25%         40.000000
50%        112.000000
75%        340.500000
max      12491.000000
Name: Property_Crime, dtype: float64

Using the Property Crime median of 112 I turn the outcome variable into binary codes of 1 or 0

In [5]:
needed_data['Property_Crime'] = np.where(needed_data['Property_Crime']>=112, 1, 0)

In [6]:


exclude = ['Property_Crime']
# Independent Variable
y = needed_data.Property_Crime
# Dependent Variables
X = needed_data.ix[:, needed_data.columns.difference(exclude)]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


## Vanilla Logistic Regression

In [7]:
# Declare a logistic regression classifier.
# Parameter regularization coefficient C described above.
lr = LogisticRegression(C=1e9)

# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by Property Crime')
print(pd.crosstab(pred_y_sklearn, y))

print('\n Percentage accuracy')
print(lr.score(X, y))

Coefficients
[[ 2.19023802e-03  3.95615006e-04 -9.94390656e-05  4.48101697e-03
   1.60759794e-02]]
[-0.00671328]

 Accuracy by Property Crime
Property_Crime    0    1
row_0                   
0                67    4
1               105  171

 Percentage accuracy
0.6858789625360231


In [19]:
score = cross_val_score(fit, X, y, cv=5)
print('\nCross Validated R2 scores: \n', score)
print("\nOverall Logistic Regression R2: %0.3f (+/- %0.3f)\n" % (score.mean(), score.std() * 2))


Cross Validated R2 scores: 
 [0.97142857 0.9        0.95652174 0.72463768 0.65217391]

Overall Logistic Regression R2: 0.841 (+/- 0.258)



## Ridge Regression Model

In [15]:
Ridge_list = pd.DataFrame(columns=['lambda', 'ridgeR2'])

lambs = [1e-15,1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20, 100, 1000, 10000, 100000, 1e10, 1e15]

for i in lambs:
    ridge = LogisticRegression(penalty='l2', C=i)
    ridge.fit(X, y)
    
    scores = pd.DataFrame([[i, ridge.score(X, y)]],
                           columns=['lambda', 'ridgeR2'])
    Ridge_list = Ridge_list.append(scores, ignore_index=True)

In [16]:
Ridge_list

Unnamed: 0,lambda,ridgeR2
0,1e-15,0.504323
1,1e-08,0.504323
2,0.0001,0.677233
3,0.001,0.688761
4,0.01,0.685879
5,1.0,0.685879
6,5.0,0.685879
7,10.0,0.685879
8,20.0,0.685879
9,100.0,0.685879


The above shows that after 1.0e-03 there is no change to further change to the model.

In [17]:
ridge = LogisticRegression(penalty='l2', C=1e-3)

# Fit the model.
ridge_fit = ridge.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_ridge = ridge.predict(X)

print('\n Accuracy by Property Crime')
print(pd.crosstab(pred_y_ridge, y))

print('\n Percentage accuracy')
print(ridge.score(X, y))

Coefficients
[[ 2.19023802e-03  3.95615006e-04 -9.94390656e-05  4.48101697e-03
   1.60759794e-02]]
[-0.00671328]

 Accuracy by Property Crime
Property_Crime    0    1
row_0                   
0                67    3
1               105  172

 Percentage accuracy
0.6887608069164265


In [20]:
score = cross_val_score(ridge_fit, X, y, cv=5)
print('\nCross Validated R2 scores: \n', score)
print("\nOverall Ridge Regression R2: %0.3f (+/- %0.3f)\n" % (score.mean(), score.std() * 2))


Cross Validated R2 scores: 
 [0.7        0.77142857 0.71014493 0.72463768 0.65217391]

Overall Ridge Regression R2: 0.712 (+/- 0.077)



## Lasso Logistic Regression Model

In [None]:
lasso_list = pd.DataFrame(columns=['lambda', 'LassoR2'])

lambs = [1e-15,1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20, 100, 1000, 10000, 100000, 1e10, 1e15]

for i in lambs:
    lasso = LogisticRegression(penalty='l1', C=i)
    lasso.fit(X, y)
    
    scores = pd.DataFrame([[i, lasso.score(X, y)]],
                           columns=['lambda', 'LassoR2'])
    lasso_list = lasso_list.append(scores, ignore_index=True)