In [52]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import scale
import statsmodels.api as sm
from sklearn import linear_model
%matplotlib inline
sns.set_style('white')

In [53]:
PATH = 'unit3_data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2013.csv'

#load data while skipping the first three and the last 10 rows
data = pd.read_csv(PATH, skiprows=range(0,3))[:-10].copy()

#drop the State column and the the tail empty columns
data.drop(['State'], 1, inplace=True)
data = data.iloc[:, : -3].copy()

In [54]:
#collect the old colun names and print it somewhere, edit and append it back to df
col_lst = data.columns
col_lst = ['City',
         'Population',
         'Violent_crime',
         'Murder_and_nonnegligent_manslaughter',
         'Rape_revised_definition_1',
         'Rape_legacy_definition_2',
         'Robbery',
         'Aggravated_assault',
         'Property_crime',
         'Burglary',
         'Larceny_theft',
         'Motor_vehicle_theft',
         'Arson_3']

data.columns = col_lst
data.head()

Unnamed: 0,City,Population,Violent_crime,Murder_and_nonnegligent_manslaughter,Rape_revised_definition_1,Rape_legacy_definition_2,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny_theft,Motor_vehicle_theft,Arson_3
0,Abbeville,2645,11,1.0,1,,2,7,63,21,39,3,
1,Adamsville,4481,19,1.0,0,,7,11,321,58,252,11,
2,Addison,744,1,0.0,1,,0,0,25,6,17,2,
3,Alabaster,31170,44,0.0,2,,11,31,640,70,544,26,
4,Alexander City,14692,119,2.0,16,,12,89,661,121,510,30,


In [55]:
#I will drop the 'Rape_legacy_definition_2' and Rape_revised_definition_1 and Arson_3 cloums as most of the rows have no data
data.drop(['Rape_legacy_definition_2', 'Rape_revised_definition_1', 'Arson_3'], 1, inplace=True)
data.dropna(inplace=True)

## Outcome variable: Murder_and_nonnegligent_manslaughter
We will convert `Murder_and_nonnegligent_manslaughter` to binary and use it as the outcome variable in out model.

In [56]:
target = data.Murder_and_nonnegligent_manslaughter
Y = (np.where(target>0,1,0)).reshape(-1,1) 

In [57]:
#create X from data, we will also drop the City col since logistic regression count deal with strings
x = data.drop(['Murder_and_nonnegligent_manslaughter', 'City'], 1)
headers = list(x.columns)
for header in headers:
   x[header] = x[header].str.replace(',', '') 

In [58]:
#we will now scale x usign sklearn scale method
sc = scale
X = pd.DataFrame(sc(x))

  This is separate from the ipykernel package so we can avoid doing imports until


#### Now that we have our independent(X) and categorical outcome variable(Y), let's run the model

## Vanilla logistic regression

### First we will implement the midel using stats model

In [59]:
# Declare predictors.
Y = pd.DataFrame(Y)

# The Statsmodels formulation requires a column with constant value 1 that
# will act as the intercept.
X['intercept'] = 1 

# Declare and fit the model.
logit = sm.Logit(Y, X)
result = logit.fit()

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is missing.
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.322596
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                      0   No. Observations:                 9281
Model:                          Logit   Df Residuals:                     9273
Method:                           MLE   Df Model:                            7
Date:                Sun, 20 Jan 2019   Pseudo R-squ.:                  0.3345
Time:                        11:17:58   Log-Likelihood:                -2994.0
converged:                       True   LL-Null:                       -4499.0
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
0              0.4671      0.348      1.343      0.179      -0.215       1.149
1             50.4257      5

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


##### Let's now see what the accuracy of this model looks.

In [60]:
# Calculate accuracy. First, get probability that each row will be admitted.
pred_statsmod = result.predict(X)

# Since our Y_statsmod mode is binary, will code admission as 1 if probability is greater than .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Create accuracy table.
table = pd.crosstab(Y[0], pred_y_statsmod)
print('\n Accuracy by admission status')
print(table, '\n')

#Calculate accuracy
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))


 Accuracy by admission status
col_0     0    1
0               
0      7356  171
1       966  788 

0.8774916496067234


### Now with sklearn, using cross validation

In [65]:
from sklearn.model_selection import cross_val_score
lr = linear_model.LogisticRegression(C=1e9)

In [66]:
print(cross_val_score(lr, X, Y, cv=5))

[0.87075929 0.87399031 0.87284483 0.86422414 0.87654987]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


#### Both logistic model results are excellent, we will now use Ridge and Lasso regression.

In [76]:
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False)
print(cross_val_score(ridgeregr, X, Y, cv=5))

[ 0.02340618  0.10576579 -0.03351495  0.10231475  0.00401898]


In [77]:
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr.fit(X, Y)

Ridge(alpha=10, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [78]:
print(ridgeregr.score(X, Y))

0.11891484593753943


# I will come back to this assignment later to get it completed after identifying a more suitable dataset...