In [1]:
import math
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xlrd
from matplotlib.mlab import PCA as mlabPCA
from sklearn import linear_model
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)


In [2]:
def my_pre_processing(raw, pop_cutoff):

    raw.columns = ['City','Population','Violent_Crime','Murder','Rape1','Rape2','Robbery','Assault',
                   'Property_Crime','Burglary','Larceny_Theft','Vehicle_Theft','Arson']
    df = pd.DataFrame(raw)


    #Data Cleaning
    df.at[df['Rape2'].isna() , 'Rape2'] =  df['Rape1']  
    df.drop(columns='Rape1', axis=1, inplace=True)
    df.dropna(subset=['Population'],inplace=True)
    df['Arson'] = np.where(df.Arson.isnull(),0,1)
    df = df[df.Population < pop_cutoff]

    #Feature Engineering
    df['binaryMurder'] = np.where(df.Murder > 0, 1, 0)
    df['binaryRobbery'] = np.where(df.Robbery > 0, 1, 0)
    df['binaryAssault'] = np.where(df.Assault > 0, 1, 0)
    df['binaryRape'] = np.where(df.Rape2 > 0, 1, 0)
    
    df['Murder_Sqrt'] = np.sqrt(df.Murder)
    df['Violent_Crime_Sqrt'] = df.Violent_Crime ** (1/2)
    df['Population_Square'] = np.square(df.Population)
    df['Population_Sqrt'] = np.sqrt(df.Population)
    df['Robbery_Sqrt'] = np.sqrt(df.Robbery)
    df['Assault_Sqrt'] = np.sqrt(df.Assault)
    df['Property_Crime_Sqrt'] = np.sqrt(df.Property_Crime)
    df['Property_Crime_Square'] = np.square(df.Property_Crime)
    df['Rape2_Sqrt'] = np.sqrt(df.Rape2)
    df['Robbery_Assault_Sum'] = df.Assault + df.Robbery
   
    df['RobberyPerHundred'] = df['Robbery'] / df['Population'] * 100
    df['AssaultPerHundred'] = df['Assault'] / df['Population'] * 100

    df['pop_size'] = pd.cut(raw.Population, [0, 3000, 15000, 75000, 200000, 1000000, 99999999999],  
                          labels=["verysmall",'small',"medium","large",'verylarge',"mill"])
    pop_dummies = pd.get_dummies(df['pop_size'])
    df = pd.merge(df, pop_dummies, left_index=True, right_index=True, how='inner')

    df['Violent_Crime'] = pd.cut(raw.Population, bins=3,  
                          labels=["violent_crime_low",'violent_crime_med',"violent_crime_high"])
    pop_dummies = pd.get_dummies(df['Violent_Crime'])
    df = pd.merge(df, pop_dummies, left_index=True, right_index=True, how='inner')
    
    df['Property_Crime_Per_Capita'] = df.Population / df.Property_Crime

    df['Property_Crime_Rate'] = pd.qcut(df.Property_Crime_Per_Capita, q=2, labels=[0,1])
    
    
    feature_list = ['binaryMurder','binaryRobbery','binaryAssault','binaryRape',
                    'verysmall','small','medium','large','verylarge','mill',
                   "violent_crime_low",'violent_crime_med',"violent_crime_high"]
    
    y_var = 'Property_Crime_Rate'
    

    print('Cities processing: {}'.format(df.shape[0]))
    #print('Cities Removed due to pop above {}: '.format(pop_cutoff))
    pop_remove_list = raw.loc[raw.Population > pop_cutoff,['City','Population']].sort_values(
        'Population', ascending=False)
    #if pop_remove_list.shape[0] == 0:
        #print('None')
    #else:
        #print(pop_remove_list)
    
  
    print('\nCities removed due to Null: ')
    null_list = df.isnull().any(axis=1)
    null_list = null_list[null_list.values==True]
    print(len(null_list))
    
    #print(df[df.isnull().any(axis=1)].index)  #,['City','Population']])
    df.dropna(inplace=True)

    X = df[feature_list]
    Y = df[y_var]
    #sns.boxplot(df.Population)
    return X, Y


def my_confusion_matrix(array_Expected,array_Predicted,colName):
    a = np.array(confusion_matrix(array_Expected, array_Predicted ))
    totalExpectedFalse = a[0,0] + a[0,1]
    totalExpectedTrue = a[1,0] + a[1,1]
    correctFalse = a[0,0] 
    correctTrue = a[1,1] 
    correctTruePct = np.round(correctTrue / totalExpectedTrue,3)
    correctFalsePct = np.round(correctFalse / totalExpectedFalse,3)
    print('Regarding {}, the model correctly predicted {} Negatives out of {} expected Negatives: {}'.format(
        colName,correctFalse,totalExpectedFalse,correctFalsePct))
    print('Regarding {}, the model correctly predicted {} Positives out of {} expected Positives: {}'.format(
        colName,correctTrue,totalExpectedTrue,correctTruePct))    
    
    print(a)


In [3]:
cv=10

file1 = 'table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls'
file2 = 'table_8_offenses_known_to_law_enforcement_georgia_by_city_2013.xls'
file3 = 'table_8_offenses_known_to_law_enforcement_illinois_by_city_2013.xls'
file4 = 'table_8_offenses_known_to_law_enforcement_california_by_city_2013.xls'
file5 = 'table_8_offenses_known_to_law_enforcement_utah_by_city_2013.xls'
file6 = 'table_8_offenses_known_to_law_enforcement_kentucky_by_city_2013.xls'
file7 = 'table_8_offenses_known_to_law_enforcement_colorado_by_city_2013.xls'
file8 = 'table_8_offenses_known_to_law_enforcement_alabama_by_city_2013.xls'
file9 = 'table_8_offenses_known_to_law_enforcement_washington_by_city_2013.xls'
file10 = 'table_8_offenses_known_to_law_enforcement_ohio_by_city_2013.xls'

train_files = [file1, file2, file3, file4, file5, file6, file7]
test_files = [file1, file2, file3, file8, file9, file10 ]
pop_cutoff = 999999999

colnames = ['City','Population','Violent_Crime','Murder','Rape1','Rape2','Robbery','Assault',
                   'Property_Crime','Burglary','Larceny_Theft','Vehicle_Theft','Arson']

raw = pd.DataFrame(columns=colnames)

print('----------------------------------------------------')
print('--------------------- TRAINING ---------------------')
print('----------------------------------------------------')
print('------- Pre-Process Training Data -----------')
for file in train_files:
    #print('file_train: {}'.format(file))   
    raw = pd.concat([raw, pd.read_excel(file, skiprows=4, names=colnames, usecols=12)],ignore_index=True )

train_X, train_Y = my_pre_processing(raw, pop_cutoff = pop_cutoff)



print('\n--------- Create Logistic Regression Model using SciKit default L2 (C=0) -----------')
lr = linear_model.LogisticRegression(penalty='l2', C=.0000000001 )
lr.fit(train_X, train_Y)

print('Accuracy: {}'.format(lr.score(train_X,train_Y)))
print('Intercept ' + str(lr.intercept_))
print(*list(lr.coef_), sep="\n")
score = cross_val_score(lr, train_X, train_Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))
my_confusion_matrix(array_Expected=train_Y, array_Predicted=lr.predict(train_X),colName='PropertyCrime')






print('\n--------- Create Logistic Regression Model using SciKit default L2 (C = 1-----------')
lr = linear_model.LogisticRegression(penalty='l2', C=1.0 )
lr.fit(train_X, train_Y)
print('Accuracy: {}'.format(lr.score(train_X,train_Y)))
print('Intercept ' + str(lr.intercept_))
print(*list(lr.coef_), sep="\n")
score = cross_val_score(lr, train_X, train_Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))
my_confusion_matrix(array_Expected=train_Y, array_Predicted=lr.predict(train_X),colName='PropertyCrime')


print('\n--------- Create Logistic Regression Model using SciKit L1 -----------')
lr = linear_model.LogisticRegression(penalty='l1')
lr.fit(train_X, train_Y)
print('Accuracy: {}'.format(lr.score(train_X,train_Y)))
print('Intercept ' + str(lr.intercept_))
print(*list(lr.coef_), sep="\n")
score = cross_val_score(lr, train_X, train_Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))
my_confusion_matrix(array_Expected=train_Y, array_Predicted=lr.predict(train_X),colName='PropertyCrime')




----------------------------------------------------
--------------------- TRAINING ---------------------
----------------------------------------------------
------- Pre-Process Training Data -----------
Cities processing: 1989

Cities removed due to Null: 
3

--------- Create Logistic Regression Model using SciKit default L2 (C=0) -----------
Accuracy: 0.5780463242698892
Intercept [2.49446633e-24]
[-1.15499991e-08 -1.94499984e-08 -1.09999991e-08 -1.71499986e-08
  1.04999991e-08 -1.64999987e-09 -3.94999968e-09 -3.79999969e-09
 -1.04999991e-09 -4.99999959e-11  2.49446633e-24 -4.99999959e-11
  4.99999959e-11]

Cross Validation Accuracy 10 folds: 0.58 (+/- 0.11)
Regarding PropertyCrime, the model correctly predicted 967 Negatives out of 993 expected Negatives: 0.974
Regarding PropertyCrime, the model correctly predicted 181 Positives out of 993 expected Positives: 0.182
[[967  26]
 [812 181]]

--------- Create Logistic Regression Model using SciKit default L2 (C = 1-----------
Accuracy: 