In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn import linear_model

#Allows us to do PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

data = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls',skiprows=4)
data.columns = ['City', 'Population', 'Violent Crime', 'Murder', 'Rape1', 'Rape2', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-theft', 'Motorvehicle Theft', 'Arson']
#Its called washdata but is actually the 2014 data set of new york
washdata = pd.read_excel('Table_8_Offenses_Known_to_Law_Enforcement_by_New_York_by_City_2014.xls',skiprows=4)
washdata.columns = ['City', 'Population', 'Violent Crime', 'Murder', 'Rape1', 'Rape2', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-theft', 'Motorvehicle Theft', 'Arson']
washdata.head(20)
data.isnull().any() #Is NaN present?
data = data.fillna(method='ffill') #propagate non-null values forward or backward.
washdata.isnull().any() 
washdata = data.fillna(method='ffill')

In [2]:
#Create 2 new columns, for whether or not data for murder and robbery are present
data['MurderExists'] = data['Murder'].apply(lambda x: 0 if x == 0.0 else 1)
data['RobberyExists'] = data['Robbery'].apply(lambda x: 0 if x == 0.0 else 1)
washdata['MurderExists'] = washdata['Murder'].apply(lambda x: 0 if x == 0.0 else 1)
washdata['RobberyExists'] = washdata['Robbery'].apply(lambda x: 0 if x == 0.0 else 1)

In [8]:
 #Gets rid of all rows where there is no murder/robbery for both data sets so we dont have to deal with NaN and 0s
newdata = data[~(data[['MurderExists','RobberyExists']] == 0).any(axis=1)]
newwashdata = washdata[~(washdata[['MurderExists','RobberyExists']] == 0).any(axis=1)]

In [9]:
pca = PCA(n_components=3)
pca.fit(newdata[['Murder','Robbery', 'Population']])
print(pca.explained_variance_ratio_)

regr = linear_model.LinearRegression()
Y = newdata['Property Crime'].values.reshape(-1, 1)
X = newdata[['Murder','Robbery', 'Population']]
regr.fit(X, Y)

print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))

[9.99999986e-01 1.38475056e-08 5.80984393e-12]

Coefficients: 
 [[1.73072834e+02 1.59493532e+00 6.33126084e-03]]

Intercept: 
 [292.07751661]

R-squared:
0.9991119530847022


In [11]:
pca = PCA(n_components=3)
pca.fit(newwashdata[['Murder','Robbery', 'Population']])
print(pca.explained_variance_ratio_)

wregr = linear_model.LinearRegression()
wY = newwashdata['Property Crime'].values.reshape(-1, 1)
wX = newwashdata[['Murder','Robbery', 'Population']]
wregr.fit(wX, wY)

print('\nCoefficients: \n', wregr.coef_)
print('\nIntercept: \n', wregr.intercept_)
print('\nR-squared:')
print(wregr.score(wX, wY))

[9.99999986e-01 1.38475056e-08 5.80984393e-12]

Coefficients: 
 [[1.73072834e+02 1.59493532e+00 6.33126084e-03]]

Intercept: 
 [292.07751661]

R-squared:
0.9991119530847022


dat= sm.datasets.get_rdataset("Property Crime", "Murder", "Robbery").data
linear_formula = 'Property ~ Murder+Robbery'
lm = smf.ols(formula=linear_formula, data=dat).fit()
lm.params
lm.pvalues
lm.rsquared