In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import linear_model
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
# Read the raw crime table, skipping the first four lines of the CSV
# (presumably title/preamble rows above the header - confirm against the file).
df = pd.read_csv(filepath_or_buffer='NYState2013orig.csv', skiprows=4)
df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,


In [3]:
# A few Population entries (e.g. row 278) are not actually strings.
# Coerce every value to str so the string clean-up that follows applies uniformly.
df['Population'] = df['Population'].apply(str)

In [4]:
# Strip the thousands separators from Population so the values can be parsed as integers.
df['Population'] = df['Population'].str.replace(',', '')

In [5]:
# Remove the bottom three rows of the dataframe that are footnotes.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells modify `df` itself rather than a slice of the originally loaded
# frame (which can trigger pandas' SettingWithCopyWarning).
df = df.iloc[:-3].copy()

In [6]:
# Parse the cleaned Population strings into integers.
df['Population'] = df['Population'].apply(int)

In [7]:
# Add the squared population as an extra candidate feature.
df['Population_sq'] = df['Population'].pow(2)

In [45]:
# Create Property Crime per capita column.
# This cell sits before the column-renaming and Property_crime clean-up cells, so on a
# top-to-bottom run it cannot rely on `df.Property_crime` existing or being numeric yet.
# Locate the property-crime column under either its raw header ('Property\ncrime') or its
# renamed header ('Property_crime'), and parse a numeric copy without touching the column.
prop_crime_col = next(
    col for col in df.columns
    if col.replace('\n', '_').replace(' ', '_') == 'Property_crime'
)
# astype(str) lets the comma stripping work whether the values are still raw strings
# or have already been converted to integers.
prop_crime_counts = df[prop_crime_col].astype(str).str.replace(',', '').apply(int)
df['prop_crime_per_capita'] = prop_crime_counts / df.Population

In [8]:
# Strip the thousands separators from Robbery so the values can be parsed as integers.
df['Robbery'] = df['Robbery'].str.replace(',', '')

In [9]:
# Parse the cleaned Robbery strings into integers.
df['Robbery'] = df['Robbery'].apply(int)

In [10]:
# Keep the raw robbery counts in RobberyNum, because Robbery itself is
# turned into a 0/1 indicator in the cells that follow.
df['RobberyNum'] = df['Robbery'].copy()

In [11]:
# Collapse Robbery to an indicator: every positive count becomes 1, other values are left as-is.
# A single boolean-mask assignment replaces the row-by-row Python loop.
df.loc[df['Robbery'] > 0, 'Robbery'] = 1

In [12]:
# Preview the first few Robbery values after the thresholding step.
df.Robbery.head()

0    0
1    0
2    0
3    1
4    1
Name: Robbery, dtype: int64

In [13]:
def convert_value(num):
    """Map a number to a 0/1 indicator.

    Returns 1 when ``num`` is strictly greater than zero and 0 otherwise
    (zero, negative values, and anything for which ``num > 0`` is false).
    """
    return 1 if num > 0 else 0

In [14]:
# Make Robbery categorical: 1 where at least one robbery was recorded, 0 otherwise.
df['Robbery'] = df['Robbery'].apply(convert_value)

In [15]:
# Preview Robbery again after applying convert_value.
df.Robbery.head()

0    0
1    0
2    0
3    1
4    1
Name: Robbery, dtype: int64

In [16]:
# Replace the spaces in every column name with underscores, which tidies up
# long headers such as "Murder and nonnegligent manslaughter".
df.columns = df.columns.str.replace(' ', '_')

In [17]:
# Inspect the column names after replacing spaces with underscores.
df.columns

Index(['City', 'Population', 'Violent\ncrime',
       'Murder_and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3', 'Population_sq',
       'RobberyNum'],
      dtype='object')

In [18]:
# The headers also contain embedded newlines; turn those into underscores as well
# so every column name is a single-line identifier-like string.
df.columns = df.columns.str.replace('\n', '_')

In [19]:
# Inspect the column names again after replacing the embedded newlines.
df.columns

Index(['City', 'Population', 'Violent_crime',
       'Murder_and_nonnegligent_manslaughter', 'Rape_(revised_definition)1',
       'Rape_(legacy_definition)2', 'Robbery', 'Aggravated_assault',
       'Property_crime', 'Burglary', 'Larceny-_theft', 'Motor_vehicle_theft',
       'Arson3', 'Population_sq', 'RobberyNum'],
      dtype='object')

In [20]:
# Make the murder column categorical: 1 where at least one case was recorded, 0 otherwise.
murder_col = 'Murder_and_nonnegligent_manslaughter'
df[murder_col] = df[murder_col].apply(convert_value)

In [21]:
# Preview the frame with the renamed columns and the 0/1 murder indicator.
df.head()

Unnamed: 0,City,Population,Violent_crime,Murder_and_nonnegligent_manslaughter,Rape_(revised_definition)1,Rape_(legacy_definition)2,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny-_theft,Motor_vehicle_theft,Arson3,Population_sq,RobberyNum
0,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.0,3463321,0
1,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.0,6640929,0
2,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.0,8099716,0
3,Albany,97956,791,1,,30,1,526,4090,705,3243,142,,9595377936,227
4,Albion Village,6388,23,0,,3,1,16,223,53,165,5,,40806544,4


In [22]:
# Narrow the frame to the city name, the candidate features and the property-crime outcome.
df_clean = df[[
    'City',
    'Population',
    'Population_sq',
    'Robbery',
    'Murder_and_nonnegligent_manslaughter',
    'Property_crime',
]]

In [23]:
# Preview the reduced frame.
df_clean.head()

Unnamed: 0,City,Population,Population_sq,Robbery,Murder_and_nonnegligent_manslaughter,Property_crime
0,Adams Village,1861,3463321,0,0,12
1,Addison Town and Village,2577,6640929,0,0,24
2,Akron Village,2846,8099716,0,0,16
3,Albany,97956,9595377936,1,1,4090
4,Albion Village,6388,40806544,1,0,223


In [24]:
# Feature matrix for the first model: population, its square, and the two 0/1 crime indicators.
df_features = df[[
    'Population',
    'Population_sq',
    'Robbery',
    'Murder_and_nonnegligent_manslaughter',
]]

In [25]:
# Strip the thousands separators from Property_crime so the values can be parsed as integers.
df['Property_crime'] = df['Property_crime'].str.replace(',', '')

In [26]:
# Parse the cleaned Property_crime strings into integers.
df['Property_crime'] = df['Property_crime'].apply(int)

In [27]:
# Regression target: the number of property crimes.
df_target = df.Property_crime

In [28]:
# Ordinary least-squares fit of the target on the current feature matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(df_features, df_target)

# Report the fitted parameters.
print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

# R squared is scored on the same data the model was fitted on (in-sample).
print('R squared: \n', regr.score(df_features, df_target))


Coefficients: 
 [  3.46570268e-02  -2.11108019e-09  -9.62774363e+01   1.51866535e+01]
Intercept: 
 -109.575335623
R squared: 
 0.996124710499


In [30]:
# Preview the feature matrix that uses the 0/1 Robbery indicator.
df_features.head()

Unnamed: 0,Population,Population_sq,Robbery,Murder_and_nonnegligent_manslaughter
0,1861,3463321,0,0
1,2577,6640929,0,0
2,2846,8099716,0,0
3,97956,9595377936,1,1
4,6388,40806544,1,0


In [36]:
# Second feature set: use the raw robbery count (RobberyNum) in place of the 0/1 Robbery indicator.
df_features = df[[
    'Population',
    'Population_sq',
    'Murder_and_nonnegligent_manslaughter',
    'RobberyNum',
]]

In [37]:
# Preview the feature matrix that uses the raw robbery counts.
df_features.head()

Unnamed: 0,Population,Population_sq,Murder_and_nonnegligent_manslaughter,RobberyNum
0,1861,3463321,0,0
1,2577,6640929,0,0
2,2846,8099716,0,0
3,97956,9595377936,1,227
4,6388,40806544,0,4


In [39]:
# Ordinary least-squares fit of the target on the current feature matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(df_features, df_target)

# Report the fitted parameters.
print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

# R squared is scored on the same data the model was fitted on (in-sample).
print('R squared: \n', regr.score(df_features, df_target))

Coefficients: 
 [  1.40623373e-02  -1.53307561e-09   6.70428655e+01   6.87936698e+00]
Intercept: 
 30.6866936661
R squared: 
 0.998524567525


In [40]:
# The raw-count version seems better, but Population_sq appears to add very little.
# Third feature set: drop Population_sq and refit.
df_features = df[[
    'Population',
    'Murder_and_nonnegligent_manslaughter',
    'RobberyNum',
]]

In [41]:
# Preview the feature matrix without Population_sq.
df_features.head()

Unnamed: 0,Population,Murder_and_nonnegligent_manslaughter,RobberyNum
0,1861,0,0
1,2577,0,0
2,2846,0,0
3,97956,1,227
4,6388,0,4


In [42]:
# Ordinary least-squares fit of the target on the current feature matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(df_features, df_target)

# Report the fitted parameters.
print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

# R squared is scored on the same data the model was fitted on (in-sample).
print('R squared: \n', regr.score(df_features, df_target))

Coefficients: 
 [ -1.92671540e-03   5.43653696e+02   8.22520859e+00]
Intercept: 
 193.561889846
R squared: 
 0.997132425624


In [48]:
# The raw number of property crimes may be explained largely by population size alone,
# so switch the target to property crime per capita and go back to the four-feature set.
df_target = df.prop_crime_per_capita
df_features = df[[
    'Population',
    'Population_sq',
    'Murder_and_nonnegligent_manslaughter',
    'RobberyNum',
]]

In [49]:
# Ordinary least-squares fit of the target on the current feature matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(df_features, df_target)

# Report the fitted parameters.
print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

# R squared is scored on the same data the model was fitted on (in-sample).
print('R squared: \n', regr.score(df_features, df_target))

Coefficients: 
 [ -3.31774900e-08  -5.29787278e-15   6.34752530e-03   3.35960989e-05]
Intercept: 
 0.0185447520951
R squared: 
 0.0668868715974


In [None]:
# Evidently this linear regression model explains very little of the variation in
# property crime per capita (R squared is only about 0.07).