In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
pd.options.display.float_format = '{:.3f}'.format
from sklearn.model_selection import cross_val_score
# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
# Load the 2013 and 2014 crime tables from CSV.
# skiprows=4 skips the first four lines of each file; skipfooter drops trailing
# lines (footnotes, plus enough of the 2014 file to give both frames the same
# number of rows).
# skipfooter is only implemented by the Python parser. Requesting it explicitly
# avoids the "Falling back to the 'python' engine" ParserWarning without
# changing what is parsed.
# NOTE(review): the .tail() outputs suggest skipfooter=28 also discards real
# 2014 rows (2014 ends at "Waterloo Village", 2013 at "Yorktown Town").
# Scoring a fitted model does not need equal-length frames - confirm the
# truncation is intended.
df2013_raw = pd.read_csv('NYState2013orig.csv', skiprows=4, skipfooter=3, engine='python')
df2014_raw = pd.read_csv('NYState2014.csv', skiprows=4, skipfooter=28, engine='python')

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# Peek at the last five rows of the 2013 frame to check the footer was trimmed.
df2013_raw.tail(n=5)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
343,Woodbury Town,10685,3,0,,0,2,1,541,9,529,3,
344,Woodridge Village,829,7,0,,0,0,7,17,8,9,0,0.0
345,Woodstock Town,5931,2,0,,0,0,2,58,13,45,0,
346,Yonkers,199134,1036,6,,25,390,615,2368,470,1662,236,10.0
347,Yorktown Town,36643,15,0,,0,2,13,334,45,287,2,


In [4]:
# Peek at the last five rows of the 2014 frame to check the footer was trimmed.
df2014_raw.tail(n=5)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
343,Walton Village4,2980,10,0,1.0,,1,8,75,15,59,1,2.0
344,Warsaw Village4,3397,5,0,1.0,,0,4,75,6,69,0,1.0
345,Washingtonville Village,5788,2,0,,0.0,0,2,59,5,53,1,0.0
346,Waterford Town and Village,8398,1,0,,0.0,1,0,58,14,44,0,0.0
347,Waterloo Village4,5185,13,0,2.0,,1,10,154,22,127,5,0.0


In [5]:
def convert_value(num):
    """Collapse a count into a binary indicator.

    Returns 1 when ``num`` is strictly greater than zero, and 0 for every
    other value (zero, negatives, and values such as NaN for which the
    comparison ``num > 0`` is false).
    """
    return 1 if num > 0 else 0

In [6]:
def _commas_to_int(column):
    """Return ``column`` as int64 after removing thousands separators ('1,234' -> 1234)."""
    # ',' has no special regex meaning, so the outcome is the same whichever
    # default ``str.replace`` uses for its ``regex`` argument.
    # int64 is requested explicitly so squaring a large population cannot
    # overflow on platforms whose default integer is 32-bit.
    return column.astype(str).str.replace(',', '').astype('int64')


def process_crime_stats(df):
    """Clean a raw crime table and derive the modelling columns.

    The input frame is left untouched; all work happens on a copy. The
    original implementation mutated ``df`` in place, so running it a second
    time on the same frame overwrote ``RobberyNum`` / ``MurderNum`` with the
    already-binarised 0/1 flags.

    Steps:
      * spaces and newlines in column headings become underscores;
      * ``Population``, ``Property_crime`` and ``Robbery`` are converted from
        comma-formatted text to integers;
      * ``Population_sq`` holds the squared population;
      * ``RobberyNum`` / ``MurderNum`` keep the numeric counts, while
        ``Robbery`` and ``Murder_and_nonnegligent_manslaughter`` become 0/1
        indicators (1 when the count is greater than zero).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the columns ``Population``,
        ``Property crime``, ``Robbery`` and
        ``Murder and nonnegligent manslaughter`` (words separated by spaces
        or newlines).

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned and derived columns.

    Raises
    ------
    ValueError
        If a value in one of the comma-formatted columns cannot be parsed as
        an integer (for example a missing value).
    """
    out = df.copy()

    # Normalise headings first so every later lookup uses the final names.
    out.columns = [c.replace(' ', '_').replace('\n', '_') for c in out.columns]

    out['Population'] = _commas_to_int(out['Population'])
    out['Population_sq'] = out['Population'] ** 2
    out['Property_crime'] = _commas_to_int(out['Property_crime'])

    # Keep the numeric robbery count before collapsing the column to a flag.
    out['RobberyNum'] = _commas_to_int(out['Robbery'])
    out['Robbery'] = out['RobberyNum'].gt(0).astype('int64')

    # Same treatment for murder; the count column is copied as-is.
    murder = 'Murder_and_nonnegligent_manslaughter'
    out['MurderNum'] = out[murder]
    out[murder] = out[murder].gt(0).astype('int64')

    return out

In [7]:
# Clean the 2013 table and derive the modelling columns.
df2013_processed = df2013_raw.pipe(process_crime_stats)

In [8]:
# Clean the 2014 table with the same pipeline used for 2013.
df2014_processed = df2014_raw.pipe(process_crime_stats)

In [9]:
# Predictor matrix for 2013: population terms plus, for robbery and murder,
# both the 0/1 indicator and the numeric count.
df2013_features = df2013_processed.loc[:, [
    'Population',
    'Population_sq',
    'Robbery',
    'RobberyNum',
    'Murder_and_nonnegligent_manslaughter',
    'MurderNum',
]]

In [10]:
# Response variable for 2013: the property-crime count.
df2013_target = df2013_processed.loc[:, 'Property_crime']

In [11]:
# Predictor matrix for 2014, with the same columns in the same order as the
# 2013 training matrix so the fitted coefficients line up.
df2014_features = df2014_processed.loc[:, [
    'Population',
    'Population_sq',
    'Robbery',
    'RobberyNum',
    'Murder_and_nonnegligent_manslaughter',
    'MurderNum',
]]

In [12]:
# Response variable for 2014: the property-crime count.
df2014_target = df2014_processed.loc[:, 'Property_crime']

In [13]:
print('Cross-validating a Linear Regression model between two different data sets: \n')

# Linear regression trained on the 2013 data only.
regr = linear_model.LinearRegression()
regr.fit(df2013_features, df2013_target)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)

# In-sample fit: score() predicts from the given features and compares the
# predictions with the supplied targets.
r2_train_2013 = regr.score(df2013_features, df2013_target)
print('R squared of model on 2013 training data: \n', r2_train_2013)

# Out-of-sample fit: the 2013 model scored against the 2014 data.
r2_test_2014 = regr.score(df2014_features, df2014_target)
print('R squared of model on 2014 test data: \n', r2_test_2014)

Cross-validating a Linear Regression model between two different data sets: 

Coefficients: 
 [  1.52130152e-02  -9.44818779e-10   1.04837375e+02   2.07765121e+00
  -6.46504116e+01   1.22380130e+02]
Intercept: 
 -21.1482148138
R squared of model on 2013 training data: 
 0.998780265433
R squared of model on 2014 test data: 
 0.998868521775
