In [1]:
import re

import pandas as pd
# Allow up to 999 rows to be rendered when a frame is displayed.
pd.options.display.max_rows = 999

import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" theme for subsequent plots.
sns.set_style("whitegrid")

In [2]:
# Load the raw offences table into `crime`.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# only run where this exact file location exists.
# NOTE(review): `warn_bad_lines` was removed in pandas 2.0 (superseded by
# `on_bad_lines`); this call targets an older pandas release.
crime = pd.read_csv(
    '/Users/guest/Dropbox/Education/Thinkful/Unit 2/L4 - Linear Regression/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013 - 13tbl8ny.csv',
    encoding="utf-8",
    engine='python',        # skipfooter is not supported by the C engine
    header=4,               # column names come from row index 4 (zero-based)
    skipfooter=3,           # ignore the last three lines of the file
    skip_blank_lines=True,
    warn_bad_lines=True,
)

### Cleaning the data

In [3]:
# Tidy up the column names: trim, lower-case, then apply literal substitutions.
# Python's own ``str.replace`` is used on each name so that '(' and ')' are
# always treated as literal characters; the regex default of
# ``Index.str.replace`` differs between pandas versions.
replacement_definitions = {' ': '_',
                           '(': '',
                           ')': '',
                           '\n': '_',
                           '3': '',
                           '-': ''}


def _tidy_column_name(name):
    """Return ``name`` stripped, lower-cased, with each substitution applied in order."""
    name = name.strip().lower()
    for old, new in replacement_definitions.items():
        name = name.replace(old, new)
    return name


def _to_float(value):
    """Convert a single cell to float, removing commas from string values first."""
    if isinstance(value, str):
        value = value.replace(',', '')
    return float(value)


crime.columns = [_tidy_column_name(name) for name in crime.columns]

# Set city as the index
crime = crime.set_index('city')

# Replace nans with 0s
crime = crime.fillna(value=0)

# Convert all values to floats. The conversion is decided per cell rather than
# per column, so a column that mixes comma-formatted strings with the numeric
# 0 written by fillna converts cleanly. A value that cannot be parsed still
# raises instead of being hidden behind a bare ``except``.
for col in crime.columns:
    crime[col] = crime[col].map(_to_float)

In [4]:
# Keep the legacy-definition rape counts under the shorter name 'rape', then
# discard both original rape columns (the revised-definition one is empty).
crime['rape'] = crime.loc[:, 'rape_legacy_definition2']
crime = crime.drop(columns=['rape_revised_definition1', 'rape_legacy_definition2'])

In [5]:
# Summary statistics for every column, to get a feel for scale and spread.
crime.describe()

Unnamed: 0,population,violent_crime,murder_and_nonnegligent_manslaughter,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson,rape
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,40037.63,201.594828,1.566092,72.902299,121.261494,792.606322,119.683908,637.017241,35.905172,1.005747,5.864943
std,450037.4,2815.268504,18.303673,1031.032873,1706.13173,7659.724746,924.948789,6346.054451,403.423826,7.884612,60.425452
min,526.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3003.0,2.0,0.0,0.0,1.0,40.5,6.0,31.0,0.0,0.0,0.0
50%,7233.5,6.0,0.0,1.0,4.0,112.5,17.5,94.0,2.0,0.0,0.0
75%,18427.5,22.0,0.0,5.0,14.0,341.0,51.25,287.25,7.0,0.0,2.0
max,8396126.0,52384.0,335.0,19170.0,31767.0,141971.0,16606.0,117931.0,7434.0,132.0,1112.0


__Everything has ridiculously high variance – I'm going to look for outliers.__

In [6]:
def describe_outliers(frame, n_std=2):
    """Print, column by column, the rows that lie far from the column mean.

    For each column of ``frame`` a header line ``'<column> outliers:'`` is
    printed (preceded by a blank line), followed by the rows whose value is
    below ``mean - n_std * std`` and then the rows whose value is above
    ``mean + n_std * std``. Only the column under inspection is shown for the
    reported rows. Nothing follows the header when no row qualifies.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect; every column must support ``mean()`` and ``std()``.
    n_std : int or float, default 2
        Number of standard deviations from the mean beyond which a value is
        reported as an outlier.

    Returns
    -------
    None
        The result is written to standard output only.
    """
    for variable in frame.columns:
        values = frame[variable]
        center = values.mean()
        spread = values.std() * n_std

        print('\n{} outliers:'.format(variable))
        # Low outliers are reported before high ones.
        for is_outlier in (values < center - spread, values > center + spread):
            outliers = frame.loc[is_outlier, [variable]]
            if not outliers.empty:
                print(outliers)

In [7]:
# List, for each column, the cities lying far from that column's mean.
describe_outliers(crime)


population outliers:
          population
city                
New York   8396126.0

violent_crime outliers:
          violent_crime
city                   
New York        52384.0

murder_and_nonnegligent_manslaughter outliers:
           murder_and_nonnegligent_manslaughter
city                                           
Buffalo                                    47.0
New York                                  335.0
Rochester                                  42.0

robbery outliers:
          robbery
city             
New York  19170.0

aggravated_assault outliers:
          aggravated_assault
city                        
New York             31767.0

property_crime outliers:
          property_crime
city                    
New York        141971.0

burglary outliers:
           burglary
city               
Buffalo      3458.0
New York    16606.0
Rochester    2587.0

larceny_theft outliers:
          larceny_theft
city                   
New York       117931.0

motor_vehicle_theft o

__NYC is a dramatic outlier in every category; I'm going to drop it.__

In [8]:
# Remove the New York row (cities are the index labels).
crime = crime.drop(index='New York')

In [9]:
# Recompute the summary statistics on the current state of the frame.
crime.describe()

Unnamed: 0,population,violent_crime,murder_and_nonnegligent_manslaughter,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson,rape
count,347.0,347.0,347.0,347.0,347.0,347.0,347.0,347.0,347.0,347.0,347.0
mean,15956.685879,51.213256,0.605187,17.867435,30.063401,385.752161,72.172911,298.994236,14.585014,1.008646,2.677233
std,27080.218837,236.667435,3.70709,94.972492,128.783376,1034.369072,264.941381,715.232296,67.682236,7.895813,10.74102
min,526.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2997.0,2.0,0.0,0.0,1.0,40.0,6.0,31.0,0.0,0.0,0.0
50%,7187.0,6.0,0.0,1.0,4.0,112.0,17.0,94.0,2.0,0.0,0.0
75%,18160.5,21.5,0.0,5.0,14.0,340.5,51.0,284.5,7.0,0.0,2.0
max,258789.0,3249.0,47.0,1322.0,1735.0,12491.0,3458.0,8076.0,957.0,132.0,145.0


In [10]:
# Look for outliers again on the current state of the frame.
describe_outliers(crime)


population outliers:
                  population
city                        
Albany               97956.0
Amherst Town        118296.0
Buffalo             258789.0
Cheektowaga Town     78361.0
Clarkstown Town      80705.0
Colonie Town         78215.0
Greece Town          96667.0
New Rochelle         78800.0
Ramapo Town          87204.0
Rochester           210562.0
Syracuse            143834.0
Yonkers             199134.0

violent_crime outliers:
               violent_crime
city                        
Albany                 791.0
Buffalo               3249.0
Mount Vernon           554.0
Niagara Falls          584.0
Rochester             2107.0
Schenectady            607.0
Syracuse              1192.0
Yonkers               1036.0

murder_and_nonnegligent_manslaughter outliers:
                   murder_and_nonnegligent_manslaughter
city                                                   
Buffalo                                            47.0
Hempstead Village                        

In general, the remaining outliers for each type of crime are also the places with the highest populations. They're also not as far off the mean as New York was. If the model underperforms, I'll circle back and think more about what to do with them. Otherwise I think I'm probably fine.

### Recoding to specification

$\text{Property crime} = \alpha + \text{Population} + \text{Population}^2 + \text{Murder} + \text{Robbery}$


In [11]:
# Create a function that will transform murder and robbery from continuous to categorical variables.

def cont_to_cat(x):
    """Collapse a number into a binary indicator: 1 if ``x`` is above zero, otherwise 0."""
    return 1 if x > 0 else 0

In [12]:
# Squared population term
crime['population_sq'] = crime['population'].pow(2)

# Binary indicators: 1 where the recorded count is above zero, otherwise 0
crime['murder_cat'] = crime['murder_and_nonnegligent_manslaughter'].map(cont_to_cat)
crime['robbery_cat'] = crime['robbery'].map(cont_to_cat)

In [14]:
# Keep only the model inputs and the target (property_crime).
crime_clean = crime.loc[:, ['population', 'population_sq', 'murder_cat',
                            'robbery_cat', 'property_crime']]

In [16]:
# Preview the first rows only: with display.max_rows raised to 999, a bare
# `crime_clean` would render the entire frame.
crime_clean.head(10)

Unnamed: 0_level_0,population,population_sq,murder_cat,robbery_cat,property_crime
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adams Village,1861.0,3463321.0,0,0,12.0
Addison Town and Village,2577.0,6640929.0,0,0,24.0
Akron Village,2846.0,8099716.0,0,0,16.0
Albany,97956.0,9595378000.0,1,1,4090.0
Albion Village,6388.0,40806540.0,0,1,223.0
Alfred Village,4089.0,16719920.0,0,1,46.0
Allegany Village,1781.0,3171961.0,0,0,10.0
Amherst Town,118296.0,13993940000.0,1,1,2118.0
Amityville Village,9519.0,90611360.0,0,1,210.0
Amsterdam,18182.0,330585100.0,0,1,405.0
