In [None]:
import re

import pandas as pd
pd.options.display.max_rows = 999

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mstats

sns.set_style("whitegrid")

In [None]:
# Path to the New York "Offenses Known to Law Enforcement by City, 2013" CSV
# (description taken from the file name).
# NOTE(review): this is an absolute path on one machine -- point it at your
# local copy of the file before running.
CRIME_CSV_PATH = '/Users/guest/Dropbox/Education/Thinkful/Unit 2/L4 - Linear Regression/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013 - 13tbl8ny.csv'

# `warn_bad_lines=True` was dropped from this call: it only matters when
# `error_bad_lines=False`, and the keyword no longer exists in pandas >= 2.0
# (passing it raises TypeError there).
crime = pd.read_csv(
    CRIME_CSV_PATH,
    header=4,               # 0-based row index holding the column names
    skipfooter=3,           # ignore the last three lines of the file
    engine='python',        # skipfooter is not supported by the C engine
    skip_blank_lines=True,
    encoding="utf-8",
)

### Cleaning the data

In [None]:
# Tidy up the column names: trim surrounding whitespace and lower-case them,
# then normalise punctuation with the literal substitutions below.
crime.columns = crime.columns.str.strip().str.lower()

# Literal substring -> replacement, applied in insertion order.
# NOTE(review): '3' is removed everywhere it appears in a column name
# (presumably a footnote marker in the raw header -- confirm against the CSV).
replacement_definitions = {' ':'_',
                           '(':'',
                           ')':'',
                           '\n':'_',
                           '3':'',
                           '-':''}
for old, new in replacement_definitions.items():
    # regex=False: '(' and ')' are regex metacharacters, so the match must be
    # literal and must not depend on the pandas-version-specific default.
    crime.columns = crime.columns.str.replace(old, new, regex=False)

# Set city as the index
crime = crime.set_index('city')

# Replace nans with 0s
crime = crime.fillna(value=0)

# Convert all values to floats.
# Text columns may contain thousands separators ("1,234") mixed with the
# numeric 0 inserted by fillna above, so every element of an object column is
# turned into a string before the commas are stripped. Any value that still
# cannot be parsed raises ValueError instead of being silently swallowed.
for col in crime.columns:
    values = crime[col]
    if values.dtype == object:
        values = values.astype(str).str.replace(',', '', regex=False)
    crime[col] = values.astype(float)

In [None]:
# The revised-definition rape column is empty (per the original note), so the
# legacy-definition counts are kept under the plain name `rape` and both
# source columns are discarded.
crime = (
    crime
    .assign(rape=crime['rape_legacy_definition2'])
    .drop(columns=['rape_revised_definition1', 'rape_legacy_definition2'])
)

In [None]:
# Summary statistics for every column of the cleaned frame
crime.describe()

__Everything has ridiculously high variance – I'm going to look for outliers.__

In [None]:
def describe_outliers(frame, n_std=2):
    """Print the rows of each column that lie far from that column's mean.

    For every column in ``frame`` a band of ``mean +/- n_std * std`` is
    computed; rows strictly below the band and rows strictly above it are
    printed under a ``"<column> outliers:"`` header. The header is printed
    even when a column has no outliers.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect; each column must support ``.mean()`` and ``.std()``.
    n_std : int or float, default 2
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    None
        The report is written to standard output.
    """
    for variable in list(frame):
        column = frame[variable]
        centre = column.mean()
        half_width = column.std() * n_std

        lower = centre - half_width
        upper = centre + half_width

        # Keep the result as a one-column DataFrame so the printed output
        # carries the column name as its header.
        low_outliers = frame.loc[column < lower, [variable]]
        high_outliers = frame.loc[column > upper, [variable]]

        print('\n{} outliers:'.format(variable))
        if not low_outliers.empty:
            print(low_outliers)
        if not high_outliers.empty:
            print(high_outliers)

In [None]:
# Raw variables whose distributions are inspected for outliers
continuous_variables = ['population', 'property_crime']

for var in continuous_variables:
    fig, ax = plt.subplots()
    # Pass the Series as `x=` explicitly: newer seaborn releases treat the
    # first positional argument as `data`, not as the x variable.
    sns.boxplot(x=crime[var], ax=ax)
    ax.set_title('Distribution of {}'.format(var))
    plt.show()

__There are some troublesome outliers (most notably NYC); I'm going to limit their influence by winsorizing the data.__

In [None]:
# Winsorize each raw variable with limits=0.05 and store the result next to
# the original column under a `_wins` suffix.
for raw_col in ('population', 'property_crime'):
    crime[raw_col + '_wins'] = mstats.winsorize(crime[raw_col], limits=0.05)

In [None]:
# Winsorized variables, re-plotted to check the effect on the outliers
continuous_variables = ['population_wins', 'property_crime_wins']

for var in continuous_variables:
    fig, ax = plt.subplots()
    # Pass the Series as `x=` explicitly: newer seaborn releases treat the
    # first positional argument as `data`, not as the x variable.
    sns.boxplot(x=crime[var], ax=ax)
    ax.set_title('Distribution of {}'.format(var))
    plt.show()

This looks much, much better to me.

### Recoding to specification

$$ \text{Property crime} = \alpha + \text{Population} + \text{Population}^2 + \text{Murder} + \text{Robbery} $$


In [None]:
# Helper used to turn the murder and robbery counts into 0/1 indicator variables.

def cont_to_cat(x):
    """Return 1 when ``x`` is strictly greater than zero, otherwise 0."""
    return 1 if x > 0 else 0

In [None]:
# Squared (winsorized) population term
crime['population_sq'] = crime['population_wins'].pow(2)

# Murder and robbery become 0/1 indicators: 1 when the count is above zero
indicator_sources = (
    ('murder_and_nonnegligent_manslaughter', 'murder_cat'),
    ('robbery', 'robbery_cat'),
)
for source_col, indicator_col in indicator_sources:
    crime[indicator_col] = crime[source_col].map(cont_to_cat)

In [None]:
# Keep only the columns needed for the regression specification.
# NOTE(review): the response selected here is the raw `property_crime`, not the
# `property_crime_wins` column created earlier -- confirm this is intended.
crime_clean = crime[['population_wins', 'population_sq', 'murder_cat', 'robbery_cat', 'property_crime']]

In [None]:
# Preview the first rows of the modelling frame
crime_clean.head()