## 2.4.2. Multivariable regression

Prepare this data to model with multivariable regression (including data cleaning if necessary) according to this specification:
    
$$ Property crime = \alpha + Population + Population^2 + Murder + Robbery$$



In [96]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [97]:
path = ("/Users/gemma/Documents/data science/DS bootcamp/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv"
            )
df = pd.read_csv(path, delimiter= ',', encoding='latin-1',header=0)


In [98]:
# getting names of columns to access them more easily
# and renaming murder because the name is messy

colnames = df.columns.values
print(colnames)

# renamed it originally but later on wasn't necessary bc i dropped the columns
#df.rename({'Murder and\nnonnegligent\nmanslaughter' : 'Murder etc'}, axis='columns', inplace=True)


['City' 'Population' 'Violent\ncrime'
 'Murder and\nnonnegligent\nmanslaughter' 'Rape\n(revised\ndefinition)1'
 'Rape\n(legacy\ndefinition)2' 'Robbery' 'Aggravated\nassault'
 'Property\ncrime' 'Burglary' 'Larceny-\ntheft' 'Motor\nvehicle\ntheft'
 'Arson3']


In [99]:
df.head(3)


Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0


In [100]:
# adding binary features - murder (or robbery) > 0 = 1, else 0

df['Murder bin'] = np.where(df[colnames[3]]==0, 0, 1)
df['Robbery bin'] = np.where(df['Robbery'] == 0, 0, 1)

In [101]:
df.head(3)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Murder bin,Robbery bin
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0,0,1
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0,0,1
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0,0,1


In [103]:
# making the data frame i'll use for the regression

crime = df.drop(labels=colnames[2:13], axis=1)


In [104]:
# renaming the weirdly named columns

crime.rename({'Murder bin': 'Murder', 'Robbery bin':'Robbery'}, axis='columns',inplace=True)


In [106]:
# cleaning up population...

# getting rid of commas
crime['Population'] = crime['Population'].apply(lambda x: ''.join(list(filter(str.isdigit, str(x)))))

# there's at least one cell in Population with just a space, so filling it with NaN
crime = crime.replace(r'^\s*$', np.nan, regex=True)

# now dropping the row(s) that contain(s) the nan 
crime.dropna(axis=0, how='any', inplace=True)

# for some reason these two didnt work to convert Population to an integer
#pd.to_numeric(crime['Population'], errors='coerce', downcast='integer')
#crime.Population.astype(int,copy=False)

# finally! Population is an integer (checked in the cell below)
crime['Population'] = crime['Population'].astype(int)


# double checking
crime.dtypes


City          object
Population     int64
Murder         int64
Robbery        int64
dtype: object

In [107]:
crime.head(3)

Unnamed: 0,City,Population,Murder,Robbery
0,Adams Village,1861,0,1
1,Addison Town and Village,2577,0,1
2,Akron Village,2846,0,1


In [108]:
# adding pop ^2 bc i didnt do that yet

crime.insert(2, 'Popˆ2', crime['Population']**2, allow_duplicates=False)

In [109]:
crime.head(3)

Unnamed: 0,City,Population,Popˆ2,Murder,Robbery
0,Adams Village,1861,3463321,0,1
1,Addison Town and Village,2577,6640929,0,1
2,Akron Village,2846,8099716,0,1
