In [12]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [13]:
# Read the raw offenses table from the supporting-files CSV into df_raw.
data_path = "Unit_2-Supporting_Files/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv"
df_raw = pd.read_csv(data_path, sep=',')


In [14]:
# Preview the first five rows of the raw frame.
df_raw.head(n=5)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Unnamed: 13
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0,
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0,
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0,
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,,


In [15]:
# Inspect the raw column labels; the output shows embedded newlines in several.
df_raw.columns

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3', 'Unnamed: 13'],
      dtype='object')

In [16]:
# Working frame: drop the revised-definition rape column and the trailing
# 'Unnamed: 13' column, remove the last three (unwanted) rows by label, and
# copy so that df_raw itself is left untouched.
df = (
    df_raw
    .drop(['Rape\n(revised\ndefinition)1', 'Unnamed: 13'], axis='columns')
    .drop(df_raw.index[-3:], axis='index')
    .copy()
)

In [17]:
# Give the columns short, newline-free names so they can be used as
# attributes (df.Population, df.Robbery, ...) in the cells that follow.
# rename() returns a new frame that is bound back to `df`, so nothing is
# mutated in place. The revised-definition rape column is already dropped
# before this cell runs, so it needs no entry in the mapping.
df = df.rename(columns={
    'Violent\ncrime': 'Violent_Crime',
    'Murder and\nnonnegligent\nmanslaughter': 'Murder_Manslaughter',
    'Rape\n(legacy\ndefinition)2': 'Rape_2',
    'Aggravated\nassault': 'Aggravated_Assault',
    'Property\ncrime': 'Property_Crime',
    'Larceny-\ntheft': 'Larceny_Theft',
    'Motor\nvehicle\ntheft': 'MV_Theft',
})
df.columns

Index(['City', 'Population', 'Violent_Crime', 'Murder_Manslaughter', 'Rape_2',
       'Robbery', 'Aggravated_Assault', 'Property_Crime', 'Burglary',
       'Larceny_Theft', 'MV_Theft', 'Arson3'],
      dtype='object')

In [18]:
# Preview the first five rows of the renamed working frame.
df.head(n=5)

Unnamed: 0,City,Population,Violent_Crime,Murder_Manslaughter,Rape_2,Robbery,Aggravated_Assault,Property_Crime,Burglary,Larceny_Theft,MV_Theft,Arson3
0,Adams Village,1861,0,0.0,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0.0,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0.0,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8.0,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0.0,3,4,16,223,53,165,5,


In [19]:
# Population-squared feature.

## Population and Robbery are handled as strings containing thousands
## separators, so strip the commas before casting. Going through astype(str)
## keeps this cell re-runnable: on a second run the columns are already
## integers and the .str accessor would otherwise raise.
df['Population'] = df['Population'].astype(str).str.replace(',', '')
df['Robbery'] = df['Robbery'].astype(str).str.replace(',', '')

## Cast to an explicit 64-bit integer. Plain `int` can resolve to a 32-bit
## dtype on some platforms (e.g. NumPy < 2.0 on Windows), where squaring any
## population above 46,340 -- Albany's 97,956 already qualifies -- would
## silently overflow.
df['Population'] = df['Population'].astype('int64')
df['Murder_Manslaughter'] = df['Murder_Manslaughter'].astype('int64')
df['Robbery'] = df['Robbery'].astype('int64')

## Square the population
df['Population_Squared_feat'] = df['Population'] * df['Population']

In [20]:
# Binary indicator features for murder and robbery: 1 when the recorded
# count is nonzero, 0 when it is exactly zero.
df['Murder_Manslaughter_feat'] = (df['Murder_Manslaughter'] != 0).astype(int)
df['Robbery_feat'] = (df['Robbery'] != 0).astype(int)

In [21]:
# Summarize each column's dtype and non-null count in the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 0 to 347
Data columns (total 15 columns):
City                        348 non-null object
Population                  348 non-null int64
Violent_Crime               348 non-null object
Murder_Manslaughter         348 non-null int64
Rape_2                      348 non-null object
Robbery                     348 non-null int64
Aggravated_Assault          348 non-null object
Property_Crime              348 non-null object
Burglary                    348 non-null object
Larceny_Theft               348 non-null object
MV_Theft                    348 non-null object
Arson3                      187 non-null float64
Population_Squared_feat     348 non-null int64
Murder_Manslaughter_feat    348 non-null int64
Robbery_feat                348 non-null int64
dtypes: float64(1), int64(6), object(8)
memory usage: 43.5+ KB
