In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data_raw/2012

[0m[01;32mapr12.xls[0m*  [01;32mdec12.xls[0m*  [01;32mjan12.xls[0m*  [01;32mjun12.xls[0m*  [01;32mmay12.xls[0m*  [01;32moct12.xls[0m*
[01;32maug12.xls[0m*  [01;32mfeb12.xls[0m*  [01;32mjul12.xls[0m*  [01;32mmar12.xls[0m*  [01;32mnov12.xls[0m*  [01;32msep12.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data_raw/2012'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127771 entries, 0 to 127770
Data columns (total 10 columns):
Date             127771 non-null datetime64[ns]
Hour             127771 non-null int64
Offense Type     127771 non-null object
Beat             127771 non-null object
Premise          127771 non-null object
Block Range      127771 non-null object
Street Name      127771 non-null object
Type             127771 non-null object
Suffix           127771 non-null object
# Of Offenses    127771 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.7+ MB


In [5]:
df.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,Block Range,Street Name,Type,Suffix,# Of Offenses
0,2012-04-05,2,Murder,17E40,20R,6100-6199,CLARIDGE,DR,-,1
1,2012-04-04,22,Murder,3B10,18A,11700-11799,HEMPSTEAD,HWY,-,1
2,2012-04-01,23,Murder,19G10,20A,7500-7599,CORPORATE,DR,-,1
3,2012-04-21,23,Murder,8C10,20R,6200-6299,RIETTA,-,-,1
4,2012-04-01,6,Murder,3B10,20A,4200-4299,34TH,ST,W,1


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'Block Range',
       'Street Name', 'Type', 'Suffix', '# Of Offenses'],
      dtype='object')

In [8]:
df1.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,Block Range,Street Name,Type,Suffix,# Of Offenses
0,2012-04-05,2,Murder,17E40,20R,6100-6199,CLARIDGE,DR,-,1
1,2012-04-04,22,Murder,3B10,18A,11700-11799,HEMPSTEAD,HWY,-,1
2,2012-04-01,23,Murder,19G10,20A,7500-7599,CORPORATE,DR,-,1
3,2012-04-21,23,Murder,8C10,20R,6200-6299,RIETTA,-,-,1
4,2012-04-01,6,Murder,3B10,20A,4200-4299,34TH,ST,W,1


## Rename columns

In [9]:
df1.rename(columns={
    '# Of Offenses': '# offenses',
    'Block Range': 'BlockRange',
    'Street Name': 'StreetName',}, inplace=True)

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127771 entries, 0 to 127770
Data columns (total 10 columns):
Date            127771 non-null datetime64[ns]
Hour            127771 non-null int64
Offense Type    127771 non-null object
Beat            127771 non-null object
Premise         127771 non-null object
BlockRange      127771 non-null object
StreetName      127771 non-null object
Type            127771 non-null object
Suffix          127771 non-null object
# offenses      127771 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.7+ MB


In [11]:
df1.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,BlockRange,StreetName,Type,Suffix,# offenses
0,2012-04-05,2,Murder,17E40,20R,6100-6199,CLARIDGE,DR,-,1
1,2012-04-04,22,Murder,3B10,18A,11700-11799,HEMPSTEAD,HWY,-,1
2,2012-04-01,23,Murder,19G10,20A,7500-7599,CORPORATE,DR,-,1
3,2012-04-21,23,Murder,8C10,20R,6200-6299,RIETTA,-,-,1
4,2012-04-01,6,Murder,3B10,20A,4200-4299,34TH,ST,W,1


## create a subdataframe with the columns that we want

In [12]:
df12 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [13]:
df12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127771 entries, 0 to 127770
Data columns (total 8 columns):
Date            127771 non-null datetime64[ns]
Beat            127771 non-null object
BlockRange      127771 non-null object
StreetName      127771 non-null object
Offense Type    127771 non-null object
Premise         127771 non-null object
# offenses      127771 non-null int64
Hour            127771 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.8+ MB


In [14]:
df12.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
127766,2012-09-30,4F20,2300-2399,GESSNER,Theft,18G,1,23
127767,2012-09-30,2A30,2000-2099,ARLINGTON,Theft,13R,1,22
127768,2012-09-30,2A30,400-499,21ST,Theft,13R,1,21
127769,2012-09-30,18F30,3000-3099,HILLCROFT,Theft,18A,1,23
127770,2012-09-30,3B40,600-699,THORNTON,Theft,20A,1,19


## Now we can inspect df

In [15]:
df12.Date.unique() ## timestamp

array(['2012-04-05T00:00:00.000000000', '2012-04-04T00:00:00.000000000',
       '2012-04-01T00:00:00.000000000', '2012-04-21T00:00:00.000000000',
       '2012-04-15T00:00:00.000000000', '2012-04-11T00:00:00.000000000',
       '2012-04-28T00:00:00.000000000', '2012-04-22T00:00:00.000000000',
       '2012-04-16T00:00:00.000000000', '2012-04-30T00:00:00.000000000',
       '2012-03-30T00:00:00.000000000', '2012-04-12T00:00:00.000000000',
       '2012-04-13T00:00:00.000000000', '2012-04-29T00:00:00.000000000',
       '2012-04-23T00:00:00.000000000', '2012-03-09T00:00:00.000000000',
       '2012-03-25T00:00:00.000000000', '2012-04-07T00:00:00.000000000',
       '2012-03-23T00:00:00.000000000', '2012-03-15T00:00:00.000000000',
       '2012-02-21T00:00:00.000000000', '2012-04-10T00:00:00.000000000',
       '2012-04-03T00:00:00.000000000', '2012-04-17T00:00:00.000000000',
       '2012-04-20T00:00:00.000000000', '2012-04-14T00:00:00.000000000',
       '2012-04-09T00:00:00.000000000', '2012-04-19

In [16]:
df12.Beat.unique()  ##

array(['17E40', '3B10', '19G10', '8C10', '3B40', '6B60', '11H10', '6B50',
       '18F30', '7C20', '19G50', '7C50', '16E30', '13D20', '16E20',
       '10H10', '12D10', '10H70', '1A20', '2A10', '6B10', '14D20',
       '12D40', '14D40', '20G10', '10H40', '13D10', '17E10', '7C10',
       '5F30', '17E30', '1A10', '3B30', '19G40', '3B50', '10H60', '18F20',
       '18F50', '8C50', '11H20', '20G40', '6B30', '20G70', '14D10',
       '12D20', '20G30', '6B20', '17E20', '1A40', '11H30', '13D40',
       '15E40', '14D30', '5F20', '19G30', '9C40', '9C20', '1A30', '10H20',
       '6B40', '10H50', '2A50', '15E20', '20G60', '18F40', '18F60',
       '12D60', '8C40', '16E10', '20G80', '16E40', '11H40', '20G50',
       '7C30', '7C40', '12D30', '5F10', '20G20', '19G20', '4F20', '1A50',
       '2A30', '15E10', '10H80', '15E30', '11H50', '8C30', '14D50',
       '12D70', '4F30', '24C20', '2A60', '2A20', '9C30', '8C20', '5F40',
       '2A40', '8C60', '4F10', '24C60', 'UNK', '10H30', '18F10', '13D30',
       '9C

In [17]:
df12.Beat.value_counts(dropna=False)  # UNK : 76

12D10    2973
19G10    2832
6B60     2797
3B10     2476
13D20    2397
1A20     2336
18F20    2241
18F30    2162
18F40    2113
20G30    2091
18F50    2049
17E10    2046
1A30     2046
17E40    2033
5F30     1993
1A50     1989
15E40    1962
14D20    1952
20G50    1900
2A50     1848
6B10     1810
9C40     1798
1A10     1779
3B50     1772
7C20     1687
6B30     1674
19G40    1661
17E30    1618
20G10    1617
2A30     1602
         ... 
12D40     433
8C20      404
14D50     383
10H20     369
24C10     361
8C40      343
1A40      340
11H50     332
24C30     323
18F10     309
11H40     304
9C10      294
12D50     291
24C20     289
24C50     264
13D30     154
21I50     138
24C40     128
21I10     114
21I30      97
23J50      88
24C60      78
UNK        76
21I60      57
21I20      55
21I70      23
21I40      16
23J40       4
23J30       2
23J20       1
Name: Beat, Length: 121, dtype: int64

In [18]:
df12.BlockRange.unique()  #

array(['6100-6199', '11700-11799', '7500-7599', '6200-6299', '4200-4299',
       '1400-1499', '16800-16899', '6700-6799', '300-399', '6300-6399',
       '4000-4099', '10300-10399', '19400-19499', '4900-4999', '100-199',
       '1000-1099', '8400-8499', '4100-4199', '6000-6099', '9200-9299',
       '12400-12499', '12200-12299', '1500-1599', '11200-11299',
       '3400-3499', '3800-3899', '12000-12099', '12900-12999',
       '10800-10899', '6500-6599', '3000-3099', '2100-2199',
       '11800-11899', '7300-7399', '2600-2699', '12700-12799',
       '10500-10599', '9700-9799', '700-799', '8900-8999', '11500-11599',
       '4400-4499', '10600-10699', '500-599', '9400-9499', '1200-1299',
       '3900-3999', '3300-3399', '7000-7099', '14000-14099', '5000-5099',
       '7200-7299', '2400-2499', '8000-8099', '9900-9999', '5400-5499',
       '2300-2399', '3200-3299', '7400-7499', '13400-13499',
       '12800-12899', '5500-5599', '6800-6899', '6900-6999',
       '11600-11699', '2900-2999', '7800-7

In [19]:
df12.BlockRange.value_counts(dropna=False)  #

900-999        1828
100-199        1725
9400-9499      1681
4400-4499      1534
800-899        1503
9500-9599      1496
700-799        1481
500-599        1448
2400-2499      1400
7900-7999      1378
1000-1099      1358
1100-1199      1352
200-299        1331
1300-1399      1313
1500-1599      1304
300-399        1290
7500-7599      1275
2500-2599      1271
5800-5899      1268
600-699        1254
2700-2799      1229
5000-5099      1201
2300-2399      1183
1400-1499      1182
1200-1299      1172
400-499        1170
2600-2699      1169
2200-2299      1168
5900-5999      1154
3800-3899      1142
               ... 
23700-23799       4
22100-22199       3
23800-23899       3
19900-19999       3
19700-19799       3
19500-19599       3
23300-23399       3
25000-25099       2
24100-24199       2
21300-21399       2
21100-21199       2
25500-25599       2
23100-23199       2
72200-72299       1
27600-27699       1
74000-74099       1
23500-23599       1
24800-24899       1
27200-27299       1


In [20]:
df12.StreetName.unique()  #

array(['CLARIDGE', 'HEMPSTEAD', 'CORPORATE', ..., 'WESTWICK FOREST',
       'KELFORD', 'CLEMENTSHIRE'], dtype=object)

In [21]:
df12.StreetName.value_counts(dropna=False)  # 

WESTHEIMER                3534
GESSNER                   2000
GULF                      1909
RICHMOND                  1786
NORTH                     1738
SOUTHWEST                 1312
MAIN                      1290
WEST SAM HOUSTON          1242
BELLAIRE                  1242
KATY                      1236
BISSONNET                 1220
NORTHWEST                 1130
GREENS                    1062
BELLFORT                  1044
FONDREN                    983
BEECHNUT                   900
TIDWELL                    888
FM 1960                    877
LITTLE YORK                869
BROADWAY                   865
POST OAK                   834
EAST                       763
SHEPHERD                   688
AIRLINE                    686
KIRBY                      668
HILLCROFT                  604
FANNIN                     553
WILCREST                   547
FUQUA                      546
IMPERIAL VALLEY            529
                          ... 
GULF CREEK                   1
REO     

In [22]:
df12['Offense Type'].unique()  # good

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [23]:
df12['Offense Type'].value_counts(dropna=False) # good, that its clean not that there were 73591 thefts!! 

Theft                 67978
Burglary              26630
Auto Theft            12672
Aggravated Assault    10238
Robbery                9385
Rape                    663
Murder                  205
Name: Offense Type, dtype: int64

In [24]:
df12.Premise.unique()  # lots of weird values

array(['20R', '18A', '20A', '13R', '20D', '250', '18B', '20C', '20V',
       '01B', '20P', '140', '18O', '11R', '22M', '18N', '080', '05D',
       '24F', '02B', '24E', '05L', '24P', '05Z', '18C', '13B', '210',
       '070', '18T', '05B', '120', '09D', '03B', '100', '24A', '23S',
       '05O', '18R', '240', '24C', '02S', '18U', '18G', '22H', '18P',
       '18M', '03S', '13H', '13S', '20G', '09P', 'N', '05C', '040', '05P',
       '---', '05Q', '05R', '170', '09H', '12V', '22E', '05S', '11F',
       '18W', '060', '11G', '20M', '18D', '160', '05E', '20N', '05H',
       '22P', '05Y', '190', '25V', '05W', '22D', '24J', '24V', '20W',
       '20L', '05F', '05G', '05N', '09R', '05V', '05X', '24M', '21V',
       '11L', '19V', '24T', '11P', '18H', '18S', '01P', '01A', '24S',
       '22U', '18L', '05M', '01K', '11C', '05A', '150', '24B', '05U',
       '24G', '20H', '11S', '11V', '13A', '02C', '13T', '25R', '05T',
       '06', '01T', '14V', '22C', '04V', '22V', '09V', '13C'],
      dtype=object)

In [25]:
df12.Premise.value_counts(dropna=False)  # srip extra spaces

20A    16963
20R    16740
18A    15070
13R     9323
20D     7823
18O     6498
080     5651
18R     3697
250     3642
18P     2947
120     2870
18T     2608
210     2121
23S     1617
18G     1469
070     1454
18M     1425
18N     1225
05C     1204
240     1166
03B     1136
18C     1119
05Z     1062
140     1060
18U      956
20G      843
20P      772
05O      751
20C      699
11R      694
       ...  
19V       27
05A       27
09R       27
150       26
N         26
11C       26
25V       25
24B       24
22C       23
11F       21
11P       20
24V       19
06        18
02S       17
21V       17
04V       12
01K       12
01T       10
05Y       10
02C        9
160        8
12V        7
13T        7
25R        5
11V        5
14V        5
24M        3
09V        2
22V        2
13C        1
Name: Premise, Length: 126, dtype: int64

In [26]:
df12['# offenses'].unique()

array([1, 2, 3, 4, 8, 5, 7, 6])

In [27]:
df12['# offenses'].value_counts(dropna=False)  # 2 locations with 10 offenses?

1    126502
2      1050
3       202
4        11
5         3
8         1
7         1
6         1
Name: # offenses, dtype: int64

In [28]:
df12.Hour.unique()

array([ 2, 22, 23,  6,  1, 12, 21,  0,  8,  4,  7,  5, 13,  3, 15, 19, 16,
       10, 20, 11,  9, 17, 14, 18])

In [29]:
df12.Hour.value_counts(dropna=False)

18    7907
12    7408
22    7305
20    7210
19    7199
17    7079
0     7056
21    6885
15    6212
16    6041
23    5770
14    5692
13    5430
8     5238
10    5082
11    5058
9     4735
7     4217
1     3535
2     3349
6     2876
3     2538
4     1981
5     1968
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces :not needed

In [30]:
len(df12.Premise.unique())

126

In [31]:
df12['Premise'] = df12['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
len(df12.Premise.unique())

126

In [33]:
df12.Premise.value_counts(dropna=False)

20A    16963
20R    16740
18A    15070
13R     9323
20D     7823
18O     6498
080     5651
18R     3697
250     3642
18P     2947
120     2870
18T     2608
210     2121
23S     1617
18G     1469
070     1454
18M     1425
18N     1225
05C     1204
240     1166
03B     1136
18C     1119
05Z     1062
140     1060
18U      956
20G      843
20P      772
05O      751
20C      699
11R      694
       ...  
19V       27
05A       27
09R       27
150       26
N         26
11C       26
25V       25
24B       24
22C       23
11F       21
11P       20
24V       19
06        18
02S       17
21V       17
04V       12
01K       12
01T       10
05Y       10
02C        9
160        8
12V        7
13T        7
25R        5
11V        5
14V        5
24M        3
09V        2
22V        2
13C        1
Name: Premise, Length: 126, dtype: int64

# Cleanup
## Offense Type Column

preatty clean!


In [34]:
df12['Offense Type'].value_counts(dropna=False)

Theft                 67978
Burglary              26630
Auto Theft            12672
Aggravated Assault    10238
Robbery                9385
Rape                    663
Murder                  205
Name: Offense Type, dtype: int64

In [35]:
df12['Offense Type'].unique()

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [36]:
len(df12['Offense Type'].unique())

7

In [37]:
df12['Offense Type'] = df12['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df12['Offense Type'].value_counts(dropna=False)

Theft                 67978
Burglary              26630
Auto Theft            12672
Aggravated Assault    10238
Robbery                9385
Rape                    663
Murder                  205
Name: Offense Type, dtype: int64

In [39]:
len(df12['Offense Type'].unique())

7

# Cleanup
## StreetName Column

preatty clean

In [40]:
df12.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3534
GESSNER       2000
GULF          1909
RICHMOND      1786
NORTH         1738
Name: StreetName, dtype: int64

In [41]:
len(df12.StreetName.unique())

7553

In [42]:
df12['StreetName'] = df12['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
len(df12.StreetName.unique())

7553

In [44]:
df12[df12.StreetName.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [45]:
df12.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3534
GESSNER       2000
GULF          1909
RICHMOND      1786
NORTH         1738
Name: StreetName, dtype: int64

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [46]:
df12.BlockRange.value_counts(dropna=False).head()  # find UNK

900-999      1828
100-199      1725
9400-9499    1681
4400-4499    1534
800-899      1503
Name: BlockRange, dtype: int64

In [47]:
unk = df12.BlockRange == 'UNK'  # boolean mask

In [48]:
df12[unk]  # 0 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [49]:
df12[df12.BlockRange.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


# Cleanup
## Beat Column

preatty clean

In [50]:
df12.Beat.unique()

array(['17E40', '3B10', '19G10', '8C10', '3B40', '6B60', '11H10', '6B50',
       '18F30', '7C20', '19G50', '7C50', '16E30', '13D20', '16E20',
       '10H10', '12D10', '10H70', '1A20', '2A10', '6B10', '14D20',
       '12D40', '14D40', '20G10', '10H40', '13D10', '17E10', '7C10',
       '5F30', '17E30', '1A10', '3B30', '19G40', '3B50', '10H60', '18F20',
       '18F50', '8C50', '11H20', '20G40', '6B30', '20G70', '14D10',
       '12D20', '20G30', '6B20', '17E20', '1A40', '11H30', '13D40',
       '15E40', '14D30', '5F20', '19G30', '9C40', '9C20', '1A30', '10H20',
       '6B40', '10H50', '2A50', '15E20', '20G60', '18F40', '18F60',
       '12D60', '8C40', '16E10', '20G80', '16E40', '11H40', '20G50',
       '7C30', '7C40', '12D30', '5F10', '20G20', '19G20', '4F20', '1A50',
       '2A30', '15E10', '10H80', '15E30', '11H50', '8C30', '14D50',
       '12D70', '4F30', '24C20', '2A60', '2A20', '9C30', '8C20', '5F40',
       '2A40', '8C60', '4F10', '24C60', 'UNK', '10H30', '18F10', '13D30',
       '9C

In [51]:
len(df12.Beat.unique())

121

In [52]:
df12['Beat'] = df12['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
len(df12.Beat.unique())

121

In [54]:
df12[df12.Beat.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [55]:
df12.Beat.value_counts(dropna=False).head()

12D10    2973
19G10    2832
6B60     2797
3B10     2476
13D20    2397
Name: Beat, dtype: int64

In [56]:
df12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127771 entries, 0 to 127770
Data columns (total 8 columns):
Date            127771 non-null datetime64[ns]
Beat            127771 non-null object
BlockRange      127771 non-null object
StreetName      127771 non-null object
Offense Type    127771 non-null object
Premise         127771 non-null object
# offenses      127771 non-null int64
Hour            127771 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.8+ MB


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [57]:
df12.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2012-04-05,17E40,6100-6199,CLARIDGE,Murder,20R,1,2
1,2012-04-04,3B10,11700-11799,HEMPSTEAD,Murder,18A,1,22
2,2012-04-01,19G10,7500-7599,CORPORATE,Murder,20A,1,23
3,2012-04-21,8C10,6200-6299,RIETTA,Murder,20R,1,23
4,2012-04-01,3B10,4200-4299,34TH,Murder,20A,1,6


In [58]:
df12['Date'] = pd.to_datetime(df12['Date'])

df12 = df12.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [59]:
df12.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1973-11-16,1A30,2500-2599,RICHMOND,Theft,210,1,13
1975-05-17,4F30,1800-1899,BARKER CYPRESS,Robbery,20A,1,15
1979-07-07,18F50,3400-3499,DUNVALE,Burglary,20A,1,2
1982-01-01,19G30,13000-13099,LEADER,Rape,20A,1,15
1987-03-20,18F30,6300-6399,WINDSWEPT,Theft,05N,1,0


In [60]:
df12.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 127771 entries, 1973-11-16 to 2029-05-26
Data columns (total 7 columns):
Beat            127771 non-null object
BlockRange      127771 non-null object
StreetName      127771 non-null object
Offense Type    127771 non-null object
Premise         127771 non-null object
# offenses      127771 non-null int64
Hour            127771 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.8+ MB


## Odd dates
`DatetimeIndex: 130459 entries, 1963-02-02 to 2033-04-21
`
- some values are not from this year, lets look

In [61]:
df2012 = df12.loc['2012-01-01':'2012-12-31']  # rows with date from 01,01,12 - 12,31,12

In [62]:
df2012_wrong_date = df12[:"2011"]  # rows with year 0  upto 2012
df2012_wrong_date.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1567 entries, 1973-11-16 to 2011-12-31
Data columns (total 7 columns):
Beat            1567 non-null object
BlockRange      1567 non-null object
StreetName      1567 non-null object
Offense Type    1567 non-null object
Premise         1567 non-null object
# offenses      1567 non-null int64
Hour            1567 non-null int64
dtypes: int64(2), object(5)
memory usage: 97.9+ KB


In [63]:
df2012.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 126198 entries, 2012-01-01 to 2012-12-31
Data columns (total 7 columns):
Beat            126198 non-null object
BlockRange      126198 non-null object
StreetName      126198 non-null object
Offense Type    126198 non-null object
Premise         126198 non-null object
# offenses      126198 non-null int64
Hour            126198 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.7+ MB


# NAN values

- Beat: 0
- BlockRange: 0
- StreetName: 0
- Offense Type: 0
- Premise: 0
- Hour: 0

In [64]:
beat_nan = df2012.Beat.isnull()
block_nan = df2012.BlockRange.isnull()
str_nan = df2012.StreetName.isnull()
off_nan = df2012['Offense Type'].isnull()
premise_nan = df2012.Premise.isnull()  #
hour_nan = df2012.Hour.isnull()  #

In [65]:
df2012[hour_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


## Save clean data to  to csv

In [66]:
df2012.to_csv('crime_data_clean/crime12_clean.csv')

In [67]:
ls crime_data_clean/

[0m[01;32mcrime12_clean.csv[0m*  [01;32mcrime14_clean.csv[0m*  [01;32mcrime16_clean.csv[0m*
[01;32mcrime13_clean.csv[0m*  [01;32mcrime15_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*
