In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data_raw/2016

[0m[01;32mapr16.xls[0m*  [01;32mdec16.xls[0m*  [01;32mjan16.xls[0m*  [01;32mjun16.xls[0m*  [01;32mmay16.xls[0m*  [01;32moct16.xls[0m*
[01;32maug16.xls[0m*  [01;32mfeb16.xls[0m*  [01;32mjul16.xls[0m*  [01;32mmar16.xls[0m*  [01;32mnov16.xls[0m*  [01;32msep16.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data_raw/2016'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122693 entries, 0 to 122692
Data columns (total 10 columns):
Date            122693 non-null datetime64[ns]
Hour            122693 non-null int64
Offense Type    122693 non-null object
Beat            122693 non-null object
Premise         121293 non-null object
BlockRange      122693 non-null object
StreetName      122693 non-null object
Type            122693 non-null object
Suffix          122693 non-null object
# offenses      122693 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.4+ MB


In [5]:
df.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,BlockRange,StreetName,Type,Suffix,# offenses
0,2016-04-10,20,Robbery,10H10,"Road, Street, or Sidewalk",4000-4099,MILBY,-,-,1
1,2016-04-11,19,Aggravated Assault,10H10,"Road, Street, or Sidewalk",400-499,YORK,-,-,2
2,2016-04-12,20,Robbery,10H10,Apartment,1900-1999,RUNNELS,-,-,1
3,2016-04-13,2,Auto Theft,10H10,Driveway,100-199,SIDNEY,-,-,1
4,2016-04-14,3,Burglary,10H10,Service or Gas Station,3300-3399,CANAL,ST,-,1


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'BlockRange',
       'StreetName', 'Type', 'Suffix', '# offenses'],
      dtype='object')

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122693 entries, 0 to 122692
Data columns (total 10 columns):
Date            122693 non-null datetime64[ns]
Hour            122693 non-null int64
Offense Type    122693 non-null object
Beat            122693 non-null object
Premise         121293 non-null object
BlockRange      122693 non-null object
StreetName      122693 non-null object
Type            122693 non-null object
Suffix          122693 non-null object
# offenses      122693 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.4+ MB


In [9]:
df1.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,BlockRange,StreetName,Type,Suffix,# offenses
0,2016-04-10,20,Robbery,10H10,"Road, Street, or Sidewalk",4000-4099,MILBY,-,-,1
1,2016-04-11,19,Aggravated Assault,10H10,"Road, Street, or Sidewalk",400-499,YORK,-,-,2
2,2016-04-12,20,Robbery,10H10,Apartment,1900-1999,RUNNELS,-,-,1
3,2016-04-13,2,Auto Theft,10H10,Driveway,100-199,SIDNEY,-,-,1
4,2016-04-14,3,Burglary,10H10,Service or Gas Station,3300-3399,CANAL,ST,-,1


## create a subdataframe with the columns that we want

In [10]:
df16 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [11]:
df16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122693 entries, 0 to 122692
Data columns (total 8 columns):
Date            122693 non-null datetime64[ns]
Beat            122693 non-null object
BlockRange      122693 non-null object
StreetName      122693 non-null object
Offense Type    122693 non-null object
Premise         121293 non-null object
# offenses      122693 non-null int64
Hour            122693 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.5+ MB


In [12]:
df16.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
122688,2016-09-14,UNK,7900-7999,FM 1960,Theft,Other Parking Lot,1,14
122689,2016-09-14,UNK,700-799,RICHCREST,Auto Theft,"Other, Unknown, or Not Listed",1,16
122690,2016-09-13,UNK,8300-8399,FM 1960,Theft,Hotel or Motel Parking Lot,1,20
122691,2016-09-13,UNK,17100-17199,EASTEX,Theft,Construction Site,1,7
122692,2016-09-30,UNK,11900-11999,GLEN BAY,Auto Theft,"Road, Street, or Sidewalk",1,15


## Now we can inspect df

In [13]:
df16.Date.unique() # timestamp 

array(['2016-04-10T00:00:00.000000000', '2016-04-11T00:00:00.000000000',
       '2016-04-12T00:00:00.000000000', '2016-04-13T00:00:00.000000000',
       '2016-04-14T00:00:00.000000000', '2016-04-15T00:00:00.000000000',
       '2016-04-16T00:00:00.000000000', '2016-04-18T00:00:00.000000000',
       '2016-04-20T00:00:00.000000000', '2016-02-09T00:00:00.000000000',
       '2016-03-11T00:00:00.000000000', '2016-03-27T00:00:00.000000000',
       '2016-03-28T00:00:00.000000000', '2016-03-30T00:00:00.000000000',
       '2016-03-31T00:00:00.000000000', '2016-04-01T00:00:00.000000000',
       '2016-04-03T00:00:00.000000000', '2016-04-05T00:00:00.000000000',
       '2016-04-07T00:00:00.000000000', '2016-04-08T00:00:00.000000000',
       '2016-04-09T00:00:00.000000000', '2016-04-22T00:00:00.000000000',
       '2016-04-24T00:00:00.000000000', '2016-04-26T00:00:00.000000000',
       '2016-04-28T00:00:00.000000000', '2016-04-30T00:00:00.000000000',
       '2016-04-02T00:00:00.000000000', '2016-04-29

In [14]:
df16.Beat.unique()  # seems clean

array(['10H10', '10H20', '10H30', '10H40', '10H50', '10H60', '10H70',
       '10H80', '11H10', '11H20', '11H30', '11H40', '11H50', '12D10',
       '12D20', '12D30', '12D40', '12D50', '12D60', '12D70', '13D10',
       '13D20', '13D30', '13D40', '14D10', '14D20', '14D30', '14D40',
       '14D50', '15E10', '15E20', '15E30', '15E40', '16E10', '16E20',
       '16E30', '16E40', '17E10', '17E20', '17E30', '17E40', '18F10',
       '18F20', '18F30', '18F40', '18F50', '18F60', '19G10', '19G20',
       '19G30', '19G40', '19G50', '1A10', '1A20', '1A30', '1A40', '1A50',
       '20G10', '20G20', '20G30', '20G40', '20G50', '20G60', '20G70',
       '20G80', '21I10', '21I30', '21I40', '21I50', '21I60', '23J50',
       '24C10', '24C20', '24C30', '24C40', '24C50', '24C60', '2A10',
       '2A20', '2A30', '2A40', '2A50', '2A60', '3B10', '3B30', '3B40',
       '3B50', '4F10', '4F20', '4F30', '5F10', '5F20', '5F30', '5F40',
       '6B10', '6B20', '6B30', '6B40', '6B50', '6B60', '7C10', '7C20',
       '7C30',

In [15]:
df16.Beat.value_counts(dropna=False)  # seems clean

1A20     3049
12D10    2576
19G10    2386
1A30     2273
2A50     2259
6B60     2242
17E10    2167
18F20    2132
13D20    2105
3B10     2103
15E40    2099
5F30     2076
18F40    2010
14D20    1959
20G50    1951
1A50     1921
18F30    1902
20G30    1899
1A10     1837
17E40    1798
20G10    1750
2A30     1686
11H10    1673
5F40     1653
7C20     1640
18F50    1616
6B30     1541
6B10     1511
9C40     1500
3B50     1495
         ... 
12D60     475
7C40      446
12D40     431
14D50     409
8C20      393
21I50     358
24C50     348
8C40      331
24C10     307
24C30     302
11H50     301
24C20     285
9C10      284
10H20     270
12D50     232
21I10     230
18F10     225
11H40     218
1A40      206
24C60     169
23J50     139
24C40     124
13D30     113
21I60      50
21I40      18
21I30      14
21I20       8
21I70       4
UH-3P       3
23J40       2
Name: Beat, Length: 120, dtype: int64

In [16]:
df16.BlockRange.unique()  # UNK val

array(['4000-4099', '400-499', '1900-1999', '100-199', '3300-3399',
       '200-299', '3400-3499', '5000-5099', '500-599', '3800-3899', 'UNK',
       '300-399', '2200-2299', '3100-3199', '900-999', '4700-4799',
       '2100-2199', '2800-2899', '4200-4299', '4500-4599', '1200-1299',
       '4100-4199', '4600-4699', '5400-5499', '1400-1499', '5200-5299',
       '2300-2399', '2000-2099', '800-899', '3500-3599', '1500-1599',
       '1100-1199', '2500-2599', '1700-1799', '2900-2999', '2400-2499',
       '1000-1099', '3200-3299', '2700-2799', '700-799', '3900-3999',
       '1600-1699', '1300-1399', '4400-4499', '2600-2699', '3600-3699',
       '3000-3099', '4300-4399', '3700-3799', '4800-4899', '5500-5599',
       '6200-6299', '5100-5199', '6300-6399', '6400-6499', '5900-5999',
       '4900-4999', '5300-5399', '6500-6599', '6000-6099', '5600-5699',
       '6600-6699', '6700-6799', '7000-7099', '7100-7199', '1800-1899',
       '6100-6199', '6900-6999', '7300-7399', '5700-5799', '7400-7499',
 

In [17]:
df16.BlockRange.value_counts(dropna=False)  # 1412 UNK values

2300-2399        1752
100-199          1707
900-999          1582
2400-2499        1529
1000-1099        1484
2500-2599        1480
800-899          1477
300-399          1444
4400-4499        1412
UNK              1412
700-799          1376
200-299          1307
1300-1399        1303
1500-1599        1281
1400-1499        1275
500-599          1248
2700-2799        1234
7900-7999        1204
2100-2199        1174
9400-9499        1171
1100-1199        1170
5000-5099        1160
9500-9599        1151
2000-2099        1135
3800-3899        1130
5400-5499        1117
5800-5899        1113
2600-2699        1104
1200-1299        1104
2800-2899        1095
                 ... 
20000-20099         4
18900-18999         4
23300-23399         4
19900-19999         4
24600-24699         4
22000-22099         4
23600-23699         3
20200-20299         2
23800-23899         2
22100-22199         2
24200-24299         2
21800-21899         2
24300-24399         2
20400-20499         1
29800-2989

In [18]:
df16.StreetName.unique()  # 

array(['MILBY', 'YORK', 'RUNNELS', ..., 'BRYSTONE', 'RICHCREST',
       'GLEN BAY'], dtype=object)

In [19]:
df16.StreetName.value_counts(dropna=False)  # seems clean

WESTHEIMER               3768
SAM HOUSTON              1792
GULF                     1785
KATY                     1696
GESSNER                  1680
RICHMOND                 1667
NORTH                    1663
LOOP                     1471
SOUTHWEST                1260
MAIN                     1207
BISSONNET                1165
NORTHWEST                1140
BELLFORT                 1060
BELLAIRE                  996
POST OAK                  956
BEECHNUT                  949
SHEPHERD                  860
FONDREN                   836
FM 1960                   834
GREENS                    812
TIDWELL                   808
WAYSIDE                   750
BROADWAY                  719
LITTLE YORK               682
FANNIN                    646
KIRBY                     613
WILCREST                  605
EAST                      604
FUQUA                     592
AIRPORT                   584
                         ... 
VILLA PINES                 1
GLEN DELL                   1
WOOD RIVER

In [20]:
df16['Offense Type'].unique()  # lots of extra space

array(['Robbery', 'Aggravated Assault', 'Auto Theft', 'Burglary', 'Theft',
       'Murder', 'Rape', 1], dtype=object)

In [21]:
df16['Offense Type'].value_counts(dropna=False) #4 results of value 1

Theft                 69219
Burglary              18442
Auto Theft            12690
Aggravated Assault    10866
Robbery                9955
Rape                   1232
Murder                  285
1                         4
Name: Offense Type, dtype: int64

In [22]:
df16.Premise.unique()  # seems clean

array(['Road, Street, or Sidewalk', 'Apartment', 'Driveway',
       'Service or Gas Station', 'Apartment Parking Lot',
       'Residence or House', 'Other Parking Lot',
       'Multi-Plex Home (Duplex,Triplex Etc.)', 'Sexually Oriented Club',
       'Church, Synagogue, or Temple',
       'Miscellaneous Business (Non-Specific)',
       'Department or Discount Store', 'Commercial Parking Lot or Garage',
       'Fire Station', 'Body Shop', 'Garage or Carport', 'Bus Stop',
       'Vacant Hotel, Motel, Etc.', 'Grocery Store or Supermarket',
       'Bar or Night Club', 'Other, Unknown, or Not Listed',
       'Bar or Night Club Parking Lot',
       'Vacant Other Residential (Apartment,Dorms)',
       'Vacant Storage Fac (Barn,Garage,Warehouse)', 'Hospital',
       'Convenience Store', 'Restaurant or Cafeteria Parking Lot',
       'Grocery Store or Supermarket Parking Lot',
       'Light Rail (Metro Rail) Vehicle',
       'Stadium, Sports Arena, Race Track',
       'Vacant Building (Commercial

In [23]:
df16.Premise.value_counts(dropna=False)  # 1400 NAN values

Residence or House                           17013
Apartment Parking Lot                        13931
Road, Street, or Sidewalk                    12857
Apartment                                    11923
Restaurant or Cafeteria Parking Lot           5034
Driveway                                      4611
Other Parking Lot                             4427
Department or Discount Store                  4077
Miscellaneous Business (Non-Specific)         3223
Commercial Parking Lot or Garage              3208
Service or Gas Station                        2869
Strip Business Center Parking Lot             2320
Other, Unknown, or Not Listed                 2319
Grocery Store or Supermarket                  2207
Hotel or Motel Parking Lot                    1975
Convenience Store                             1879
Restaurant or Cafeteria                       1827
Grocery Store or Supermarket Parking Lot      1695
NaN                                           1400
Bar or Night Club Parking Lot  

In [24]:
df16['# offenses'].unique()

array([ 1,  2,  4,  5,  3,  6,  8, 13,  7, 10])

In [25]:
df16['# offenses'].value_counts(dropna=False)  # seems ok

1     120640
2       1741
3        212
4         63
5         21
6         11
13         2
10         1
8          1
7          1
Name: # offenses, dtype: int64

In [26]:
df16.Hour.unique()

array([20, 19,  2,  3, 11, 14, 15, 22,  9, 12, 10, 18,  5, 17, 13, 21,  8,
        6,  7, 23, 16,  0,  1,  4])

In [27]:
df16.Hour.value_counts(dropna=False)  # 2-3 pm is the most common time for offenses

15    7654
16    7508
14    7320
13    7154
11    7093
10    7075
9     7036
12    7026
17    6661
8     6347
18    5994
19    5509
20    5324
7     5178
21    4791
22    4458
23    3879
6     3322
0     2973
1     2400
2     2355
5     1965
3     1925
4     1746
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces NOT NEEDED
- find nan values :1400

In [28]:
len(df16.Premise.unique())

126

In [29]:
df16['Premise'] = df16['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
len(df16.Premise.unique())

126

In [31]:
df16.Premise.value_counts(dropna=False)

Residence or House                           17013
Apartment Parking Lot                        13931
Road, Street, or Sidewalk                    12857
Apartment                                    11923
Restaurant or Cafeteria Parking Lot           5034
Driveway                                      4611
Other Parking Lot                             4427
Department or Discount Store                  4077
Miscellaneous Business (Non-Specific)         3223
Commercial Parking Lot or Garage              3208
Service or Gas Station                        2869
Strip Business Center Parking Lot             2320
Other, Unknown, or Not Listed                 2319
Grocery Store or Supermarket                  2207
Hotel or Motel Parking Lot                    1975
Convenience Store                             1879
Restaurant or Cafeteria                       1827
Grocery Store or Supermarket Parking Lot      1695
NaN                                           1400
Bar or Night Club Parking Lot  

In [32]:
df16[df16.Premise.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
176,2016-04-23,10H40,1000-1099,ELGIN,Theft,,1,19
309,2016-04-02,10H60,5900-5999,SCOTT,Theft,,1,9
326,2016-04-27,10H60,2300-2399,WICHITA,Burglary,,1,5
485,2016-04-10,10H80,1100-1199,AUTREY,Theft,,1,10
670,2016-04-26,11H10,7400-7499,SATSUMA,Aggravated Assault,,1,15


# Cleanup
## Offense Type Column

- strip empty spaces  NOT NEEDED
- join similar values  NOT NEEDED
- find nan values

In [33]:
df16['Offense Type'].value_counts(dropna=False)

Theft                 69219
Burglary              18442
Auto Theft            12690
Aggravated Assault    10866
Robbery                9955
Rape                   1232
Murder                  285
1                         4
Name: Offense Type, dtype: int64

In [34]:
df16['Offense Type'].unique()

array(['Robbery', 'Aggravated Assault', 'Auto Theft', 'Burglary', 'Theft',
       'Murder', 'Rape', 1], dtype=object)

In [35]:
len(df16['Offense Type'].unique())

8

In [36]:
df16['Offense Type'] = df16['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [37]:
df16['Offense Type'].value_counts(dropna=False)

Theft                 69219
Burglary              18442
Auto Theft            12690
Aggravated Assault    10866
Robbery                9955
Rape                   1232
Murder                  285
NaN                       4
Name: Offense Type, dtype: int64

In [38]:
df16[df16['Offense Type'].isnull()]  # nan values, will fix later, they used to be value 1

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
39979,2015-11-17,8C50,8600-8699,STERLINGSHIRE,,Apartment,1,0
51910,2016-07-15,12D20,11200-11299,SAGELAND,,Residence or House,1,0
59165,2016-07-02,5F40,11600-11699,GESSNER,,Residence or House,1,14
92438,2015-06-04,11H10,5600-5699,LEELAND,,Apartment,1,20


# Cleanup
## StreetName Column

- strip empty spaces  : NOT NEEDED
- find similar values and combine ( needs done)

In [39]:
df16.StreetName.value_counts(dropna=False)

WESTHEIMER               3768
SAM HOUSTON              1792
GULF                     1785
KATY                     1696
GESSNER                  1680
RICHMOND                 1667
NORTH                    1663
LOOP                     1471
SOUTHWEST                1260
MAIN                     1207
BISSONNET                1165
NORTHWEST                1140
BELLFORT                 1060
BELLAIRE                  996
POST OAK                  956
BEECHNUT                  949
SHEPHERD                  860
FONDREN                   836
FM 1960                   834
GREENS                    812
TIDWELL                   808
WAYSIDE                   750
BROADWAY                  719
LITTLE YORK               682
FANNIN                    646
KIRBY                     613
WILCREST                  605
EAST                      604
FUQUA                     592
AIRPORT                   584
                         ... 
VILLA PINES                 1
GLEN DELL                   1
WOOD RIVER

In [40]:
len(df16.StreetName.unique())

7844

In [41]:
df16['StreetName'] = df16['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [42]:
len(df16.StreetName.unique())

7844

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [43]:
df16.BlockRange.value_counts(dropna=False) # find UNK: 1412

2300-2399        1752
100-199          1707
900-999          1582
2400-2499        1529
1000-1099        1484
2500-2599        1480
800-899          1477
300-399          1444
4400-4499        1412
UNK              1412
700-799          1376
200-299          1307
1300-1399        1303
1500-1599        1281
1400-1499        1275
500-599          1248
2700-2799        1234
7900-7999        1204
2100-2199        1174
9400-9499        1171
1100-1199        1170
5000-5099        1160
9500-9599        1151
2000-2099        1135
3800-3899        1130
5400-5499        1117
5800-5899        1113
2600-2699        1104
1200-1299        1104
2800-2899        1095
                 ... 
20000-20099         4
18900-18999         4
23300-23399         4
19900-19999         4
24600-24699         4
22000-22099         4
23600-23699         3
20200-20299         2
23800-23899         2
22100-22199         2
24200-24299         2
21800-21899         2
24300-24399         2
20400-20499         1
29800-2989

In [44]:
unk = df16.BlockRange == 'UNK'  # boolean mask

In [45]:
df16[unk]  # 1412 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
13,2016-03-27,10H10,UNK,2 NAGLE,Theft,Residence or House,1,19
16,2016-03-31,10H10,UNK,GREENWOOD,Theft,Driveway,1,10
33,2016-04-28,10H10,UNK,HUNT,Burglary,Garage or Carport,1,20
58,2016-04-15,10H30,UNK,GULF,Robbery,"Road, Street, or Sidewalk",1,8
373,2016-04-19,10H70,UNK,BAYLOR PLAZA,Theft,University or College,1,18
381,2016-04-02,10H70,UNK,HERMANN PARK,Burglary,Apartment,1,20
384,2016-04-03,10H70,UNK,BAYLOR PLAZA,Theft,"Other, Unknown, or Not Listed",1,15
419,2016-04-09,10H70,UNK,HERMANN PARK,Burglary,Apartment,1,17
512,2016-03-21,10H80,UNK,1604.5 W. BELLFORT,Theft,Office Building,1,17
732,2016-04-21,11H30,UNK,8011.5 BENDELL DR,Burglary,"Other, Unknown, or Not Listed",1,7


# Cleanup
## Beat Column

- strip empty spaces  : NOT NEEDED

In [46]:
df16.Beat.unique()

array(['10H10', '10H20', '10H30', '10H40', '10H50', '10H60', '10H70',
       '10H80', '11H10', '11H20', '11H30', '11H40', '11H50', '12D10',
       '12D20', '12D30', '12D40', '12D50', '12D60', '12D70', '13D10',
       '13D20', '13D30', '13D40', '14D10', '14D20', '14D30', '14D40',
       '14D50', '15E10', '15E20', '15E30', '15E40', '16E10', '16E20',
       '16E30', '16E40', '17E10', '17E20', '17E30', '17E40', '18F10',
       '18F20', '18F30', '18F40', '18F50', '18F60', '19G10', '19G20',
       '19G30', '19G40', '19G50', '1A10', '1A20', '1A30', '1A40', '1A50',
       '20G10', '20G20', '20G30', '20G40', '20G50', '20G60', '20G70',
       '20G80', '21I10', '21I30', '21I40', '21I50', '21I60', '23J50',
       '24C10', '24C20', '24C30', '24C40', '24C50', '24C60', '2A10',
       '2A20', '2A30', '2A40', '2A50', '2A60', '3B10', '3B30', '3B40',
       '3B50', '4F10', '4F20', '4F30', '5F10', '5F20', '5F30', '5F40',
       '6B10', '6B20', '6B30', '6B40', '6B50', '6B60', '7C10', '7C20',
       '7C30',

In [47]:
len(df16.Beat.unique())

120

In [48]:
df16['Beat'] = df16['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [49]:
len(df16.Beat.unique())

120

In [50]:
df16.Beat.value_counts(dropna=False)

1A20     3049
12D10    2576
19G10    2386
1A30     2273
2A50     2259
6B60     2242
17E10    2167
18F20    2132
13D20    2105
3B10     2103
15E40    2099
5F30     2076
18F40    2010
14D20    1959
20G50    1951
1A50     1921
18F30    1902
20G30    1899
1A10     1837
17E40    1798
20G10    1750
2A30     1686
11H10    1673
5F40     1653
7C20     1640
18F50    1616
6B30     1541
6B10     1511
9C40     1500
3B50     1495
         ... 
12D60     475
7C40      446
12D40     431
14D50     409
8C20      393
21I50     358
24C50     348
8C40      331
24C10     307
24C30     302
11H50     301
24C20     285
9C10      284
10H20     270
12D50     232
21I10     230
18F10     225
11H40     218
1A40      206
24C60     169
23J50     139
24C40     124
13D30     113
21I60      50
21I40      18
21I30      14
21I20       8
21I70       4
UH-3P       3
23J40       2
Name: Beat, Length: 120, dtype: int64

In [51]:
df16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122693 entries, 0 to 122692
Data columns (total 8 columns):
Date            122693 non-null datetime64[ns]
Beat            122693 non-null object
BlockRange      122693 non-null object
StreetName      122655 non-null object
Offense Type    122689 non-null object
Premise         121293 non-null object
# offenses      122693 non-null int64
Hour            122693 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.5+ MB


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [52]:
df16.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2016-04-10,10H10,4000-4099,MILBY,Robbery,"Road, Street, or Sidewalk",1,20
1,2016-04-11,10H10,400-499,YORK,Aggravated Assault,"Road, Street, or Sidewalk",2,19
2,2016-04-12,10H10,1900-1999,RUNNELS,Robbery,Apartment,1,20
3,2016-04-13,10H10,100-199,SIDNEY,Auto Theft,Driveway,1,2
4,2016-04-14,10H10,3300-3399,CANAL,Burglary,Service or Gas Station,1,3


In [53]:
df16['Date'] = pd.to_datetime(df16['Date'])

df16 = df16.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
df16.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1916-05-23,10H70,UNK,LIBERTY ROAD,Aggravated Assault,Residence or House,1,19
1916-08-06,6B30,5800-5899,HOUSTON ROSSLYN,Burglary,Rental Storage Facility,1,9
1916-10-22,18F20,5300-5399,ALABAMA,Theft,Commercial Parking Lot or Garage,1,9
1916-10-22,19G40,8400-8499,COOK,Burglary,"Vacant Storage Fac (Barn,Garage,Warehouse)",1,15
1916-10-22,20G30,2900-2999,HAYES,Theft,Apartment,1,13


In [55]:
df16.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 122693 entries, 1916-05-23 to 2016-12-31
Data columns (total 7 columns):
Beat            122693 non-null object
BlockRange      122693 non-null object
StreetName      122655 non-null object
Offense Type    122689 non-null object
Premise         121293 non-null object
# offenses      122693 non-null int64
Hour            122693 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.5+ MB


## Odd dates
`DatetimeIndex: 122693 entries, 1916-05-23 to 2016-12-31`

- some values are not from this year, lets look

In [56]:
df2016 = df16.loc['2016-01-01':'2016-12-31']  # rows with date from 01,01,16 - 12,31,16

In [57]:
df2016_wrong_date = df16[:"2015"]  # rows with year 0  upto 2016

In [58]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 121421 entries, 2016-01-01 to 2016-12-31
Data columns (total 7 columns):
Beat            121421 non-null object
BlockRange      121421 non-null object
StreetName      121383 non-null object
Offense Type    121419 non-null object
Premise         120072 non-null object
# offenses      121421 non-null int64
Hour            121421 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.4+ MB


In [59]:
df2016_wrong_date.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1272 entries, 1916-05-23 to 2015-12-31
Data columns (total 7 columns):
Beat            1272 non-null object
BlockRange      1272 non-null object
StreetName      1272 non-null object
Offense Type    1270 non-null object
Premise         1221 non-null object
# offenses      1272 non-null int64
Hour            1272 non-null int64
dtypes: int64(2), object(5)
memory usage: 79.5+ KB


In [60]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 121421 entries, 2016-01-01 to 2016-12-31
Data columns (total 7 columns):
Beat            121421 non-null object
BlockRange      121421 non-null object
StreetName      121383 non-null object
Offense Type    121419 non-null object
Premise         120072 non-null object
# offenses      121421 non-null int64
Hour            121421 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.4+ MB


# NAN values

- Premise nan : 1349 rows
- streetName nan : 38 rows
- Offense Type nan : 2

In [61]:
premise_nan = df2016.Premise.isnull()  #
str_nan = df2016.StreetName.isnull()
off_nan = df2016['Offense Type'].isnull()

## DROP nan
drop nan values of StreetName

In [62]:
df2016.head()

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01,4F30,1800-1899,BARKER CYPRESS,Theft,Apartment Parking Lot,1,13
2016-01-01,16E20,14400-14499,MORENO,Aggravated Assault,Residence or House,1,11
2016-01-01,10H80,2000-2099,SOUTHWEST,Burglary,Department or Discount Store,1,7
2016-01-01,16E20,2600-2699,TIDEWATER,Auto Theft,Driveway,1,20
2016-01-01,8C60,7600-7699,YOE,Theft,"Other, Unknown, or Not Listed",1,8


In [63]:
df2016 = df2016.dropna(subset=['StreetName'])  # drop nan values from StreetName, 38 rows

In [64]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 121383 entries, 2016-01-01 to 2016-12-31
Data columns (total 7 columns):
Beat            121383 non-null object
BlockRange      121383 non-null object
StreetName      121383 non-null object
Offense Type    121381 non-null object
Premise         120034 non-null object
# offenses      121383 non-null int64
Hour            121383 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.4+ MB


In [65]:
df2016['Premise'] = df2016['Premise'].fillna('unk')

In [66]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 121383 entries, 2016-01-01 to 2016-12-31
Data columns (total 7 columns):
Beat            121383 non-null object
BlockRange      121383 non-null object
StreetName      121383 non-null object
Offense Type    121381 non-null object
Premise         121383 non-null object
# offenses      121383 non-null int64
Hour            121383 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.4+ MB


In [67]:
df2016['Offense Type'].value_counts(dropna=False)

Theft                 68472
Burglary              18295
Auto Theft            12578
Aggravated Assault    10794
Robbery                9929
Rape                   1040
Murder                  273
NaN                       2
Name: Offense Type, dtype: int64

## Fillna
will use this since Dont know what to do... :/

In [68]:
df2016['Offense Type'].fillna(method='ffill', inplace=True)

In [69]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 121383 entries, 2016-01-01 to 2016-12-31
Data columns (total 7 columns):
Beat            121383 non-null object
BlockRange      121383 non-null object
StreetName      121383 non-null object
Offense Type    121383 non-null object
Premise         121383 non-null object
# offenses      121383 non-null int64
Hour            121383 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.4+ MB


## Save clean data to  to csv

In [70]:
df2016.to_csv('crime_data_clean/crime16_clean.csv')

In [71]:
ls crime_data_clean/

[0m[01;32mcrime16_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*
