In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [3]:
ls crime_data/2017

[0m[01;32mapr17.xls[0m*       [01;32mdec17.xls[0m*  [01;32mjan1.csv[0m*   [01;32mmar17.xls[0m*  [01;32moct17.xls[0m*
[01;32maug17.xls[0m*       [01;32mfeb17.xls[0m*  [01;32mjul17.xls[0m*  [01;32mmay17.xls[0m*  [01;32msep17.xls[0m*
[01;32mclean_2017.csv[0m*  [01;32mjan17.xls[0m*  [01;32mjun17.xls[0m*  [01;32mnov17.xls[0m*


## combine all files into one dataframe

In [4]:
path = 'crime_data/2017'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)



In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119727 entries, 0 to 119726
Data columns (total 13 columns):
# offenses      40378 non-null float64
Beat            119727 non-null object
Block Range     79349 non-null object
BlockRange      40378 non-null object
Date            119727 non-null object
Hour            119727 non-null int64
Offense Type    119727 non-null object
Offenses        79349 non-null float64
Premise         119240 non-null object
Street Name     79349 non-null object
StreetName      40378 non-null object
Suffix          119727 non-null object
Type            119727 non-null object
dtypes: float64(2), int64(1), object(10)
memory usage: 11.9+ MB


In [6]:
df.head(5)

Unnamed: 0,# offenses,Beat,Block Range,BlockRange,Date,Hour,Offense Type,Offenses,Premise,Street Name,StreetName,Suffix,Type
0,1.0,10H10,,200-299,2017-04-10 00:00:00,15,Burglary,,Residence or House,,CLIFTON,-,-
1,1.0,10H10,,2300-2399,2017-04-11 00:00:00,15,Theft,,Restaurant or Cafeteria Parking Lot,,CANAL,-,ST
2,1.0,10H10,,2300-2399,2017-04-11 00:00:00,17,Theft,,Restaurant or Cafeteria Parking Lot,,CANAL,-,ST
3,1.0,10H10,,4600-4699,2017-04-12 00:00:00,9,Burglary,,Miscellaneous Business (Non-Specific),,CANAL,-,ST
4,1.0,10H10,,100-199,2017-04-12 00:00:00,19,Theft,,"Other, Unknown, or Not Listed",,ADAM,-,LN


## Lets create a copy

In [7]:
df1 = df.copy()

In [8]:
df1.columns

Index(['# offenses', 'Beat', 'Block Range', 'BlockRange', 'Date', 'Hour',
       'Offense Type', 'Offenses', 'Premise', 'Street Name', 'StreetName',
       'Suffix', 'Type'],
      dtype='object')

## Duplicate columns
We have duplicate columns with diffrent names
- `Block Range` & `BlockRange`
- `StreetName`  & `Street Name`
- `Offenses`  & `# offenses`

lets concat the columns remove nan values and reindex

In [9]:
df1['BlockRange'] = pd.concat([df1['Block Range'].dropna(), df1['BlockRange'].dropna()]).reindex_like(df)
df1['StreetName'] = pd.concat([df1['Street Name'].dropna(), df1['StreetName'].dropna()]).reindex_like(df)
df1['Offenses'] = pd.concat([df1['# offenses'].dropna(), df1['Offenses'].dropna()]).reindex_like(df)

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119727 entries, 0 to 119726
Data columns (total 13 columns):
# offenses      40378 non-null float64
Beat            119727 non-null object
Block Range     79349 non-null object
BlockRange      119727 non-null object
Date            119727 non-null object
Hour            119727 non-null int64
Offense Type    119727 non-null object
Offenses        119727 non-null float64
Premise         119240 non-null object
Street Name     79349 non-null object
StreetName      119727 non-null object
Suffix          119727 non-null object
Type            119727 non-null object
dtypes: float64(2), int64(1), object(10)
memory usage: 11.9+ MB


In [11]:
df1.head(5)

Unnamed: 0,# offenses,Beat,Block Range,BlockRange,Date,Hour,Offense Type,Offenses,Premise,Street Name,StreetName,Suffix,Type
0,1.0,10H10,,200-299,2017-04-10 00:00:00,15,Burglary,1.0,Residence or House,,CLIFTON,-,-
1,1.0,10H10,,2300-2399,2017-04-11 00:00:00,15,Theft,1.0,Restaurant or Cafeteria Parking Lot,,CANAL,-,ST
2,1.0,10H10,,2300-2399,2017-04-11 00:00:00,17,Theft,1.0,Restaurant or Cafeteria Parking Lot,,CANAL,-,ST
3,1.0,10H10,,4600-4699,2017-04-12 00:00:00,9,Burglary,1.0,Miscellaneous Business (Non-Specific),,CANAL,-,ST
4,1.0,10H10,,100-199,2017-04-12 00:00:00,19,Theft,1.0,"Other, Unknown, or Not Listed",,ADAM,-,LN


## create a subdataframe with the columns that we want

In [12]:
df17 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','Offenses','Hour']]

In [13]:
df17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119727 entries, 0 to 119726
Data columns (total 8 columns):
Date            119727 non-null object
Beat            119727 non-null object
BlockRange      119727 non-null object
StreetName      119727 non-null object
Offense Type    119727 non-null object
Premise         119240 non-null object
Offenses        119727 non-null float64
Hour            119727 non-null int64
dtypes: float64(1), int64(1), object(6)
memory usage: 7.3+ MB


In [14]:
df17.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
119722,09/29/2017,UNK,12600-12699,STAFFORD RD,Robbery,"Road, Street, or Sidewalk",1.0,1
119723,09/30/2017,UNK,10600-10699,BROOKLET,Theft,"Road, Street, or Sidewalk",1.0,13
119724,09/30/2017,UNK,12300-12399,WESTHEIMER RD,Robbery,Restaurant or Cafeteria Parking Lot,1.0,19
119725,09/30/2017,UNK,13200-13299,BEECHNUT ST,Aggravated Assault,Highway or Freeway,1.0,17
119726,11/10/2015,UNK,4900-4999,TEALGATE,Rape,Residence or House,1.0,22


## Now we can inspect df

In [15]:
df17.Date.unique() ## mixture of timestamp and date string

array([Timestamp('2017-04-10 00:00:00'), Timestamp('2017-04-11 00:00:00'),
       Timestamp('2017-04-12 00:00:00'), Timestamp('2017-04-14 00:00:00'),
       Timestamp('2017-04-18 00:00:00'), Timestamp('2017-03-18 00:00:00'),
       Timestamp('2017-03-31 00:00:00'), Timestamp('2017-04-03 00:00:00'),
       Timestamp('2017-04-05 00:00:00'), Timestamp('2017-04-06 00:00:00'),
       Timestamp('2017-04-07 00:00:00'), Timestamp('2017-04-08 00:00:00'),
       Timestamp('2017-04-09 00:00:00'), Timestamp('2017-04-20 00:00:00'),
       Timestamp('2017-04-21 00:00:00'), Timestamp('2017-04-22 00:00:00'),
       Timestamp('2017-04-23 00:00:00'), Timestamp('2017-04-24 00:00:00'),
       Timestamp('2017-04-26 00:00:00'), Timestamp('2017-04-27 00:00:00'),
       Timestamp('2017-04-28 00:00:00'), Timestamp('2017-04-29 00:00:00'),
       Timestamp('2017-04-30 00:00:00'), Timestamp('2017-04-16 00:00:00'),
       Timestamp('2017-04-13 00:00:00'), Timestamp('2017-04-02 00:00:00'),
       Timestamp('2017-04

In [16]:
df17.Beat.unique()  ## extra spaces on several results

array(['10H10', '10H20', '10H30', '10H40', '10H50', '10H60', '10H70',
       '10H80', '11H10', '11H20', '11H30', '11H40', '11H50', '12D10',
       '12D20', '12D30', '12D40', '12D50', '12D60', '12D70', '13D10',
       '13D20', '13D30', '13D40', '14D10', '14D20', '14D30', '14D40',
       '14D50', '15E10', '15E20', '15E30', '15E40', '16E10', '16E20',
       '16E30', '16E40', '17E10', '17E20', '17E30', '17E40', '18F10',
       '18F20', '18F30', '18F40', '18F50', '18F60', '19G10', '19G20',
       '19G30', '19G40', '19G50', '1A10', '1A20', '1A30', '1A40', '1A50',
       '20G10', '20G20', '20G30', '20G40', '20G50', '20G60', '20G70',
       '20G80', '21I10', '21I20', '21I30', '21I40', '21I50', '21I60',
       '21I70', '23J50', '24C10', '24C20', '24C30', '24C40', '24C50',
       '24C60', '2A10', '2A20', '2A30', '2A40', '2A50', '2A60', '3B10',
       '3B30', '3B40', '3B50', '4F10', '4F20', '4F30', '5F10', '5F20',
       '5F30', '5F40', '6B10', '6B20', '6B30', '6B40', '6B50', '6B60',
       '7C10

In [17]:
df17.Beat.value_counts(dropna=False)  # needs cleanup

1A20      2622
15E40     2242
12D10     2200
18F20     2008
19G10     2008
17E10     1926
18F40     1903
13D20     1900
3B10      1883
1A30      1832
5F30      1821
6B60      1797
14D20     1774
1A50      1741
20G50     1739
17E40     1733
2A50      1721
18F30     1697
1A10      1645
20G30     1622
20G10     1597
2A30      1581
5F40      1495
7C20      1463
18F50     1395
3B50      1389
10H70     1389
20G80     1374
11H10     1363
9C40      1338
          ... 
24C50       32
12D40       32
24C30       31
13D40       31
10H20       31
11H50       29
21I50       29
24C10       25
18F10       25
24C20       24
12D50       24
21I70       21
9C10        20
1A40        20
14D50       18
21I10       16
21I20       15
11H40       13
21I40       12
24C60       10
13D30       10
23J50       10
21I60        9
24C40        6
21I30        5
21I40        3
21I20        3
23J40        1
UH-3P        1
21I70        1
Name: Beat, Length: 237, dtype: int64

In [18]:
df17.BlockRange.unique()  # UNK val

array(['200-299', '2300-2399', '4600-4699', '100-199', '5100-5199',
       '600-699', '3500-3599', '2900-2999', '4500-4599', '4300-4399',
       '300-399', '4400-4499', '2600-2699', '5400-5499', '1900-1999',
       '3600-3699', '2200-2299', '4000-4099', '3000-3099', '4700-4799',
       'UNK', '5000-5099', '700-799', '3900-3999', '5200-5299',
       '4800-4899', '3800-3899', '1300-1399', '1600-1699', '1200-1299',
       '5300-5399', '4100-4199', '1800-1899', '500-599', '3400-3499',
       '2100-2199', '2000-2099', '1000-1099', '1700-1799', '1500-1599',
       '900-999', '1400-1499', '1100-1199', '2700-2799', '2800-2899',
       '800-899', '400-499', '4200-4299', '2400-2499', '3100-3199',
       '3300-3399', '2500-2599', '3200-3299', '3700-3799', '5500-5599',
       '6200-6299', '5900-5999', '5800-5899', '6500-6599', '6400-6499',
       '6300-6399', '7000-7099', '4900-4999', '7400-7499', '5600-5699',
       '6700-6799', '6600-6699', '6800-6899', '6900-6999', '7300-7399',
       '6100-619

In [19]:
df17.BlockRange.value_counts(dropna=False)  # 1629 UNK values & 1.1103e+006-1.1104e+006 ???

900-999          1652
UNK              1629
100-199          1605
2400-2499        1553
2300-2399        1472
1000-1099        1388
1500-1599        1365
800-899          1360
700-799          1341
200-299          1311
4400-4499        1285
300-399          1278
1400-1499        1267
2500-2599        1247
500-599          1238
1100-1199        1185
2000-2099        1185
600-699          1154
2100-2199        1152
2700-2799        1152
7500-7599        1145
1300-1399        1138
1200-1299        1137
5000-5099        1136
2800-2899        1136
5400-5499        1101
9400-9499        1079
2900-2999        1077
2200-2299        1072
400-499          1069
                 ... 
80300-80399         1
25400-25499         1
25900-25999         1
24800-24899         1
28200-28299         1
133300-133399       1
28100-28199         1
31700-31799         1
31400-31499         1
79000-79099         1
29200-29299         1
27800-27899         1
32600-32699         1
26700-26799         1
32000-3209

In [20]:
df17.StreetName.unique()  # lots extra spaces

array(['CLIFTON', 'CANAL', 'ADAM', ..., 'GREEN LAKER', 'STAFFORD RD',
       'TEALGATE'], dtype=object)

In [21]:
df17.StreetName.value_counts(dropna=False)  # name cleanup too!

WESTHEIMER                                                                                                                                                1993
WESTHEIMER RD                                                                                                                                             1230
GESSNER                                                                                                                                                    980
NORTH                                                                                                                                                      839
GULF                                                                                                                                                       785
LOOP                                                                                                                                                       778
SAM HOUSTON                                   

In [22]:
df17['Offense Type'].unique()  # lots of extra space

array(['Burglary', 'Theft', 'Robbery', 'Auto Theft', 'Aggravated Assault',
       'Rape', 'Murder', 'AutoTheft', 1, 'Burglary                 ',
       'Robbery                  ', 'Theft                    ',
       'AutoTheft                ', 'Aggravated Assault       ',
       'Rape                     ', 'Murder                   '],
      dtype=object)

In [23]:
df17['Offense Type'].value_counts(dropna=False) # combine similar values

Theft                        61411
Burglary                     15713
Aggravated Assault           11156
Robbery                       8956
AutoTheft                     6649
Theft                         6011
Auto Theft                    3874
Burglary                      1371
Rape                          1239
Aggravated Assault            1158
AutoTheft                      978
Robbery                        822
Murder                         235
Rape                           129
Murder                          23
1                                2
Name: Offense Type, dtype: int64

In [24]:
df17.Premise.unique()  # srip extra spaces

array(['Residence or House', 'Restaurant or Cafeteria Parking Lot',
       'Miscellaneous Business (Non-Specific)',
       'Other, Unknown, or Not Listed', 'Bank',
       'Road, Street, or Sidewalk', 'Other Parking Lot',
       'Apartment Parking Lot', 'Convenience Store Parking Lot',
       'Apartment', nan, 'Commercial Building',
       'Vacant Single Occ Resd(House,Townhs,Dplex)', 'Warehouse',
       'Convenience Store', 'Park and Ride Terminal',
       'Commercial Parking Lot or Garage', 'Restaurant or Cafeteria',
       'Garage or Carport', 'Bar or Night Club', 'Service or Gas Station',
       'Construction Site', 'Driveway', 'Grocery Store or Supermarket',
       'Bus Station', 'Bus Stop', 'Light Rail Platform', 'Fire Station',
       'Rental Storage Facility', 'Office Building',
       'Hotel, Motel, Inn, Etc.', 'Department or Discount Store',
       'Specialty Store (Non-Specific)', 'Church, Synagogue, or Temple',
       'Vehicle/Auto Sales/Lease/Auto Parts Store',
       'Mult

In [25]:
df17.Premise.value_counts(dropna=False)  # srip extra spaces

Residence or House                                                                                                                                        13967
Apartment Parking Lot                                                                                                                                     12838
Road, Street, or Sidewalk                                                                                                                                 11297
Apartment                                                                                                                                                 10249
Other Parking Lot                                                                                                                                          4864
Driveway                                                                                                                                                   4260
Restaurant or Cafeteria Parking Lot     

In [26]:
df17.Offenses.unique()

array([ 1.,  4.,  2.,  3.,  5., 10.,  8.,  6.,  7.])

In [27]:
df17.Offenses.value_counts(dropna=False)  # 2 locations with 10 offenses?

1.0     117727
2.0       1661
3.0        227
4.0         70
5.0         21
6.0         10
7.0          7
10.0         2
8.0          2
Name: Offenses, dtype: int64

In [28]:
df17.Hour.unique()

array([15, 17,  9, 19,  0, 14,  2,  6,  7,  8, 13,  1,  3, 20, 21, 22,  4,
       12, 11, 16, 23,  5, 10, 18, 24])

In [29]:
df17.Hour.value_counts(dropna=False)  # 2-3 pm is the most common time for offenses

15    7531
14    7386
10    7209
13    7041
16    6970
9     6836
12    6816
11    6751
17    6515
8     6203
18    5951
19    5587
20    5136
7     5051
21    4509
22    4278
23    3738
6     3253
0     2912
1     2372
2     2315
5     1920
3     1784
4     1660
24       3
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces

In [30]:
len(df17.Premise.unique())

253

In [31]:
df17['Premise'] = df17['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
len(df17.Premise.unique())

133

In [33]:
df17.Premise.value_counts(dropna=False)

Residence or House                           15197
Apartment Parking Lot                        14009
Road, Street, or Sidewalk                    12409
Apartment                                    11302
Other Parking Lot                             5419
Driveway                                      4729
Restaurant or Cafeteria Parking Lot           4177
Department or Discount Store                  3645
Commercial Parking Lot or Garage              3463
Miscellaneous Business (Non-Specific)         3345
Service or Gas Station                        3025
Other, Unknown, or Not Listed                 2599
Strip Business Center Parking Lot             2334
Grocery Store or Supermarket                  2091
Convenience Store                             2069
Restaurant or Cafeteria                       1915
Hotel or Motel Parking Lot                    1730
Grocery Store or Supermarket Parking Lot      1673
Hotel, Motel, Inn, Etc.                       1314
Convenience Store Parking Lot  

# Cleanup
## Offense Type Column

- strip empty spaces
- join similar values
- find value of 1 row
    - value 1 change to nan values, will fix later

In [34]:
df17['Offense Type'].value_counts(dropna=False)

Theft                        61411
Burglary                     15713
Aggravated Assault           11156
Robbery                       8956
AutoTheft                     6649
Theft                         6011
Auto Theft                    3874
Burglary                      1371
Rape                          1239
Aggravated Assault            1158
AutoTheft                      978
Robbery                        822
Murder                         235
Rape                           129
Murder                          23
1                                2
Name: Offense Type, dtype: int64

In [35]:
df17['Offense Type'].unique()

array(['Burglary', 'Theft', 'Robbery', 'Auto Theft', 'Aggravated Assault',
       'Rape', 'Murder', 'AutoTheft', 1, 'Burglary                 ',
       'Robbery                  ', 'Theft                    ',
       'AutoTheft                ', 'Aggravated Assault       ',
       'Rape                     ', 'Murder                   '],
      dtype=object)

In [36]:
len(df17['Offense Type'].unique())

16

In [37]:
df17['Offense Type'] = df17['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df17['Offense Type'].value_counts(dropna=False)

Theft                 67422
Burglary              17084
Aggravated Assault    12314
Robbery                9778
AutoTheft              7627
Auto Theft             3874
Rape                   1368
Murder                  258
NaN                       2
Name: Offense Type, dtype: int64

In [39]:
len(df17['Offense Type'].unique())

9

In [40]:
df17['Offense Type'].value_counts(dropna=False) # need to join AutoTheft & Auto Theft

Theft                 67422
Burglary              17084
Aggravated Assault    12314
Robbery                9778
AutoTheft              7627
Auto Theft             3874
Rape                   1368
Murder                  258
NaN                       2
Name: Offense Type, dtype: int64

In [41]:
df17 = df17.replace('Auto Theft','AutoTheft')

In [42]:
len(df17['Offense Type'].unique())

8

In [43]:
df17['Offense Type'].value_counts(dropna=False)  # still have nan values

Theft                 67422
Burglary              17084
Aggravated Assault    12314
AutoTheft             11501
Robbery                9778
Rape                   1368
Murder                  258
NaN                       2
Name: Offense Type, dtype: int64

In [44]:
df17[df17['Offense Type'].isnull()]  # nan values, will fix later, they used to be value 1

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
35775,2017-02-15 00:00:00,20G40,1400-1499,DAIRY ASHFORD,,Apartment,1.0,23
38357,2016-12-31 00:00:00,6B20,4200-4299,OAK SHADOWS,,Residence or House,1.0,1


# Cleanup
## StreetName Column

- strip empty spaces
- find similar values and combine ( needs done)

In [45]:
df17.StreetName.value_counts(dropna=False).head()

WESTHEIMER       1993
WESTHEIMER RD    1230
GESSNER           980
NORTH             839
GULF              785
Name: StreetName, dtype: int64

In [46]:
len(df17.StreetName.unique())

21587

In [47]:
df17['StreetName'] = df17['StreetName'].str.strip()

In [48]:
len(df17.StreetName.unique())

18672

In [49]:
df17.StreetName.value_counts(dropna=False)  # WESTHEIMER & WESTHEIMER DR are similar, need cleanup

WESTHEIMER                 2101
WESTHEIMER RD              1438
GESSNER                    1028
NORTH                       882
LOOP                        833
GULF                        828
SAM HOUSTON                 807
RICHMOND                    771
KATY                        743
NORTHWEST                   710
SOUTHWEST                   695
MAIN                        675
BISSONNET                   648
RICHMOND AVE                621
BELLAIRE                    558
BELLFORT                    548
POST OAK                    536
BEECHNUT                    513
SHEPHERD                    474
FONDREN                     459
TIDWELL                     458
BISSONNET ST                437
BELLAIRE BLVD               426
LITTLE YORK                 411
BROADWAY                    410
WAYSIDE                     403
EAST                        394
FM 1960                     374
BEECHNUT ST                 365
KIRBY                       355
                           ... 
LAKE GEN

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [50]:
df17.BlockRange.value_counts(dropna=False).head()  # find UNK

900-999      1652
UNK          1629
100-199      1605
2400-2499    1553
2300-2399    1472
Name: BlockRange, dtype: int64

In [51]:
unk = df17.BlockRange == 'UNK'  # boolean mask

In [52]:
df17[unk]  # 1629 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
33,2017-04-16 00:00:00,10H20,UNK,2 RUSK,Burglary,Residence or House,1.0,22
71,2017-04-17 00:00:00,10H30,UNK,CHARTRES,Aggravated Assault,"Road, Street, or Sidewalk",1.0,2
139,2017-04-17 00:00:00,10H40,UNK,WESTHEIMER AND HWY 6,Theft,Restaurant or Cafeteria,1.0,12
275,2017-01-06 00:00:00,10H50,UNK,PIERCE,Theft,Commercial Building,1.0,13
314,2017-04-28 00:00:00,10H50,UNK,HOLMAN,Theft,Residence or House,1.0,6
454,2017-04-10 00:00:00,10H70,UNK,BAYLOR PLAZA,Theft,University or College,1.0,17
457,2017-04-10 00:00:00,10H70,UNK,HERMANN MUSEUM,Theft,Apartment,1.0,10
465,2017-04-12 00:00:00,10H70,UNK,BAYLOR PLAZA,Theft,Private School,1.0,16
467,2017-04-13 00:00:00,10H70,UNK,HERMANN MUSEUM,Theft,Apartment Parking Lot,1.0,18
474,2017-04-14 00:00:00,10H70,UNK,HERMANN MUSEUM,Theft,Apartment Parking Lot,1.0,13


# Cleanup
## Beat Column

- strip empty spaces

In [53]:
df17.Beat.unique()

array(['10H10', '10H20', '10H30', '10H40', '10H50', '10H60', '10H70',
       '10H80', '11H10', '11H20', '11H30', '11H40', '11H50', '12D10',
       '12D20', '12D30', '12D40', '12D50', '12D60', '12D70', '13D10',
       '13D20', '13D30', '13D40', '14D10', '14D20', '14D30', '14D40',
       '14D50', '15E10', '15E20', '15E30', '15E40', '16E10', '16E20',
       '16E30', '16E40', '17E10', '17E20', '17E30', '17E40', '18F10',
       '18F20', '18F30', '18F40', '18F50', '18F60', '19G10', '19G20',
       '19G30', '19G40', '19G50', '1A10', '1A20', '1A30', '1A40', '1A50',
       '20G10', '20G20', '20G30', '20G40', '20G50', '20G60', '20G70',
       '20G80', '21I10', '21I20', '21I30', '21I40', '21I50', '21I60',
       '21I70', '23J50', '24C10', '24C20', '24C30', '24C40', '24C50',
       '24C60', '2A10', '2A20', '2A30', '2A40', '2A50', '2A60', '3B10',
       '3B30', '3B40', '3B50', '4F10', '4F20', '4F30', '5F10', '5F20',
       '5F30', '5F40', '6B10', '6B20', '6B30', '6B40', '6B50', '6B60',
       '7C10

In [54]:
len(df17.Beat.unique())

237

In [55]:
df17['Beat'] = df17['Beat'].str.strip()

In [56]:
len(df17.Beat.unique())

120

In [57]:
df17.Beat.value_counts(dropna=False)

1A20     2852
15E40    2443
12D10    2430
19G10    2205
18F20    2202
18F40    2100
17E10    2098
13D20    2062
3B10     2038
6B60     1993
1A30     1991
14D20    1980
5F30     1965
1A50     1893
17E40    1891
2A50     1870
18F30    1870
20G50    1860
1A10     1819
20G30    1781
20G10    1739
2A30     1730
5F40     1651
7C20     1597
18F50    1559
10H70    1556
3B50     1526
11H10    1494
9C40     1489
20G80    1485
         ... 
12D60     434
13D40     392
8C20      380
12D40     374
24C50     361
21I50     355
8C40      347
10H20     329
14D50     324
24C30     311
24C20     303
24C10     293
11H50     265
9C10      254
18F10     253
12D50     235
21I10     221
1A40      202
11H40     164
23J50     146
24C40     129
24C60     126
13D30     124
21I60      62
21I70      22
21I20      18
21I40      15
21I30       5
UH-3P       1
23J40       1
Name: Beat, Length: 120, dtype: int64

In [58]:
df17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119727 entries, 0 to 119726
Data columns (total 8 columns):
Date            119727 non-null object
Beat            119727 non-null object
BlockRange      119727 non-null object
StreetName      119718 non-null object
Offense Type    119725 non-null object
Premise         119240 non-null object
Offenses        119727 non-null float64
Hour            119727 non-null int64
dtypes: float64(1), int64(1), object(6)
memory usage: 7.3+ MB


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [59]:
df17.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
0,2017-04-10 00:00:00,10H10,200-299,CLIFTON,Burglary,Residence or House,1.0,15
1,2017-04-11 00:00:00,10H10,2300-2399,CANAL,Theft,Restaurant or Cafeteria Parking Lot,1.0,15
2,2017-04-11 00:00:00,10H10,2300-2399,CANAL,Theft,Restaurant or Cafeteria Parking Lot,1.0,17
3,2017-04-12 00:00:00,10H10,4600-4699,CANAL,Burglary,Miscellaneous Business (Non-Specific),1.0,9
4,2017-04-12 00:00:00,10H10,100-199,ADAM,Theft,"Other, Unknown, or Not Listed",1.0,19


In [60]:
df17['Date'] = pd.to_datetime(df17['Date'])

df17 = df17.set_index('Date').sort_index(ascending=True)

In [61]:
df17.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1917-01-20,20G20,2100-2199,GESSNER,Theft,Apartment Parking Lot,1.0,12
1917-02-20,17E20,9100-9199,FONDREN,AutoTheft,"Road, Street, or Sidewalk",1.0,15
1917-02-20,10H70,7500-7599,ARDMORE,Theft,Other Parking Lot,1.0,16
1917-04-16,18F20,1100-1199,UPTOWN PARK,Rape,Bar or Night Club,1.0,12
1917-12-18,9C20,500-599,PORTWALL,Theft,Miscellaneous Business (Non-Specific),1.0,14


In [62]:
df17.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 119727 entries, 1917-01-20 to 2017-12-31
Data columns (total 7 columns):
Beat            119727 non-null object
BlockRange      119727 non-null object
StreetName      119718 non-null object
Offense Type    119725 non-null object
Premise         119240 non-null object
Offenses        119727 non-null float64
Hour            119727 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 7.3+ MB


## Odd dates
`DatetimeIndex: 119727 entries, 1917-01-20 to 2017-12-31`

- some values are not from this year, lets look

In [63]:
df2017 = df17.loc['2017-01-01':'2017-12-31']  # rows with date from 01,01,17 - 12,31,17

In [64]:
df2007_wrong_date = df17[:"2016"]  # rows with year 0  upto 2016

In [65]:
df2017.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 118581 entries, 2017-01-01 to 2017-12-31
Data columns (total 7 columns):
Beat            118581 non-null object
BlockRange      118581 non-null object
StreetName      118573 non-null object
Offense Type    118580 non-null object
Premise         118107 non-null object
Offenses        118581 non-null float64
Hour            118581 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 7.2+ MB


# NAN values

- Premise nan : 474 rows
- streetName nan : 8 rows
- Offense Type nan : 1

In [101]:
premise_nan = df2017.Premise.isnull()  #
str_nan = df2017.StreetName.isnull()
off_nan = df2017['Offense Type'].isnull()

In [96]:
beat_mask = df2017['Beat'] == '2A30'  # beat 2A30 mask
res_house = df2017.Premise =='Residence or House'
rd_side = df2017.Premise =='Road, Street, or Sidewalk'
vac_res = df2017.Premise =='Vacant Single Occ Resd(House,Townhs,Dplex)'
theft_msk = df2017['Offense Type'] == 'Theft'
burglary_msk = df2017['Offense Type'] == 'Burglary'

In [110]:
df2017[(beat_mask)& (str_nan)]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-16,2A30,UNK,,Theft,Residence or House,1.0,15
2017-01-20,2A30,UNK,,Theft,"Road, Street, or Sidewalk",1.0,22
2017-01-26,2A30,UNK,,Burglary,Residence or House,1.0,13
2017-02-25,2A30,UNK,,Theft,"Road, Street, or Sidewalk",1.0,9
2017-03-11,2A30,UNK,,Burglary,"Vacant Single Occ Resd(House,Townhs,Dplex)",1.0,0
2017-04-02,2A30,UNK,,Theft,Residence or House,1.0,11


In [111]:
df2017.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 118581 entries, 2017-01-01 to 2017-12-31
Data columns (total 7 columns):
Beat            118581 non-null object
BlockRange      118581 non-null object
StreetName      118573 non-null object
Offense Type    118580 non-null object
Premise         118107 non-null object
Offenses        118581 non-null float64
Hour            118581 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 9.7+ MB


## DROP nan
drop nan values of StreetName

In [112]:
df2017.head()

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,Offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-01,20G80,14600-14699,BRIAR FOREST,Theft,Apartment Parking Lot,1.0,12
2017-01-01,19G50,11700-11799,WILCREST,Theft,Bar or Night Club Parking Lot,1.0,19
2017-01-01,3B50,7300-7399,IRVINGTON,AutoTheft,Bar or Night Club,1.0,12
2017-01-01,3B50,6500-6599,AMASA,Aggravated Assault,"Road, Street, or Sidewalk",1.0,2
2017-01-01,2A50,6700-6799,ARNOT,Theft,"Parks and Recreation, Zoo, Swimming Pool",1.0,9


In [116]:
df2017 = df2017.dropna(subset=['StreetName'])  # drop nan values from StreetName, 8 rows

In [117]:
df2017.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 118573 entries, 2017-01-01 to 2017-12-31
Data columns (total 7 columns):
Beat            118573 non-null object
BlockRange      118573 non-null object
StreetName      118573 non-null object
Offense Type    118572 non-null object
Premise         118099 non-null object
Offenses        118573 non-null float64
Hour            118573 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 7.2+ MB


In [130]:
df2017['Premise'] = df2017['Premise'].fillna('unk')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [137]:
df2017.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 118573 entries, 2017-01-01 to 2017-12-31
Data columns (total 7 columns):
Beat            118573 non-null object
BlockRange      118573 non-null object
StreetName      118573 non-null object
Offense Type    118572 non-null object
Premise         118573 non-null object
Offenses        118573 non-null float64
Hour            118573 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 9.7+ MB


In [138]:
df2017['Offense Type'].value_counts(dropna=False)

Theft                 66789
Burglary              16910
Aggravated Assault    12264
AutoTheft             11386
Robbery                9753
Rape                   1224
Murder                  246
NaN                       1
Name: Offense Type, dtype: int64

## Fillna
will use this since Dont know what to do... :/

In [139]:
df2017['Offense Type'].fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [141]:
df2017.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 118573 entries, 2017-01-01 to 2017-12-31
Data columns (total 7 columns):
Beat            118573 non-null object
BlockRange      118573 non-null object
StreetName      118573 non-null object
Offense Type    118573 non-null object
Premise         118573 non-null object
Offenses        118573 non-null float64
Hour            118573 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 9.7+ MB


## Save clean data to  to csv

In [143]:
df2017.to_csv('clean_data/crime_data/crime17_clean.csv')

In [144]:
ls clean_data/crime_data/

[0m[01;32mcrime17_clean.csv[0m*
