In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data_raw/2015

[0m[01;32mapr15.xls[0m*  [01;32mdec15.xls[0m*  [01;32mjan15.xls[0m*  [01;32mjun15.xls[0m*  [01;32mmay15.xls[0m*  [01;32moct15.xls[0m*
[01;32maug15.xls[0m*  [01;32mfeb15.xls[0m*  [01;32mjul15.xls[0m*  [01;32mmar15.xls[0m*  [01;32mnov15.xls[0m*  [01;32msep15.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data_raw/2015'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120009 entries, 0 to 120008
Data columns (total 10 columns):
Date            120009 non-null datetime64[ns]
Hour            120009 non-null int64
Offense Type    120009 non-null object
Beat            120009 non-null object
Premise         117881 non-null object
BlockRange      120009 non-null object
StreetName      120009 non-null object
Type            120009 non-null object
Suffix          120009 non-null object
# offenses      120009 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.2+ MB


In [5]:
df.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,BlockRange,StreetName,Type,Suffix,# offenses
0,2015-02-26,12,Theft,10H10,"Road, Street, or Sidewalk",4900-4999,CANAL,ST,-,1
1,2015-04-05,16,Burglary,10H10,Residence or House,100-199,YORK,-,-,1
2,2015-04-06,20,Rape,10H10,Residence or House,UNK,CANAL,CT,-,1
3,2015-04-16,8,Theft,10H10,Construction Site,400-499,ENNIS,ST,-,1
4,2015-04-01,19,Theft,10H10,,UNK,SIDNEY,-,-,1


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'BlockRange',
       'StreetName', 'Type', 'Suffix', '# offenses'],
      dtype='object')

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120009 entries, 0 to 120008
Data columns (total 10 columns):
Date            120009 non-null datetime64[ns]
Hour            120009 non-null int64
Offense Type    120009 non-null object
Beat            120009 non-null object
Premise         117881 non-null object
BlockRange      120009 non-null object
StreetName      120009 non-null object
Type            120009 non-null object
Suffix          120009 non-null object
# offenses      120009 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.2+ MB


In [9]:
df1.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,BlockRange,StreetName,Type,Suffix,# offenses
0,2015-02-26,12,Theft,10H10,"Road, Street, or Sidewalk",4900-4999,CANAL,ST,-,1
1,2015-04-05,16,Burglary,10H10,Residence or House,100-199,YORK,-,-,1
2,2015-04-06,20,Rape,10H10,Residence or House,UNK,CANAL,CT,-,1
3,2015-04-16,8,Theft,10H10,Construction Site,400-499,ENNIS,ST,-,1
4,2015-04-01,19,Theft,10H10,,UNK,SIDNEY,-,-,1


## create a subdataframe with the columns that we want

In [10]:
df15 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [11]:
df15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120009 entries, 0 to 120008
Data columns (total 8 columns):
Date            120009 non-null datetime64[ns]
Beat            120009 non-null object
BlockRange      120009 non-null object
StreetName      120009 non-null object
Offense Type    120009 non-null object
Premise         117881 non-null object
# offenses      120009 non-null int64
Hour            120009 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.3+ MB


In [12]:
df15.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
120004,2015-09-07,UNK,6500-6599,MCHARD,Aggravated Assault,Residence or House,1,17
120005,2015-09-19,UNK,300-399,KIRKSTALL,Theft,"Road, Street, or Sidewalk",1,16
120006,2015-09-14,UNK,10600-10699,EASTEX,Theft,"Physician, Doctor, Dentist's Office",1,16
120007,2015-09-25,UNK,1700-1799,JOHNSON,Theft,Residence or House,1,14
120008,2015-09-28,UNK,1400-1499,BAY AREA,Theft,Other Parking Lot,1,15


## Now we can inspect df

In [13]:
df15.Date.unique() # timestamp

array(['2015-02-26T00:00:00.000000000', '2015-04-05T00:00:00.000000000',
       '2015-04-06T00:00:00.000000000', '2015-04-16T00:00:00.000000000',
       '2015-04-01T00:00:00.000000000', '2015-04-02T00:00:00.000000000',
       '2015-04-14T00:00:00.000000000', '2015-04-18T00:00:00.000000000',
       '2015-04-22T00:00:00.000000000', '2015-03-28T00:00:00.000000000',
       '2015-04-03T00:00:00.000000000', '2015-04-19T00:00:00.000000000',
       '2015-04-20T00:00:00.000000000', '2015-04-24T00:00:00.000000000',
       '2009-05-18T00:00:00.000000000', '2015-03-31T00:00:00.000000000',
       '2015-04-09T00:00:00.000000000', '2015-04-12T00:00:00.000000000',
       '2015-04-23T00:00:00.000000000', '2015-01-05T00:00:00.000000000',
       '2015-03-30T00:00:00.000000000', '2015-04-04T00:00:00.000000000',
       '2015-04-07T00:00:00.000000000', '2015-04-21T00:00:00.000000000',
       '2015-04-08T00:00:00.000000000', '2015-04-10T00:00:00.000000000',
       '2015-04-17T00:00:00.000000000', '2015-04-13

In [14]:
df15.Beat.unique()  # UNK

array(['10H10', '10H20', '10H30', '10H40', '10H50', '10H60', '10H70',
       '10H80', '11H10', '11H20', '11H30', '11H40', '11H50', '12D10',
       '12D20', '12D30', '12D40', '12D50', '12D60', '12D70', '13D10',
       '13D20', '13D30', '13D40', '14D10', '14D20', '14D30', '14D40',
       '14D50', '15E10', '15E20', '15E30', '15E40', '16E10', '16E20',
       '16E30', '16E40', '17E10', '17E20', '17E30', '17E40', '18F10',
       '18F20', '18F30', '18F40', '18F50', '18F60', '19G10', '19G20',
       '19G30', '19G40', '19G50', '1A10', '1A20', '1A30', '1A40', '1A50',
       '20G10', '20G20', '20G30', '20G40', '20G50', '20G60', '20G70',
       '20G80', '21I10', '21I30', '21I50', '23J50', '24C10', '24C20',
       '24C30', '24C40', '24C50', '24C60', '2A10', '2A20', '2A30', '2A40',
       '2A50', '2A60', '3B10', '3B30', '3B40', '3B50', '4F10', '4F20',
       '4F30', '5F10', '5F20', '5F30', '5F40', '6B10', '6B20', '6B30',
       '6B40', '6B50', '6B60', '7C10', '7C20', '7C30', '7C40', '7C50',
       '

In [15]:
df15.Beat.value_counts(dropna=False)  # needs cleanup

1A20     2716
13D20    2667
19G10    2548
6B60     2493
12D10    2466
17E10    2126
18F20    2074
15E40    2043
3B10     2026
1A30     1892
18F40    1879
5F30     1844
1A10     1843
20G50    1843
1A50     1817
11H10    1790
17E40    1785
5F40     1771
18F30    1724
14D20    1710
2A50     1701
20G30    1700
7C20     1689
6B10     1688
9C40     1608
18F50    1573
3B50     1550
2A30     1492
14D10    1483
10H70    1479
         ... 
20G20     493
2A40      481
21I50     470
10H10     455
8C20      410
8C40      396
12D60     377
14D50     332
10H20     326
11H50     314
24C50     311
24C10     270
1A40      268
9C10      263
24C20     249
24C30     235
18F10     228
12D50     200
11H40     199
13D30     159
24C60     140
21I10     138
23J50     125
24C40     118
21I60      18
21I30      13
21I40      10
21I20       3
3B20        1
21I70       1
Name: Beat, Length: 119, dtype: int64

In [16]:
df15.BlockRange.unique()  # UNK val

array(['4900-4999', '100-199', 'UNK', '400-499', '1900-1999', '2200-2299',
       '2500-2599', '200-299', '300-399', '4500-4599', '3500-3599',
       '6200-6299', '4700-4799', '500-599', '2600-2699', '700-799',
       '3300-3399', '5300-5399', '2300-2399', '2000-2099', '800-899',
       '1100-1199', '4200-4299', '3800-3899', '5000-5099', '1400-1499',
       '1200-1299', '5200-5299', '4100-4199', '4400-4499', '3900-3999',
       '5100-5199', '4300-4399', '5400-5499', '1500-1599', '1000-1099',
       '2100-2199', '3700-3799', '1700-1799', '600-699', '2800-2899',
       '1800-1899', '900-999', '3100-3199', '2900-2999', '1600-1699',
       '1300-1399', '2400-2499', '2700-2799', '3600-3699', '4000-4099',
       '4600-4699', '3400-3499', '3000-3099', '3200-3299', '5900-5999',
       '5500-5599', '6400-6499', '8500-8599', '6500-6599', '6600-6699',
       '5600-5699', '5700-5799', '7100-7199', '6700-6799', '7000-7099',
       '6100-6199', '6300-6399', '7200-7299', '7300-7399', '6000-6099',
   

In [17]:
df15.BlockRange.value_counts(dropna=False)  # 1560 UNK values

2300-2399        1678
100-199          1663
900-999          1577
UNK              1560
800-899          1496
300-399          1465
2400-2499        1418
700-799          1417
1000-1099        1395
4400-4499        1354
7900-7999        1334
200-299          1333
9400-9499        1268
500-599          1252
2500-2599        1249
7500-7599        1183
1500-1599        1174
1100-1199        1174
9500-9599        1174
5000-5099        1165
600-699          1131
1400-1499        1124
2700-2799        1121
1300-1399        1102
2000-2099        1092
5800-5899        1071
1200-1299        1069
3800-3899        1048
6100-6199        1046
6000-6099        1045
                 ... 
23200-23299         3
23800-23899         3
21200-21299         3
21800-21899         3
22400-22499         2
24800-24899         2
27600-27699         2
25000-25099         2
22000-22099         2
21900-21999         2
23600-23699         2
23300-23399         2
19900-19999         2
20000-20099         2
202600-202

In [18]:
df15.StreetName.unique()  # 

array(['CANAL', 'YORK', 'ENNIS', ..., 'LINBROOK', 'TEXAS LAUREL',
       'KIRKSTALL'], dtype=object)

In [19]:
df15.StreetName.value_counts(dropna=False)  # name cleanup too, street called A??

WESTHEIMER                 3623
GULF                       2138
SAM HOUSTON                1718
NORTH                      1701
LOOP                       1573
GESSNER                    1567
RICHMOND                   1432
KATY                       1378
SOUTHWEST                  1365
MAIN                       1135
BISSONNET                  1102
NORTHWEST                  1074
BELLFORT                   1025
BELLAIRE                   1003
FM 1960                     995
POST OAK                    910
BROADWAY                    897
BEECHNUT                    879
WAYSIDE                     876
FONDREN                     868
TIDWELL                     859
GREENS                      848
EAST                        751
LITTLE YORK                 741
SHEPHERD                    723
AIRLINE                     651
KIRBY                       606
HILLCROFT                   603
FANNIN                      602
TELEPHONE                   586
                           ... 
HATTIE  

In [20]:
df15['Offense Type'].unique()  # 

array(['Theft', 'Burglary', 'Rape', 'Murder', 'Auto Theft',
       'Aggravated Assault', 'Robbery', 1], dtype=object)

In [21]:
df15['Offense Type'].value_counts(dropna=False) # 

Theft                 66309
Burglary              19824
Auto Theft            13113
Robbery               10274
Aggravated Assault     9209
Rape                    985
Murder                  287
1                         8
Name: Offense Type, dtype: int64

In [22]:
df15.Premise.unique()  #

array(['Road, Street, or Sidewalk', 'Residence or House',
       'Construction Site', nan, 'Apartment', 'Other Parking Lot',
       "Physician, Doctor, Dentist's Office", 'Apartment Parking Lot',
       'Driveway', 'Warehouse', 'Commercial Parking Lot or Garage',
       'Service or Gas Station', 'Bus Stop',
       'Convenience Store Parking Lot',
       'Parks and Recreation, Zoo, Swimming Pool', 'Hospital',
       'Miscellaneous Business (Non-Specific)',
       'Vacant Single Occ Resd(House,Townhs,Dplex)',
       'Other, Unknown, or Not Listed', 'Convenience Store',
       'Bar or Night Club Parking Lot', 'Field, Woods, Forest, Park',
       'Restaurant or Cafeteria', 'Auto Repair',
       'Stadium, Sports Arena, Race Track',
       'Government or Public Building', 'Bar or Night Club',
       'Commercial Building', 'Factory, Manufacturing, or Industrial',
       'Grocery Store or Supermarket', 'Light Rail Platform',
       'Office Building', 'Garage or Carport', 'Bus Station',
       

In [23]:
df15.Premise.value_counts(dropna=False)  # 1912  # nan values

Residence or House                           16814
Apartment Parking Lot                        13220
Apartment                                    11815
Road, Street, or Sidewalk                    11390
Restaurant or Cafeteria Parking Lot           4897
Driveway                                      4726
Department or Discount Store                  4195
Other Parking Lot                             3807
Miscellaneous Business (Non-Specific)         3100
Service or Gas Station                        2939
Commercial Parking Lot or Garage              2882
Grocery Store or Supermarket                  2706
Strip Business Center Parking Lot             2253
NaN                                           2128
Convenience Store                             1912
Restaurant or Cafeteria                       1827
Other, Unknown, or Not Listed                 1800
Hotel or Motel Parking Lot                    1669
Grocery Store or Supermarket Parking Lot      1451
Hotel, Motel, Inn, Etc.        

In [24]:
df15['# offenses'].unique()

array([ 1,  2,  3,  5,  4,  6, 12, 17,  7, 16, 33,  9,  8, 13])

In [25]:
df15['# offenses'].value_counts(dropna=False)  # 2 locations with 10 offenses?

1     118206
2       1539
3        183
4         40
5         14
6         10
8          5
7          3
17         2
16         2
9          2
33         1
13         1
12         1
Name: # offenses, dtype: int64

In [26]:
df15.Hour.unique()

array([12, 16, 20,  8, 19,  9, 13, 10, 18, 15, 11, 17,  1, 21,  7, 14, 22,
        0, 23,  5,  4,  2,  6,  3])

In [27]:
df15.Hour.value_counts(dropna=False)  #

15    7474
14    7352
16    7325
12    7061
11    7032
10    7025
13    6950
9     6790
17    6680
8     6151
18    5998
19    5571
20    5300
7     4825
21    4547
22    4326
23    3749
6     3064
0     2889
1     2434
2     2216
5     1831
3     1796
4     1623
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces  : NOT NEEDED
- find nan values : 2128

In [28]:
len(df15.Premise.unique())

125

In [29]:
df15['Premise'] = df15['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
len(df15.Premise.unique())

125

In [31]:
df15.Premise.value_counts(dropna=False)

Residence or House                           16814
Apartment Parking Lot                        13220
Apartment                                    11815
Road, Street, or Sidewalk                    11390
Restaurant or Cafeteria Parking Lot           4897
Driveway                                      4726
Department or Discount Store                  4195
Other Parking Lot                             3807
Miscellaneous Business (Non-Specific)         3100
Service or Gas Station                        2939
Commercial Parking Lot or Garage              2882
Grocery Store or Supermarket                  2706
Strip Business Center Parking Lot             2253
NaN                                           2128
Convenience Store                             1912
Restaurant or Cafeteria                       1827
Other, Unknown, or Not Listed                 1800
Hotel or Motel Parking Lot                    1669
Grocery Store or Supermarket Parking Lot      1451
Hotel, Motel, Inn, Etc.        

In [32]:
df15[df15.Premise.isnull()].head()  # 2128

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
4,2015-04-01,10H10,UNK,SIDNEY,Theft,,1,19
14,2009-05-18,10H10,4500-4599,CANAL,Theft,,1,11
27,2015-04-21,10H10,3300-3399,NAVIGATION,Theft,,1,13
39,2015-04-02,10H20,3800-3899,LAMAR,Burglary,,1,0
56,2015-04-07,10H20,4300-4399,COYLE,Theft,,1,14


# Cleanup
## Offense Type Column

- strip empty spaces  : NOT NEEDED
- join similar values : NOT NEEDED
- find value of 1 row
    - value 1 change to nan values, will fix later

In [33]:
df15['Offense Type'].value_counts(dropna=False)

Theft                 66309
Burglary              19824
Auto Theft            13113
Robbery               10274
Aggravated Assault     9209
Rape                    985
Murder                  287
1                         8
Name: Offense Type, dtype: int64

In [34]:
df15['Offense Type'].unique()

array(['Theft', 'Burglary', 'Rape', 'Murder', 'Auto Theft',
       'Aggravated Assault', 'Robbery', 1], dtype=object)

In [35]:
len(df15['Offense Type'].unique())

8

In [36]:
df15['Offense Type'] = df15['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [37]:
df15['Offense Type'].value_counts(dropna=False)

Theft                 66309
Burglary              19824
Auto Theft            13113
Robbery               10274
Aggravated Assault     9209
Rape                    985
Murder                  287
NaN                       8
Name: Offense Type, dtype: int64

In [38]:
len(df15['Offense Type'].unique())

8

In [39]:
df15[df15['Offense Type'].isnull()]  # nan values, will fix later, they used to be value 1

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
3538,2015-04-10,17E40,11200-11299,FONDREN,,Service or Gas Station,1,18
24565,2015-12-04,17E40,7800-7899,AIRPORT,,Apartment,1,13
40495,2015-01-01,12D10,800-899,EASTLAKE,,,1,0
51606,2015-07-30,13D40,8400-8499,TAVENOR,,Residence or House,1,16
77596,2015-03-19,4F10,9300-9399,LONG POINT,,Apartment,1,21
79969,2015-04-05,10H20,4900-4999,MCKINNEY,,"Road, Street, or Sidewalk",1,20
99822,2013-08-31,8C50,9200-9299,NYSSA,,,1,0
110226,2013-09-27,UNK,1500-1599,BAILEY ST 424,,Apartment,1,6


# Cleanup
## StreetName Column

- strip empty spaces
- find similar values and combine ( needs done)

In [40]:
df15.StreetName.value_counts(dropna=False).head()

WESTHEIMER     3623
GULF           2138
SAM HOUSTON    1718
NORTH          1701
LOOP           1573
Name: StreetName, dtype: int64

In [41]:
len(df15.StreetName.unique())

7918

In [42]:
df15['StreetName'] = df15['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
len(df15.StreetName.unique())

7913

In [44]:
df15.StreetName.value_counts(dropna=False)  # WESTHEIMER & WESTHEIMER DR are similar, need cleanup

WESTHEIMER             3623
GULF                   2138
SAM HOUSTON            1718
NORTH                  1701
LOOP                   1573
GESSNER                1567
RICHMOND               1432
KATY                   1378
SOUTHWEST              1365
MAIN                   1135
BISSONNET              1102
NORTHWEST              1074
BELLFORT               1025
BELLAIRE               1003
FM 1960                 995
POST OAK                910
BROADWAY                897
BEECHNUT                879
WAYSIDE                 876
FONDREN                 868
TIDWELL                 859
GREENS                  848
EAST                    751
LITTLE YORK             741
SHEPHERD                723
AIRLINE                 651
KIRBY                   606
HILLCROFT               603
FANNIN                  602
TELEPHONE               586
                       ... 
GRAY RIDGE                1
BEN                       1
FAWNGROVE                 1
KINGSPASS                 1
IRON ROCK           

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [45]:
df15.BlockRange.value_counts(dropna=False).head()  # find UNK:1560

2300-2399    1678
100-199      1663
900-999      1577
UNK          1560
800-899      1496
Name: BlockRange, dtype: int64

In [46]:
unk = df15.BlockRange == 'UNK'  # boolean mask

In [47]:
df15[unk].head()  # 1560 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
2,2015-04-06,10H10,UNK,CANAL,Rape,Residence or House,1,20
4,2015-04-01,10H10,UNK,SIDNEY,Theft,,1,19
7,2015-04-18,10H10,UNK,ENNIS,Rape,Residence or House,1,10
13,2015-04-24,10H10,UNK,MARSDEN,Burglary,Residence or House,1,18
18,2015-04-23,10H10,UNK,STILES,Theft,Other Parking Lot,1,17


# Cleanup
## Beat Column

- strip empty spaces : NOT NEEDED
- create mask for UNK beat : 750

In [48]:
df15.Beat.unique()

array(['10H10', '10H20', '10H30', '10H40', '10H50', '10H60', '10H70',
       '10H80', '11H10', '11H20', '11H30', '11H40', '11H50', '12D10',
       '12D20', '12D30', '12D40', '12D50', '12D60', '12D70', '13D10',
       '13D20', '13D30', '13D40', '14D10', '14D20', '14D30', '14D40',
       '14D50', '15E10', '15E20', '15E30', '15E40', '16E10', '16E20',
       '16E30', '16E40', '17E10', '17E20', '17E30', '17E40', '18F10',
       '18F20', '18F30', '18F40', '18F50', '18F60', '19G10', '19G20',
       '19G30', '19G40', '19G50', '1A10', '1A20', '1A30', '1A40', '1A50',
       '20G10', '20G20', '20G30', '20G40', '20G50', '20G60', '20G70',
       '20G80', '21I10', '21I30', '21I50', '23J50', '24C10', '24C20',
       '24C30', '24C40', '24C50', '24C60', '2A10', '2A20', '2A30', '2A40',
       '2A50', '2A60', '3B10', '3B30', '3B40', '3B50', '4F10', '4F20',
       '4F30', '5F10', '5F20', '5F30', '5F40', '6B10', '6B20', '6B30',
       '6B40', '6B50', '6B60', '7C10', '7C20', '7C30', '7C40', '7C50',
       '

In [49]:
len(df15.Beat.unique())

119

In [50]:
df15['Beat'] = df15['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [51]:
len(df15.Beat.unique())

119

In [52]:
df15.Beat.value_counts(dropna=False)

1A20     2716
13D20    2667
19G10    2548
6B60     2493
12D10    2466
17E10    2126
18F20    2074
15E40    2043
3B10     2026
1A30     1892
18F40    1879
5F30     1844
1A10     1843
20G50    1843
1A50     1817
11H10    1790
17E40    1785
5F40     1771
18F30    1724
14D20    1710
2A50     1701
20G30    1700
7C20     1689
6B10     1688
9C40     1608
18F50    1573
3B50     1550
2A30     1492
14D10    1483
10H70    1479
         ... 
20G20     493
2A40      481
21I50     470
10H10     455
8C20      410
8C40      396
12D60     377
14D50     332
10H20     326
11H50     314
24C50     311
24C10     270
1A40      268
9C10      263
24C20     249
24C30     235
18F10     228
12D50     200
11H40     199
13D30     159
24C60     140
21I10     138
23J50     125
24C40     118
21I60      18
21I30      13
21I40      10
21I20       3
3B20        1
21I70       1
Name: Beat, Length: 119, dtype: int64

In [53]:
unk_beat = df15.Beat == 'UNK'  # boolean mask
len(df15[unk_beat])

750

In [54]:
df15[unk_beat].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
10064,2015-04-03,UNK,14400-14499,SCOTT,Robbery,Service or Gas Station,1,12
10065,2015-04-10,UNK,13800-13899,ALMEDA,Theft,Other Parking Lot,1,6
10066,2015-04-18,UNK,10900-10999,GESSNER,Burglary,Apartment,1,23
10067,2015-04-23,UNK,7600-7699,EAST,Theft,Service or Gas Station,1,22
10068,2015-04-22,UNK,8300-8399,WILD ROSE,Auto Theft,"Road, Street, or Sidewalk",1,15


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [55]:
df15.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2015-02-26,10H10,4900-4999,CANAL,Theft,"Road, Street, or Sidewalk",1,12
1,2015-04-05,10H10,100-199,YORK,Burglary,Residence or House,1,16
2,2015-04-06,10H10,UNK,CANAL,Rape,Residence or House,1,20
3,2015-04-16,10H10,400-499,ENNIS,Theft,Construction Site,1,8
4,2015-04-01,10H10,UNK,SIDNEY,Theft,,1,19


In [56]:
df15['Date'] = pd.to_datetime(df15['Date'])

df15 = df15.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [57]:
df15.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1915-01-05,3B10,3200-3299,MANGUM RD 180,Theft,Other Parking Lot,1,22
1915-01-14,5F10,7000-7099,WESTVIEW,Auto Theft,Apartment Parking Lot,1,23
1915-04-24,1A20,3100-3199,SMITH,Burglary,Office Building,1,7
1915-07-30,11H10,5600-5699,TRUETT,Rape,"Other, Unknown, or Not Listed",1,10
1915-09-03,19G20,11700-11799,BEECHNUT,Burglary,Rental Storage Facility,1,13


In [58]:
df15.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120009 entries, 1915-01-05 to 2015-12-31
Data columns (total 7 columns):
Beat            120009 non-null object
BlockRange      120009 non-null object
StreetName      119981 non-null object
Offense Type    120001 non-null object
Premise         117881 non-null object
# offenses      120009 non-null int64
Hour            120009 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.3+ MB


## Odd dates
`DatetimeIndex: 120009 entries, 1915-01-05 to 2015-12-31`

- some values are not from this year, lets look

In [59]:
df2015 = df15.loc['2015-01-01':'2015-12-31']  # rows with date from 01,01,15 - 12,31,15

In [60]:
df2015_wrong_date = df15[:"2014"]  # rows with year 0  upto 2014
df2015_wrong_date.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 973 entries, 1915-01-05 to 2014-12-31
Data columns (total 7 columns):
Beat            973 non-null object
BlockRange      973 non-null object
StreetName      972 non-null object
Offense Type    971 non-null object
Premise         870 non-null object
# offenses      973 non-null int64
Hour            973 non-null int64
dtypes: int64(2), object(5)
memory usage: 60.8+ KB


In [61]:
df2015.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 119036 entries, 2015-01-01 to 2015-12-31
Data columns (total 7 columns):
Beat            119036 non-null object
BlockRange      119036 non-null object
StreetName      119009 non-null object
Offense Type    119030 non-null object
Premise         117011 non-null object
# offenses      119036 non-null int64
Hour            119036 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.3+ MB


# NAN values

- Premise nan : 2025 rows
- streetName nan : 27 rows
- Offense Type nan : 6

In [62]:
premise_nan = df2015.Premise.isnull()  #
str_nan = df2015.StreetName.isnull()
off_nan = df2015['Offense Type'].isnull()

## DROP nan
drop nan values of StreetName

In [63]:
df2015.head()

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01,9C40,12700-12799,BERTHAS,Theft,Driveway,1,11
2015-01-01,2A10,1000-1099,QUITMAN,Theft,Grocery Store or Supermarket,1,13
2015-01-01,2A10,2700-2799,CHAPMAN,Aggravated Assault,Residence or House,1,6
2015-01-01,10H50,3700-3799,BURKETT,Burglary,Residence or House,1,10
2015-01-01,1A30,2600-2699,SHEPHERD,Burglary,"Electronics Store, Electrical Supplies",1,5


In [64]:
df2015 = df2015.dropna(subset=['StreetName'])  # drop nan values from StreetName, 27 rows

In [65]:
df2015.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 119009 entries, 2015-01-01 to 2015-12-31
Data columns (total 7 columns):
Beat            119009 non-null object
BlockRange      119009 non-null object
StreetName      119009 non-null object
Offense Type    119003 non-null object
Premise         116984 non-null object
# offenses      119009 non-null int64
Hour            119009 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.3+ MB


In [66]:
df2015['Premise'] = df2015['Premise'].fillna('unk')

In [67]:
df2015.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 119009 entries, 2015-01-01 to 2015-12-31
Data columns (total 7 columns):
Beat            119009 non-null object
BlockRange      119009 non-null object
StreetName      119009 non-null object
Offense Type    119003 non-null object
Premise         119009 non-null object
# offenses      119009 non-null int64
Hour            119009 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.3+ MB


In [68]:
df2015['Offense Type'].value_counts(dropna=False)

Theft                 65719
Burglary              19706
Auto Theft            13010
Robbery               10262
Aggravated Assault     9171
Rape                    863
Murder                  272
NaN                       6
Name: Offense Type, dtype: int64

## Fillna
will use this since Dont know what to do... :/

In [69]:
df2015['Offense Type'].fillna(method='ffill', inplace=True)

In [70]:
df2015.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 119009 entries, 2015-01-01 to 2015-12-31
Data columns (total 7 columns):
Beat            119009 non-null object
BlockRange      119009 non-null object
StreetName      119009 non-null object
Offense Type    119009 non-null object
Premise         119009 non-null object
# offenses      119009 non-null int64
Hour            119009 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.3+ MB


## Save clean data to  to csv

In [71]:
df2015.to_csv('crime_data_clean/crime15_clean.csv')

In [72]:
ls crime_data_clean/

[0m[01;32mcrime15_clean.csv[0m*  [01;32mcrime16_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*
