In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data/2014

[0m[01;32mapr14.xls[0m*  [01;32mdec14.xls[0m*  [01;32mjan14.xls[0m*  [01;32mjun14.xls[0m*  [01;32mmay14.xls[0m*  [01;32moct14.xls[0m*
[01;32maug14.xls[0m*  [01;32mfeb14.xls[0m*  [01;32mjul14.xls[0m*  [01;32mmar14.xls[0m*  [01;32mnov14.xls[0m*  [01;32msep14.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data/2014'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121750 entries, 0 to 121749
Data columns (total 15 columns):
# Of             7613 non-null float64
# Of Offenses    52214 non-null float64
# Offenses       10319 non-null float64
# offenses       51604 non-null float64
Beat             121734 non-null object
Block Range      69989 non-null object
BlockRange       51604 non-null object
Date             121746 non-null datetime64[ns]
Hour             121750 non-null object
Offense Type     121750 non-null object
Premise          92169 non-null object
Street Name      70146 non-null object
StreetName       51604 non-null object
Suffix           121750 non-null object
Type             121750 non-null object
dtypes: datetime64[ns](1), float64(4), object(10)
memory usage: 13.9+ MB


In [5]:
df.head(5)

Unnamed: 0,# Of,# Of Offenses,# Offenses,# offenses,Beat,Block Range,BlockRange,Date,Hour,Offense Type,Premise,Street Name,StreetName,Suffix,Type
0,,1.0,,,14D30,8500-8599,,2014-04-19,17,Murder,18D,MARTIN LUTHER KING,,-,BLVD
1,,1.0,,,10H20,3600-3699,,2014-04-28,5,Murder,13R,MCKINNEY,,-,ST
2,,3.0,,,5F30,7400-7499,,2014-04-27,18,Murder,20A,HILLMONT,,-,-
3,,2.0,,,17E10,5400-5499,,2014-04-09,18,Murder,20A,RENWICK,,-,-
4,,1.0,,,18F50,9300-9399,,2014-04-24,3,Murder,03B,RICHMOND,,-,AVE


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['# Of', '# Of Offenses', '# Offenses', '# offenses', 'Beat',
       'Block Range', 'BlockRange', 'Date', 'Hour', 'Offense Type', 'Premise',
       'Street Name', 'StreetName', 'Suffix', 'Type'],
      dtype='object')

## Duplicate columns
We have duplicate columns with diffrent names
- '# Of','# Of Offenses', '# Offenses', '# offenses'
- 'Block Range', 'BlockRange'
- 'Street Name', 'StreetName'


lets concat the columns remove nan values and reindex

In [8]:
df1['BlockRange'] = pd.concat([df1['Block Range'].dropna(), df1['BlockRange'].dropna()]).reindex_like(df1)
df1['StreetName'] = pd.concat([df1['Street Name'].dropna(), df1['StreetName'].dropna()]).reindex_like(df1)
df1['# offenses'] = pd.concat([df1['# Of'].dropna(),df1['# Of Offenses'].dropna(),df1['# offenses'].dropna(), df1['# Offenses'].dropna()]).reindex_like(df1)

In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121750 entries, 0 to 121749
Data columns (total 15 columns):
# Of             7613 non-null float64
# Of Offenses    52214 non-null float64
# Offenses       10319 non-null float64
# offenses       121750 non-null float64
Beat             121734 non-null object
Block Range      69989 non-null object
BlockRange       121593 non-null object
Date             121746 non-null datetime64[ns]
Hour             121750 non-null object
Offense Type     121750 non-null object
Premise          92169 non-null object
Street Name      70146 non-null object
StreetName       121750 non-null object
Suffix           121750 non-null object
Type             121750 non-null object
dtypes: datetime64[ns](1), float64(4), object(10)
memory usage: 13.9+ MB


In [10]:
df1.head(5)

Unnamed: 0,# Of,# Of Offenses,# Offenses,# offenses,Beat,Block Range,BlockRange,Date,Hour,Offense Type,Premise,Street Name,StreetName,Suffix,Type
0,,1.0,,1.0,14D30,8500-8599,8500-8599,2014-04-19,17,Murder,18D,MARTIN LUTHER KING,MARTIN LUTHER KING,-,BLVD
1,,1.0,,1.0,10H20,3600-3699,3600-3699,2014-04-28,5,Murder,13R,MCKINNEY,MCKINNEY,-,ST
2,,3.0,,3.0,5F30,7400-7499,7400-7499,2014-04-27,18,Murder,20A,HILLMONT,HILLMONT,-,-
3,,2.0,,2.0,17E10,5400-5499,5400-5499,2014-04-09,18,Murder,20A,RENWICK,RENWICK,-,-
4,,1.0,,1.0,18F50,9300-9399,9300-9399,2014-04-24,3,Murder,03B,RICHMOND,RICHMOND,-,AVE


## create a subdataframe with the columns that we want

In [12]:
df14 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [13]:
df14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121750 entries, 0 to 121749
Data columns (total 8 columns):
Date            121746 non-null datetime64[ns]
Beat            121734 non-null object
BlockRange      121593 non-null object
StreetName      121750 non-null object
Offense Type    121750 non-null object
Premise         92169 non-null object
# offenses      121750 non-null float64
Hour            121750 non-null object
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 7.4+ MB


In [14]:
df14.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
121745,2014-09-15,UNK,2400-2499,LEE,Burglary,Residence or House,1.0,21
121746,2014-09-18,UNK,7900-7999,SAM HOUSTON,Theft,Apartment Parking Lot,1.0,18
121747,2014-09-19,UNK,2700-2799,YALE,Burglary,Restaurant or Cafeteria Parking Lot,1.0,3
121748,2014-09-18,UNK,9300-9399,GESSNER,Burglary,Barber And Beauty Shops,1.0,7
121749,2014-09-28,UNK,7900-7999,GESSNER,Theft,,1.0,2


## Now we can inspect df

In [16]:
df14.Date.unique() ##  timestamp 

array(['2014-04-19T00:00:00.000000000', '2014-04-28T00:00:00.000000000',
       '2014-04-27T00:00:00.000000000', '2014-04-09T00:00:00.000000000',
       '2014-04-24T00:00:00.000000000', '2014-04-15T00:00:00.000000000',
       '2014-04-17T00:00:00.000000000', '2014-04-05T00:00:00.000000000',
       '2014-04-20T00:00:00.000000000', '2014-04-22T00:00:00.000000000',
       '2006-10-02T00:00:00.000000000', '2014-04-14T00:00:00.000000000',
       '2012-04-29T00:00:00.000000000', '2014-04-18T00:00:00.000000000',
       '2006-11-28T00:00:00.000000000', '2014-02-15T00:00:00.000000000',
       '2012-04-26T00:00:00.000000000', '2010-06-12T00:00:00.000000000',
       '2014-04-16T00:00:00.000000000', '2014-04-21T00:00:00.000000000',
       '2014-04-04T00:00:00.000000000', '2013-07-14T00:00:00.000000000',
       '2011-06-06T00:00:00.000000000', '2002-01-01T00:00:00.000000000',
       '2014-04-29T00:00:00.000000000', '2014-04-26T00:00:00.000000000',
       '2014-04-23T00:00:00.000000000', '2014-03-21

In [18]:
df14.Beat.unique()  ## 

array(['14D30', '10H20', '5F30', '17E10', '18F50', '10H70', '19G20',
       '2A10', '17E30', '2A60', '5F10', '16E20', '17E20', '19G40',
       '19G10', '3B40', '10H50', '10H40', '16E30', '15E40', '6B50',
       '6B40', '18F30', '2A40', '10H10', '3B30', '11H20', '12D70', '8C10',
       '3B10', '14D40', '14D20', '1A50', '11H10', '9C40', '3B50', '2A20',
       '7C30', '20G40', '1A30', '2A30', '6B60', '18F60', '1A20', '4F10',
       '15E30', '12D30', '4F20', '19G50', '8C60', '15E20', '17E40',
       '9C20', '14D10', '2A50', '1A10', '6B30', '7C40', '19G30', '11H30',
       '9C30', '7C50', '20G10', '13D20', '13D10', '18F40', '20G30',
       '6B10', '7C20', '12D40', '20G20', '16E40', '20G50', '12D20',
       '5F40', '14D50', '13D30', '18F20', '12D10', '20G60', '15E10',
       '5F20', '20G80', '7C10', '4F30', '6B20', '10H60', '8C20', '8C40',
       '13D40', '16E10', '8C50', '8C30', '24C20', '24C50', '9C10',
       '20G70', '24C10', '12D60', 'UNK', '10H30', '11H50', '24C30',
       '12D50', '11

In [19]:
df14.Beat.value_counts(dropna=False)  # needs cleanup

19G10     2653
1A20      2570
6B60      2520
13D20     2447
1A10      2443
12D10     2370
17E10     2267
3B10      2047
18F30     2038
17E40     2030
18F20     2018
6B10      1991
14D20     1976
5F30      1911
2A50      1902
18F40     1834
20G30     1827
1A30      1777
18F50     1765
7C20      1760
5F40      1759
15E40     1741
20G50     1716
3B50      1712
11H10     1690
1A50      1684
9C40      1605
6B30      1546
17E30     1536
14D10     1477
          ... 
'16E10       4
'9C10        3
21I40        3
'2A40        3
'11H50       3
'UNK         3
'4F10        3
'16E40       3
'24C10       3
'3B30        3
'5F10        3
'10H20       3
23J40        3
'24C30       2
'21I50       2
'12D40       2
'14D50       2
'21I10       2
23J30        2
'11H20       2
UH-3P        2
'24C50       1
'8C40        1
'8C50        1
'24C40       1
'24C20       1
'16E30       1
'1A40        1
'11H40       1
'23J50       1
Name: Beat, Length: 235, dtype: int64

In [20]:
df14.BlockRange.unique()  # UNK val

array(['8500-8599', '3600-3699', '7400-7499', '5400-5499', '9300-9399',
       '2100-2199', '11100-11199', '10100-10199', '9000-9099',
       '1700-1799', '1800-1899', '13100-13199', '7100-7199', '9700-9799',
       '8800-8899', '500-599', '8700-8799', '2800-2899', '4500-4599',
       '14700-14799', '2600-2699', '7200-7299', '100-199', '1300-1399',
       '6500-6599', '6400-6499', '1400-1499', '3000-3099', '400-499',
       '7300-7399', '4000-4099', '15200-15299', '4400-4499', '4200-4299',
       '12800-12899', '3800-3899', '11700-11799', '2900-2999',
       '6700-6799', '4800-4899', '5900-5999', '1200-1299', '4700-4799',
       '11500-11599', '7500-7599', '1000-1099', '2400-2499', '6200-6299',
       '2700-2799', '1900-1999', '1100-1199', '13700-13799', '9400-9499',
       '9800-9899', '700-799', '9600-9699', '800-899', '5600-5699',
       '6600-6699', '12600-12699', '11400-11499', '8600-8699',
       '9900-9999', '7900-7999', '3200-3299', '12300-12399', '6000-6099',
       '6300-6399

In [22]:
df14.BlockRange.value_counts(dropna=False)  #  UNK ??

100-199          1880
2300-2399        1671
900-999          1636
1000-1099        1415
4400-4499        1412
700-799          1407
2400-2499        1382
7900-7999        1363
800-899          1341
300-399          1310
9400-9499        1299
200-299          1297
9500-9599        1262
7500-7599        1231
2700-2799        1211
5000-5099        1195
500-599          1184
1500-1599        1176
600-699          1168
1300-1399        1165
2500-2599        1139
6100-6199        1137
3800-3899        1118
5800-5899        1115
1100-1199        1113
1400-1499        1110
6400-6499        1109
2100-2199        1107
6000-6099        1099
5100-5199        1089
                 ... 
29800-29899         3
19300-19399         3
19800-19899         3
24900-24999         2
21500-21599         2
20000-20099         2
21400-21499         2
22000-22099         2
20500-20599         2
25800-25899         2
19900-19999         2
23100-23199         2
25500-25599         1
100000-100099       1
22800-2289

In [24]:
df14.StreetName.unique()  # 

array(['MARTIN LUTHER KING', 'MCKINNEY', 'HILLMONT', ..., 'ELANA',
       'JOHNSTON', 'TIMOTHY'], dtype=object)

In [26]:
df14.StreetName.value_counts(dropna=False)  # 

WESTHEIMER                  3458
NORTH                       1790
GULF                        1756
GESSNER                     1680
RICHMOND                    1628
KATY                        1265
BELLAIRE                    1225
SOUTHWEST                   1207
MAIN                        1203
BISSONNET                   1194
NORTHWEST                   1032
FM 1960                      998
BEECHNUT                     958
GREENS                       951
FONDREN                      933
BELLFORT                     923
BROADWAY                     921
SAM HOUSTON                  888
TIDWELL                      858
POST OAK                     842
LITTLE YORK                  835
EAST                         809
WAYSIDE                      801
LOOP                         798
SHEPHERD                     768
AIRLINE                      687
HILLCROFT                    684
FANNIN                       611
KIRBY                        602
CULLEN                       539
          

In [29]:
df14['Offense Type'].unique()  # clean

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [28]:
df14['Offense Type'].value_counts(dropna=False) # preatty clean

Theft                 66675
Burglary              21145
Auto Theft            13454
Robbery                9998
Aggravated Assault     9460
Rape                    788
Murder                  230
Name: Offense Type, dtype: int64

In [31]:
df14.Premise.unique()  #lots of num& letter, & nan values

array(['18D', '13R', '20A', '03B', '250', '20D', '20N', '18A', '20R',
       '20C', '150', '18N', '18P', '140', '18G', '22H', '18B', '18O',
       '070', '18T', '210', '13S', '13B', '18M', '190', '11R', '05L',
       '18U', '18C', '080', '24E', '24C', '23S', '100', '18R', '05O',
       '03S', '120', '05Z', '18H', '05N', '01B', '05F', '20M', '05D',
       '240', '18S', '24A', '13H', '11P', '20P', '13A', '20L', '18W',
       '05C', '05M', '170', '05B', '09P', '---', '20G', '09D', '040',
       '05E', '05Q', '24P', '24G', '05W', '20V', '05U', '05V', '060',
       '22M', '24F', '05Y', '05R', '25V', '11G', '05X', '05G', '11L',
       '22E', '11F', '04V', '09H', '24S', '02B', '22C', '22P', '01A',
       '22U', '01K', '02S', '11S', '01P', '05S', '05H', '05T', '06',
       '22D', '24B', '05A', '24J', '24M', '24T', '02C', '24V', '23C', 'N',
       '09R', '05P', '01T', '13T', '18L', 'Residence or House',
       'Driveway', 'Apartment Parking Lot', nan,
       'Bar or Night Club Parking Lot', 'Co

In [33]:
df14.Premise.value_counts(dropna=False)  # LOTS NAN values!!!

NaN                                          29581
20R                                           6410
Residence or House                            6384
20A                                           6094
18A                                           6072
Apartment                                     4786
Apartment Parking Lot                         4517
13R                                           3836
Road, Street, or Sidewalk                     3465
20D                                           3222
18O                                           2999
18R                                           1757
080                                           1702
Driveway                                      1665
120                                           1411
Restaurant or Cafeteria Parking Lot           1411
Department or Discount Store                  1277
18P                                           1181
23S                                           1104
Other Parking Lot              

In [36]:
df14['# offenses'].unique()

array([ 1.,  3.,  2.,  4.,  6.,  5., 12., 29.,  7.])

In [37]:
df14['# offenses'].value_counts(dropna=False)  # 2 locations with 10 offenses?

1.0     119966
2.0       1560
3.0        176
4.0         27
5.0         11
6.0          7
7.0          1
12.0         1
29.0         1
Name: # offenses, dtype: int64

In [39]:
df14.Hour.unique()  ## this needs cleanup!!!

array([17, 5, 18, 3, 6, 21, 13, 14, 2, 23, 12, 0, 8, 19, 20, 1, 9, 16, 15,
       10, 22, 7, 11, 4, 24, '07', '05', '22', '15', '10', '19', '09',
       '08', '14', "'15", '02', '12', "'07", '16', "'19", "'18", '21',
       '17', '11', '13', '18', '06', "'00", "'05", "'13", "'14", "'22",
       "'23", '04', '03', '23', '20', '00', "'20", "'16", '01', "'17",
       "'06", "'03", "'01", "'09", "'02", "'10", "'11", "'12", "'21",
       "'08", "'04"], dtype=object)

In [41]:
df14.Hour.value_counts(dropna=False)  #strip the string chars and keep only nums

18     6665
15     6615
16     6553
17     6430
12     6415
19     6192
14     6181
20     5899
13     5889
10     5732
11     5686
9      5313
21     5291
22     5003
8      4927
0      4446
23     4071
7      4001
6      2643
1      2420
2      2414
3      1910
5      1825
4      1615
14      412
15      410
09      405
16      400
11      383
12      377
       ... 
02      144
05      129
03      117
01      115
04       94
'00      91
'12      73
'18      69
'22      64
'17      56
'16      53
'20      51
'23      48
'19      48
'21      48
'14      44
'08      43
'15      43
'10      40
'13      38
'11      33
'07      29
'09      25
'02      25
'01      24
'06      17
'05      16
'03      12
'04      10
24        1
Name: Hour, Length: 73, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces : NOT NEEDED

In [42]:
len(df14.Premise.unique())

251

In [43]:
df14['Premise'] = df14['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
len(df14.Premise.unique())

251

In [50]:
df14[df14.Premise.isnull()].head()  # 29581

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
10412,2014-08-08,10H10,5300-5399,HARRISBURG,Aggravated Assault,,1.0,18
10419,2014-08-23,10H10,UNK,ENNIS,Theft,,1.0,0
10420,2014-08-25,10H10,100-199,LOCKWOOD,Theft,,1.0,12
10423,2014-08-02,10H10,200-299,ROBERTS,Theft,,1.0,18
10448,2014-07-27,10H10,3900-3999,BERING,Aggravated Assault,,1.0,15


In [51]:
df14.Premise.value_counts(dropna=False).head()  # LOTS NANS

NaN                   29581
20R                    6410
Residence or House     6384
20A                    6094
18A                    6072
Name: Premise, dtype: int64

# Cleanup
## Offense Type Column
superclean: no need to touch

In [52]:
df14['Offense Type'].value_counts(dropna=False)

Theft                 66675
Burglary              21145
Auto Theft            13454
Robbery                9998
Aggravated Assault     9460
Rape                    788
Murder                  230
Name: Offense Type, dtype: int64

In [53]:
df14['Offense Type'].unique()

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [54]:
len(df14['Offense Type'].unique())

7

In [55]:
df14['Offense Type'] = df14['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [56]:
df14['Offense Type'].value_counts(dropna=False)

Theft                 66675
Burglary              21145
Auto Theft            13454
Robbery                9998
Aggravated Assault     9460
Rape                    788
Murder                  230
Name: Offense Type, dtype: int64

In [57]:
len(df14['Offense Type'].unique())

7

# Cleanup
## StreetName Column

- strip empty spaces
- find similar values and combine ( needs done)

In [59]:
df14.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3458
NORTH         1790
GULF          1756
GESSNER       1680
RICHMOND      1628
Name: StreetName, dtype: int64

In [60]:
len(df14.StreetName.unique())

7970

In [61]:
df14['StreetName'] = df14['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [62]:
len(df14.StreetName.unique())

7966

In [67]:
df14[df14.StreetName.isnull()].head()  # 24 nan values

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
15965,2014-08-06,1A20,UNK,,Theft,,1.0,17
17617,2014-07-21,2A30,UNK,,Theft,Driveway,1.0,10
17631,2014-08-15,2A30,UNK,,Burglary,Garage or Carport,1.0,6
17689,2014-08-15,2A30,UNK,,Auto Theft,Apartment Parking Lot,1.0,15
17712,2014-08-21,2A30,UNK,,Theft,Residence or House,1.0,12


In [68]:
df14.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3458
NORTH         1790
GULF          1756
GESSNER       1680
RICHMOND      1628
Name: StreetName, dtype: int64

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [69]:
df14.BlockRange.value_counts(dropna=False).head()  # find UNK

100-199      1880
2300-2399    1671
900-999      1636
1000-1099    1415
4400-4499    1412
Name: BlockRange, dtype: int64

In [70]:
unk = df14.BlockRange == 'UNK'  # boolean mask

In [73]:
df14[unk].head()  # 805 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
10419,2014-08-23,10H10,UNK,ENNIS,Theft,,1.0,0
10426,2014-08-23,10H10,UNK,ENNIS,Theft,Other Parking Lot,1.0,0
10666,2014-08-25,10H40,UNK,FANNIN,Theft,"Road, Street, or Sidewalk",1.0,20
10674,2014-06-28,10H40,UNK,MCGOWEN,Theft,Bar or Night Club,1.0,20
10833,2014-08-25,10H60,UNK,2 CHARLESTON,Theft,Residence or House,1.0,23


# Cleanup
## Beat Column

- strip empty spaces

In [74]:
df14.Beat.unique()

array(['14D30', '10H20', '5F30', '17E10', '18F50', '10H70', '19G20',
       '2A10', '17E30', '2A60', '5F10', '16E20', '17E20', '19G40',
       '19G10', '3B40', '10H50', '10H40', '16E30', '15E40', '6B50',
       '6B40', '18F30', '2A40', '10H10', '3B30', '11H20', '12D70', '8C10',
       '3B10', '14D40', '14D20', '1A50', '11H10', '9C40', '3B50', '2A20',
       '7C30', '20G40', '1A30', '2A30', '6B60', '18F60', '1A20', '4F10',
       '15E30', '12D30', '4F20', '19G50', '8C60', '15E20', '17E40',
       '9C20', '14D10', '2A50', '1A10', '6B30', '7C40', '19G30', '11H30',
       '9C30', '7C50', '20G10', '13D20', '13D10', '18F40', '20G30',
       '6B10', '7C20', '12D40', '20G20', '16E40', '20G50', '12D20',
       '5F40', '14D50', '13D30', '18F20', '12D10', '20G60', '15E10',
       '5F20', '20G80', '7C10', '4F30', '6B20', '10H60', '8C20', '8C40',
       '13D40', '16E10', '8C50', '8C30', '24C20', '24C50', '9C10',
       '20G70', '24C10', '12D60', 'UNK', '10H30', '11H50', '24C30',
       '12D50', '11

In [75]:
len(df14.Beat.unique())

235

In [76]:
df14['Beat'] = df14['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
len(df14.Beat.unique())

235

In [78]:
df14.Beat.value_counts(dropna=False)

19G10     2653
1A20      2570
6B60      2520
13D20     2447
1A10      2443
12D10     2370
17E10     2267
3B10      2047
18F30     2038
17E40     2030
18F20     2018
6B10      1991
14D20     1976
5F30      1911
2A50      1902
18F40     1834
20G30     1827
1A30      1777
18F50     1765
7C20      1760
5F40      1759
15E40     1741
20G50     1716
3B50      1712
11H10     1690
1A50      1684
9C40      1605
6B30      1546
17E30     1536
14D10     1477
          ... 
'16E10       4
'9C10        3
21I40        3
'2A40        3
'11H50       3
'UNK         3
'4F10        3
'16E40       3
'24C10       3
'3B30        3
'5F10        3
'10H20       3
23J40        3
'24C30       2
'21I50       2
'12D40       2
'14D50       2
'21I10       2
23J30        2
'11H20       2
UH-3P        2
'24C50       1
'8C40        1
'8C50        1
'24C40       1
'24C20       1
'16E30       1
'1A40        1
'11H40       1
'23J50       1
Name: Beat, Length: 235, dtype: int64

In [81]:
df14[df14['Beat'].isnull()].head() # 16 nan values

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
52628,2014-07-19,,2100-2199,VINTAGE,Theft,,1.0,11
52629,2014-07-13,,500-599,W.38TH,Theft,,1.0,8
52630,2014-07-26,,6100-6199,ATTWATER,Rape,,1.0,15
52631,2014-07-13,,7800-7899,FM 1960,Theft,,1.0,15
52632,2014-07-14,,6600-6699,MERRY,Theft,,1.0,9


## Cleanup

### time column


In [83]:
df14.Hour.unique()

array([17, 5, 18, 3, 6, 21, 13, 14, 2, 23, 12, 0, 8, 19, 20, 1, 9, 16, 15,
       10, 22, 7, 11, 4, 24, '07', '05', '22', '15', '10', '19', '09',
       '08', '14', "'15", '02', '12', "'07", '16', "'19", "'18", '21',
       '17', '11', '13', '18', '06', "'00", "'05", "'13", "'14", "'22",
       "'23", '04', '03', '23', '20', '00', "'20", "'16", '01', "'17",
       "'06", "'03", "'01", "'09", "'02", "'10", "'11", "'12", "'21",
       "'08", "'04"], dtype=object)

In [133]:
df14.Hour.value_counts(dropna=False)  # too much trouble to clean?

18     6665
15     6615
16     6553
17     6430
12     6415
19     6192
14     6181
20     5899
13     5889
10     5732
11     5686
9      5313
21     5291
22     5003
8      4927
0      4446
23     4071
7      4001
6      2643
1      2420
2      2414
3      1910
5      1825
4      1615
14      412
15      410
09      405
16      400
11      383
12      377
       ... 
02      144
05      129
03      117
01      115
04       94
'00      91
'12      73
'18      69
'22      64
'17      56
'16      53
'20      51
'23      48
'19      48
'21      48
'14      44
'08      43
'15      43
'10      40
'13      38
'11      33
'07      29
'09      25
'02      25
'01      24
'06      17
'05      16
'03      12
'04      10
24        1
Name: Hour, Length: 73, dtype: int64

## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [113]:
df14.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2014-04-19,14D30,8500-8599,MARTIN LUTHER KING,Murder,18D,1.0,17
1,2014-04-28,10H20,3600-3699,MCKINNEY,Murder,13R,1.0,5
2,2014-04-27,5F30,7400-7499,HILLMONT,Murder,20A,3.0,18
3,2014-04-09,17E10,5400-5499,RENWICK,Murder,20A,2.0,18
4,2014-04-24,18F50,9300-9399,RICHMOND,Murder,03B,1.0,3


In [114]:
df14['Date'] = pd.to_datetime(df14['Date'])

df14 = df14.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [115]:
df14.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1914-09-08,24C60,12700-12799,LAKE HOUSTON,Burglary,Restaurant or Cafeteria,1.0,7
1914-11-02,18F60,8800-8899,BELLAIRE,Burglary,Miscellaneous Business (Non-Specific),1.0,3
1914-12-03,12D20,12800-12899,GULF,Auto Theft,,1.0,19
1920-06-28,17E40,12100-12199,FAIRMEADOW,Theft,,1.0,16
1924-12-09,UNK,11000-11099,EASTEX,Theft,Strip Business Center Parking Lot,1.0,11


In [116]:
df14.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 121750 entries, 1914-09-08 to NaT
Data columns (total 7 columns):
Beat            121734 non-null object
BlockRange      121593 non-null object
StreetName      121726 non-null object
Offense Type    121750 non-null object
Premise         92169 non-null object
# offenses      121750 non-null float64
Hour            121750 non-null object
dtypes: float64(1), object(6)
memory usage: 7.4+ MB


## Odd dates
`DatetimeIndex: 121750 entries, 1914-09-08 to NaT`

- some values are not from this year, lets look

In [117]:
df2014 = df14.loc['2014-01-01':'2014-12-31']  # rows with date from 01,01,14 - 12,31,14

In [118]:
df2014_wrong_date = df14[:"2013"]  # rows with year 0  upto 2014

In [119]:
df2014.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120520 entries, 2014-01-01 to 2014-12-31
Data columns (total 7 columns):
Beat            120506 non-null object
BlockRange      120367 non-null object
StreetName      120496 non-null object
Offense Type    120520 non-null object
Premise         91064 non-null object
# offenses      120520 non-null float64
Hour            120520 non-null object
dtypes: float64(1), object(6)
memory usage: 7.4+ MB


# NAN values

- Beat: 14
- BlockRange: 153
- StreetName: 24
- Offense Type: 0
- Premise: 29456
- Hour: 0

In [123]:
beat_nan = df2014.Beat.isnull()
block_nan = df2014.BlockRange.isnull()
str_nan = df2014.StreetName.isnull()
off_nan = df2014['Offense Type'].isnull()
premise_nan = df2014.Premise.isnull()  #
hour_nan = df2014.Hour.isnull()  #

In [130]:
len(df2014[hour_nan])

0

## DROP nan
drop nan values of StreetName

In [134]:
df2014.head()

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-01,19G10,7400-7499,WEST SAM HOUSTON,Theft,18O,1.0,12
2014-01-01,19G10,7400-7499,WEST SAM HOUSTON,Theft,11R,1.0,11
2014-01-01,1A10,1500-1599,PIERCE,Robbery,13R,1.0,6
2014-01-01,19G10,7400-7499,WEST SAM HOUSTON,Theft,18O,1.0,11
2014-01-01,4F10,9500-9599,LONG POINT,Aggravated Assault,20A,1.0,10


In [135]:
df2014 = df2014.dropna(subset=['StreetName'])  # drop nan values from StreetName,

In [136]:
df2014.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120496 entries, 2014-01-01 to 2014-12-31
Data columns (total 7 columns):
Beat            120482 non-null object
BlockRange      120346 non-null object
StreetName      120496 non-null object
Offense Type    120496 non-null object
Premise         91047 non-null object
# offenses      120496 non-null float64
Hour            120496 non-null object
dtypes: float64(1), object(6)
memory usage: 7.4+ MB


In [137]:
df2014['Premise'] = df2014['Premise'].fillna('unk')

In [140]:
df2014.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120496 entries, 2014-01-01 to 2014-12-31
Data columns (total 7 columns):
Beat            120482 non-null object
BlockRange      120346 non-null object
StreetName      120496 non-null object
Offense Type    120496 non-null object
Premise         120496 non-null object
# offenses      120496 non-null float64
Hour            120496 non-null object
dtypes: float64(1), object(6)
memory usage: 7.4+ MB


## Final result

- Beat            120482 non-null object
- BlockRange      120346 non-null object

- Hour column still has extra quotes that needs to be removed



will leave as is , can be fill with similar data from other years

## Save clean data to  to csv

In [141]:
df2014.to_csv('clean_data/crime_data/crime14_clean.csv')

In [142]:
ls clean_data/crime_data/

[0m[01;32mcrime14_clean.csv[0m*  [01;32mcrime15_clean.csv[0m*  [01;32mcrime16_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*
