In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data_raw/2013

[0m[01;32mapr13.xls[0m*  [01;32mdec13.xls[0m*  [01;32mjan13.xls[0m*  [01;32mjun13.xls[0m*  [01;32mmay13.xls[0m*  [01;32moct13.xls[0m*
[01;32maug13.xls[0m*  [01;32mfeb13.xls[0m*  [01;32mjul13.xls[0m*  [01;32mmar13.xls[0m*  [01;32mnov13.xls[0m*  [01;32msep13.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data_raw/2013'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130459 entries, 0 to 130458
Data columns (total 11 columns):
# Of Offenses    130459 non-null int64
Beat             130459 non-null object
Block Range      130459 non-null object
Date             130459 non-null datetime64[ns]
Hour             130459 non-null int64
Offense Type     130459 non-null object
Premise          130459 non-null object
Street Name      130459 non-null object
Suffix           130459 non-null object
Type             130459 non-null object
Unnamed: 1       0 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 10.9+ MB


In [5]:
df.head(5)

Unnamed: 0,# Of Offenses,Beat,Block Range,Date,Hour,Offense Type,Premise,Street Name,Suffix,Type,Unnamed: 1
0,1,19G10,9400-9499,2013-04-25,5,Murder,13R,WOODFAIR,-,DR,
1,1,17E40,6100-6199,2013-04-14,0,Murder,18A,BELLFORT,W,ST,
2,1,20G10,9900-9999,2013-04-02,16,Murder,20A,RICHMOND,-,AVE,
3,1,2A20,1300-1399,2013-04-19,22,Murder,13R,29TH,E,ST,
4,1,1A10,500-599,2013-04-23,0,Murder,190,RUSK,-,-,


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['# Of Offenses', 'Beat', 'Block Range', 'Date', 'Hour', 'Offense Type',
       'Premise', 'Street Name', 'Suffix', 'Type', 'Unnamed: 1'],
      dtype='object')

In [8]:
df1.head(5)

Unnamed: 0,# Of Offenses,Beat,Block Range,Date,Hour,Offense Type,Premise,Street Name,Suffix,Type,Unnamed: 1
0,1,19G10,9400-9499,2013-04-25,5,Murder,13R,WOODFAIR,-,DR,
1,1,17E40,6100-6199,2013-04-14,0,Murder,18A,BELLFORT,W,ST,
2,1,20G10,9900-9999,2013-04-02,16,Murder,20A,RICHMOND,-,AVE,
3,1,2A20,1300-1399,2013-04-19,22,Murder,13R,29TH,E,ST,
4,1,1A10,500-599,2013-04-23,0,Murder,190,RUSK,-,-,


## Rename columns

In [9]:
df1.rename(columns={
    '# Of Offenses': '# offenses',
    'Block Range': 'BlockRange',
    'Street Name': 'StreetName',}, inplace=True)

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130459 entries, 0 to 130458
Data columns (total 11 columns):
# offenses      130459 non-null int64
Beat            130459 non-null object
BlockRange      130459 non-null object
Date            130459 non-null datetime64[ns]
Hour            130459 non-null int64
Offense Type    130459 non-null object
Premise         130459 non-null object
StreetName      130459 non-null object
Suffix          130459 non-null object
Type            130459 non-null object
Unnamed: 1      0 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 10.9+ MB


In [11]:
df1.head(5)

Unnamed: 0,# offenses,Beat,BlockRange,Date,Hour,Offense Type,Premise,StreetName,Suffix,Type,Unnamed: 1
0,1,19G10,9400-9499,2013-04-25,5,Murder,13R,WOODFAIR,-,DR,
1,1,17E40,6100-6199,2013-04-14,0,Murder,18A,BELLFORT,W,ST,
2,1,20G10,9900-9999,2013-04-02,16,Murder,20A,RICHMOND,-,AVE,
3,1,2A20,1300-1399,2013-04-19,22,Murder,13R,29TH,E,ST,
4,1,1A10,500-599,2013-04-23,0,Murder,190,RUSK,-,-,


## create a subdataframe with the columns that we want

In [12]:
df13 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [13]:
df13.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130459 entries, 0 to 130458
Data columns (total 8 columns):
Date            130459 non-null datetime64[ns]
Beat            130459 non-null object
BlockRange      130459 non-null object
StreetName      130459 non-null object
Offense Type    130459 non-null object
Premise         130459 non-null object
# offenses      130459 non-null int64
Hour            130459 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 8.0+ MB


In [14]:
df13.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
130454,2013-09-30,1A30,2500-2599,PARK,Theft,13R,1,20
130455,2013-09-30,17E30,11200-11299,GESSNER,Theft,18G,1,20
130456,2013-09-30,13D20,8600-8699,MORLEY,Theft,20D,1,18
130457,2013-09-26,2A60,2600-2699,BEVIS,Theft,13R,1,7
130458,2013-09-30,17E10,5300-5399,DASHWOOD,Theft,18H,1,19


## Now we can inspect df

In [15]:
df13.Date.unique() ## timestamp

array(['2013-04-25T00:00:00.000000000', '2013-04-14T00:00:00.000000000',
       '2013-04-02T00:00:00.000000000', '2013-04-19T00:00:00.000000000',
       '2013-04-23T00:00:00.000000000', '2012-07-02T00:00:00.000000000',
       '2013-04-11T00:00:00.000000000', '2013-04-03T00:00:00.000000000',
       '2013-04-17T00:00:00.000000000', '2013-04-29T00:00:00.000000000',
       '2013-04-21T00:00:00.000000000', '2013-04-18T00:00:00.000000000',
       '2013-04-13T00:00:00.000000000', '2013-04-27T00:00:00.000000000',
       '2013-04-05T00:00:00.000000000', '2013-04-08T00:00:00.000000000',
       '2013-04-28T00:00:00.000000000', '2013-04-20T00:00:00.000000000',
       '2013-04-06T00:00:00.000000000', '2013-04-01T00:00:00.000000000',
       '2005-01-01T00:00:00.000000000', '2010-05-01T00:00:00.000000000',
       '2011-10-16T00:00:00.000000000', '2013-04-16T00:00:00.000000000',
       '2013-03-23T00:00:00.000000000', '2013-04-15T00:00:00.000000000',
       '2013-03-18T00:00:00.000000000', '2013-04-22

In [16]:
df13.Beat.unique()  ##

array(['19G10', '17E40', '20G10', '2A20', '1A10', '14D10', '7C20',
       '14D20', '6B60', '17E20', '4F10', '7C30', '17E30', '7C10', '8C60',
       '19G50', '6B50', '6B20', '20G30', '2A60', '2A50', '18F60', '18F30',
       '16E40', '15E20', '5F30', '7C50', '13D20', '2A10', '12D10',
       '12D30', '20G40', '14D40', '19G20', '19G30', '18F20', '6B40',
       '15E30', '8C30', '17E10', '10H70', '8C10', '12D50', '3B10', '1A30',
       '10H50', '8C50', '9C40', '16E10', '6B10', '10H40', '18F40', '5F40',
       '6B30', '3B50', '1A50', '20G50', '8C40', '10H30', '3B30', '13D10',
       '16E20', '3B40', '19G40', '10H60', '9C20', '1A20', '5F10', '5F20',
       '14D30', '4F20', '7C40', '18F50', '20G60', '20G70', '16E30',
       '11H30', '20G20', '15E10', '10H20', '11H10', '2A30', '8C20',
       '24C20', '13D40', '12D70', '14D50', '15E40', 'UNK', '20G80',
       '9C30', '11H20', '4F30', '24C50', '10H10', '21I30', '24C30',
       '12D40', '11H50', '1A40', '10H80', '24C10', '12D60', '9C10',
       '18

In [17]:
df13.Beat.value_counts(dropna=False)  # UNK : 98

12D10    2787
19G10    2777
1A20     2651
13D20    2607
6B60     2589
18F40    2285
3B10     2262
18F30    2258
2A50     2257
20G30    2247
18F20    2246
5F30     2222
15E40    2158
17E10    2153
1A30     2105
14D20    2104
17E40    2063
6B10     1999
1A50     1907
18F50    1880
9C40     1845
20G50    1812
1A10     1766
2A30     1738
5F40     1728
18F60    1713
7C20     1699
19G40    1686
6B30     1622
17E30    1581
         ... 
10H10     469
13D40     464
8C20      425
10H20     376
11H50     361
8C40      349
14D50     337
24C10     333
12D50     333
1A40      328
24C30     314
24C20     305
9C10      304
18F10     282
11H40     244
24C50     227
21I10     181
21I50     173
23J50     131
21I30     126
13D30     125
24C40     116
24C60     113
UNK        98
21I60      61
21I20      50
21I70      19
21I40      16
23J40       4
23J30       3
Name: Beat, Length: 120, dtype: int64

In [18]:
df13.BlockRange.unique()  #

array(['9400-9499', '6100-6199', '9900-9999', '1300-1399', '500-599',
       '3600-3699', '3700-3799', '9600-9699', '7000-7099', '3100-3199',
       '4000-4099', '8100-8199', '9200-9299', '5300-5399', '10500-10599',
       '10100-10199', '100-199', '1000-1099', '2700-2799', '1900-1999',
       '5000-5099', '6200-6299', '4200-4299', '6500-6599', '10000-10099',
       '16500-16599', '11800-11899', '12900-12999', '7400-7499',
       '15000-15099', '7600-7699', '8700-8799', '9500-9599', '4400-4499',
       '2200-2299', '6700-6799', '4100-4199', '8000-8099', '9100-9199',
       '7900-7999', '12300-12399', '2000-2099', '7200-7299', '4300-4399',
       '7100-7199', '8300-8399', '6600-6699', '300-399', '7800-7899',
       '2100-2199', '12100-12199', '10200-10299', '5800-5899',
       '1600-1699', '2300-2399', '8800-8899', '3200-3299', '800-899',
       '14000-14099', '200-299', '6300-6399', '11000-11099',
       '13400-13499', '600-699', '0-99', '12500-12599', '5500-5599',
       '6800-6899', 

In [19]:
df13.BlockRange.value_counts(dropna=False)  #

100-199        1879
900-999        1783
9500-9599      1607
700-799        1503
9400-9499      1485
7900-7999      1476
800-899        1457
2400-2499      1432
300-399        1432
500-599        1380
1300-1399      1375
1500-1599      1365
2500-2599      1364
1000-1099      1357
7500-7599      1339
2700-2799      1321
200-299        1313
5000-5099      1290
4400-4499      1284
600-699        1283
5800-5899      1281
1200-1299      1252
6000-6099      1243
2300-2399      1229
2600-2699      1228
5900-5999      1215
1400-1499      1193
6400-6499      1167
2200-2299      1164
2100-2199      1153
               ... 
20100-20199       2
29800-29899       2
22100-22199       2
23100-23199       2
21500-21599       2
27500-27599       2
19600-19699       2
24300-24399       2
20600-20699       1
21100-21199       1
23600-23699       1
19700-19799       1
20800-20899       1
23000-23099       1
25300-25399       1
21800-21899       1
22400-22499       1
20400-20499       1
21900-21999       1


In [20]:
df13.StreetName.unique()  #

array(['WOODFAIR', 'BELLFORT', 'RICHMOND', ..., 'VALWOOD', 'BLUE TAIL',
       'BAYOU ISLAND'], dtype=object)

In [21]:
df13.StreetName.value_counts(dropna=False)  # name cleanup too!

WESTHEIMER          3824
GULF                2035
GESSNER             1925
RICHMOND            1726
NORTH               1679
SOUTHWEST           1337
MAIN                1337
KATY                1337
NORTHWEST           1302
BELLAIRE            1300
BISSONNET           1172
WEST SAM HOUSTON    1163
BELLFORT            1075
FONDREN             1055
BEECHNUT            1017
FM 1960             1010
GREENS               981
EAST                 947
BROADWAY             903
LITTLE YORK          899
TIDWELL              881
POST OAK             786
SHEPHERD             770
KIRBY                714
AIRLINE              711
FANNIN               633
HILLCROFT            612
CULLEN               585
TELEPHONE            566
FUQUA                553
                    ... 
WOODLAKE SQUARE        1
HALCYON TIME           1
BEACONRIDGE            1
DREXEL HILL            1
SUNGATE                1
CORONADO               1
STACY KNOLL            1
CRIPPLE BROOK          1
PENHURST               1


In [22]:
df13['Offense Type'].unique()  # good

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [23]:
df13['Offense Type'].value_counts(dropna=False) # good, that its clean not that there were 73591 thefts!! 

Theft                 73591
Burglary              23733
Auto Theft            13147
Robbery                9891
Aggravated Assault     9274
Rape                    614
Murder                  209
Name: Offense Type, dtype: int64

In [24]:
df13.Premise.unique()  # lots of weird values

array(['13R', '18A', '20A', '190', '250', '20D', '20R', '13S', '09H',
       '05L', '18C', '05T', '11R', '18G', '140', '20C', '20P', '18R',
       '02S', '18T', '05Z', '120', '05B', '13H', '02B', '18P', '18O',
       '080', '070', '20G', '05Q', '13B', '23S', '03B', '22H', '18N',
       '210', '03S', '11L', '01B', '13A', '05E', '240', '22E', '100',
       '05D', '24J', '18U', '05C', '02C', '20N', '18W', '18M', '05M',
       '05F', '22M', '24E', '20L', '170', '24A', '09D', '24C', '18B',
       '01A', '05W', '---', '11S', '22P', '05G', '06', '05O', '18L',
       '060', '05N', '18S', '05S', '040', '20W', '20V', '20M', '20H',
       '22D', '09P', '19V', '21V', '05R', '05X', '24P', '24G', '11G',
       '24F', '05H', '05Y', '12V', '01P', '18H', '05V', '05P', '24S',
       '22U', '22V', '11P', '11F', '22C', '24T', '01T', '18D', '09R',
       '150', '11C', '25R', '24B', 'N', '05U', '09V', '04V', '01K', '05A',
       '24V', '160', '14V', '13T', '25V', '11V', '01R', '24M', '13C'],
      dtype=obj

In [25]:
df13.Premise.value_counts(dropna=False)  # srip extra spaces

20R    16499
20A    15958
18A    14756
13R     9713
20D     8486
18O     8178
080     4213
18R     3704
120     3653
18P     2788
250     2710
23S     2559
18T     2468
210     2251
240     1652
18G     1628
18M     1603
18N     1427
05Z     1287
03B     1237
140     1198
18C     1163
24C     1099
18U     1092
05C     1027
---     1019
20G      884
11R      814
070      755
09D      715
       ...  
09R       30
21V       28
22C       26
20H       26
20W       25
24T       21
24B       21
24A       19
02C       16
02S       15
05Y       14
11F       14
150       13
12V       13
13T       12
01T       12
24V       12
19V       11
04V       10
14V        9
01K        8
22V        7
25V        7
11V        7
25R        4
160        4
01R        3
13C        3
24M        2
09V        1
Name: Premise, Length: 127, dtype: int64

In [26]:
df13['# offenses'].unique()

array([ 1,  3,  2,  7,  4,  5,  6, 11,  8])

In [27]:
df13['# offenses'].value_counts(dropna=False)  # 2 locations with 10 offenses?

1     129259
2        994
3        184
4         13
6          3
7          2
5          2
11         1
8          1
Name: # offenses, dtype: int64

In [28]:
df13.Hour.unique()

array([ 5,  0, 16, 22,  3, 12,  9,  6, 20,  4, 21, 14, 18, 11, 13, 15, 17,
        7, 19, 23,  2, 10,  1,  8])

In [29]:
df13.Hour.value_counts(dropna=False)

18    8106
22    7601
19    7559
12    7468
20    7426
17    7369
21    7003
15    6641
0     6601
16    6546
14    6317
23    5800
13    5658
10    5431
11    5335
8     5152
9     4766
7     4105
1     3488
2     3128
6     2779
3     2435
5     1905
4     1840
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces :not needed

In [30]:
len(df13.Premise.unique())

127

In [31]:
df13['Premise'] = df13['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
len(df13.Premise.unique())

127

In [33]:
df13.Premise.value_counts(dropna=False)

20R    16499
20A    15958
18A    14756
13R     9713
20D     8486
18O     8178
080     4213
18R     3704
120     3653
18P     2788
250     2710
23S     2559
18T     2468
210     2251
240     1652
18G     1628
18M     1603
18N     1427
05Z     1287
03B     1237
140     1198
18C     1163
24C     1099
18U     1092
05C     1027
---     1019
20G      884
11R      814
070      755
09D      715
       ...  
09R       30
21V       28
22C       26
20H       26
20W       25
24T       21
24B       21
24A       19
02C       16
02S       15
05Y       14
11F       14
150       13
12V       13
13T       12
01T       12
24V       12
19V       11
04V       10
14V        9
01K        8
22V        7
25V        7
11V        7
25R        4
160        4
01R        3
13C        3
24M        2
09V        1
Name: Premise, Length: 127, dtype: int64

# Cleanup
## Offense Type Column

preatty clean!


In [34]:
df13['Offense Type'].value_counts(dropna=False)

Theft                 73591
Burglary              23733
Auto Theft            13147
Robbery                9891
Aggravated Assault     9274
Rape                    614
Murder                  209
Name: Offense Type, dtype: int64

In [35]:
df13['Offense Type'].unique()

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [36]:
len(df13['Offense Type'].unique())

7

In [37]:
df13['Offense Type'] = df13['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df13['Offense Type'].value_counts(dropna=False)

Theft                 73591
Burglary              23733
Auto Theft            13147
Robbery                9891
Aggravated Assault     9274
Rape                    614
Murder                  209
Name: Offense Type, dtype: int64

In [39]:
len(df13['Offense Type'].unique())

7

# Cleanup
## StreetName Column

preatty clean

In [40]:
df13.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3824
GULF          2035
GESSNER       1925
RICHMOND      1726
NORTH         1679
Name: StreetName, dtype: int64

In [41]:
len(df13.StreetName.unique())

7540

In [42]:
df13['StreetName'] = df13['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
len(df13.StreetName.unique())

7540

In [44]:
#df13[df13.StreetName.isnull()].head()

In [45]:
df13.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3824
GULF          2035
GESSNER       1925
RICHMOND      1726
NORTH         1679
Name: StreetName, dtype: int64

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [46]:
df13.BlockRange.value_counts(dropna=False).head()  # find UNK

100-199      1879
900-999      1783
9500-9599    1607
700-799      1503
9400-9499    1485
Name: BlockRange, dtype: int64

In [47]:
unk = df13.BlockRange == 'UNK'  # boolean mask

In [48]:
df13[unk]  # 0 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [49]:
df13[df13.BlockRange.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


# Cleanup
## Beat Column

preatty clean

In [50]:
df13.Beat.unique()

array(['19G10', '17E40', '20G10', '2A20', '1A10', '14D10', '7C20',
       '14D20', '6B60', '17E20', '4F10', '7C30', '17E30', '7C10', '8C60',
       '19G50', '6B50', '6B20', '20G30', '2A60', '2A50', '18F60', '18F30',
       '16E40', '15E20', '5F30', '7C50', '13D20', '2A10', '12D10',
       '12D30', '20G40', '14D40', '19G20', '19G30', '18F20', '6B40',
       '15E30', '8C30', '17E10', '10H70', '8C10', '12D50', '3B10', '1A30',
       '10H50', '8C50', '9C40', '16E10', '6B10', '10H40', '18F40', '5F40',
       '6B30', '3B50', '1A50', '20G50', '8C40', '10H30', '3B30', '13D10',
       '16E20', '3B40', '19G40', '10H60', '9C20', '1A20', '5F10', '5F20',
       '14D30', '4F20', '7C40', '18F50', '20G60', '20G70', '16E30',
       '11H30', '20G20', '15E10', '10H20', '11H10', '2A30', '8C20',
       '24C20', '13D40', '12D70', '14D50', '15E40', 'UNK', '20G80',
       '9C30', '11H20', '4F30', '24C50', '10H10', '21I30', '24C30',
       '12D40', '11H50', '1A40', '10H80', '24C10', '12D60', '9C10',
       '18

In [51]:
len(df13.Beat.unique())

120

In [52]:
df13['Beat'] = df13['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
len(df13.Beat.unique())

120

In [54]:
df13[df13.Beat.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [55]:
df13.Beat.value_counts(dropna=False).head()

12D10    2787
19G10    2777
1A20     2651
13D20    2607
6B60     2589
Name: Beat, dtype: int64

In [56]:
df13.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130459 entries, 0 to 130458
Data columns (total 8 columns):
Date            130459 non-null datetime64[ns]
Beat            130459 non-null object
BlockRange      130459 non-null object
StreetName      130459 non-null object
Offense Type    130459 non-null object
Premise         130459 non-null object
# offenses      130459 non-null int64
Hour            130459 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 8.0+ MB


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [57]:
df13.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2013-04-25,19G10,9400-9499,WOODFAIR,Murder,13R,1,5
1,2013-04-14,17E40,6100-6199,BELLFORT,Murder,18A,1,0
2,2013-04-02,20G10,9900-9999,RICHMOND,Murder,20A,1,16
3,2013-04-19,2A20,1300-1399,29TH,Murder,13R,1,22
4,2013-04-23,1A10,500-599,RUSK,Murder,190,1,0


In [58]:
df13['Date'] = pd.to_datetime(df13['Date'])

df13 = df13.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [59]:
df13.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1963-02-02,10H40,3800-3899,MAIN,Theft,09R,1,13
1972-06-01,12D50,16400-16499,BROOKVILLA,Rape,20R,1,0
1972-07-15,14D30,5300-5399,NORTHRIDGE,Burglary,---,1,20
1977-10-14,10H50,3200-3299,TRUXILLO,Theft,18A,1,15
1979-07-01,11H10,100-199,EASTGATE,Rape,20R,1,0


In [60]:
df13.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 130459 entries, 1963-02-02 to 2033-04-21
Data columns (total 7 columns):
Beat            130459 non-null object
BlockRange      130459 non-null object
StreetName      130459 non-null object
Offense Type    130459 non-null object
Premise         130459 non-null object
# offenses      130459 non-null int64
Hour            130459 non-null int64
dtypes: int64(2), object(5)
memory usage: 8.0+ MB


## Odd dates
`DatetimeIndex: 130459 entries, 1963-02-02 to 2033-04-21
`
- some values are not from this year, lets look

In [61]:
df2013 = df13.loc['2013-01-01':'2013-12-31']  # rows with date from 01,01,13 - 12,31,13

In [62]:
df2023_wrong_date = df13[:"2012"]  # rows with year 0  upto 2012
df2023_wrong_date.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1434 entries, 1963-02-02 to 2012-12-31
Data columns (total 7 columns):
Beat            1434 non-null object
BlockRange      1434 non-null object
StreetName      1434 non-null object
Offense Type    1434 non-null object
Premise         1434 non-null object
# offenses      1434 non-null int64
Hour            1434 non-null int64
dtypes: int64(2), object(5)
memory usage: 89.6+ KB


In [63]:
df2013.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 129021 entries, 2013-01-01 to 2013-12-31
Data columns (total 7 columns):
Beat            129021 non-null object
BlockRange      129021 non-null object
StreetName      129021 non-null object
Offense Type    129021 non-null object
Premise         129021 non-null object
# offenses      129021 non-null int64
Hour            129021 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.9+ MB


# NAN values

- Beat: 0
- BlockRange: 0
- StreetName: 0
- Offense Type: 0
- Premise: 0
- Hour: 0

In [64]:
beat_nan = df2013.Beat.isnull()
block_nan = df2013.BlockRange.isnull()
str_nan = df2013.StreetName.isnull()
off_nan = df2013['Offense Type'].isnull()
premise_nan = df2013.Premise.isnull()  #
hour_nan = df2013.Hour.isnull()  #

In [65]:
df2013

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,16E40,6900-6999,TRIGATE,Burglary,20N,1,9
2013-01-01,10H40,4200-4299,SAN JACINTO,Theft,120,1,12
2013-01-01,20G80,800-899,WEST OAKS MALL,Theft,080,1,19
2013-01-01,7C20,5700-5799,LOCKWOOD,Burglary,070,1,0
2013-01-01,1A10,1700-1799,CHENEVERT,Theft,13R,1,14
2013-01-01,6B30,6400-6499,TALL WILLOW,Burglary,20R,1,10
2013-01-01,12D70,500-599,BAYBROOK MALL,Theft,250,1,16
2013-01-01,5F20,7900-7999,AMELIA,Theft,20A,1,18
2013-01-01,11H40,5400-5499,ALLENDALE,Burglary,210,1,6
2013-01-01,1A10,1200-1299,COMMERCE,Theft,150,1,11


## Save clean data to  to csv

In [67]:
df2013.to_csv('crime_data_clean/crime13_clean.csv')

In [69]:
ls crime_data_clean/

[0m[01;32mcrime13_clean.csv[0m*  [01;32mcrime15_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*
[01;32mcrime14_clean.csv[0m*  [01;32mcrime16_clean.csv[0m*
