In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data_raw/2011

[0m[01;32mapr11.xls[0m*  [01;32mdec11.xls[0m*  [01;32mjan11.xls[0m*  [01;32mjun11.xls[0m*  [01;32mmay11.xls[0m*  [01;32moct11.xls[0m*
[01;32maug11.xls[0m*  [01;32mfeb11.xls[0m*  [01;32mjul11.xls[0m*  [01;32mmar11.xls[0m*  [01;32mnov11.xls[0m*  [01;32msep11.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data_raw/2011'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127659 entries, 0 to 127658
Data columns (total 10 columns):
Date             127659 non-null datetime64[ns]
Hour             127659 non-null int64
Offense Type     127659 non-null object
Beat             127659 non-null object
Premise          127659 non-null object
Block Range      127659 non-null object
Street Name      127659 non-null object
Type             127659 non-null object
Suffix           127659 non-null object
# Of Offenses    127659 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.7+ MB


In [5]:
df.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,Block Range,Street Name,Type,Suffix,# Of Offenses
0,2011-04-13,1,Murder,17E40,20A,6400-6499,BANKSIDE,DR,-,1
1,2011-04-06,14,Murder,20G30,20A,2900-2999,HAYES,RD,-,1
2,2011-01-10,19,Murder,13D20,20R,8400-8499,GLENSCOT,-,-,1
3,2011-04-03,23,Murder,19G50,18N,10700-10799,BELLFORT,ST,W,1
4,2011-04-17,23,Murder,11H20,20A,2500-2599,BROADWAY,ST,-,1


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'Block Range',
       'Street Name', 'Type', 'Suffix', '# Of Offenses'],
      dtype='object')

In [8]:
df1.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,Block Range,Street Name,Type,Suffix,# Of Offenses
0,2011-04-13,1,Murder,17E40,20A,6400-6499,BANKSIDE,DR,-,1
1,2011-04-06,14,Murder,20G30,20A,2900-2999,HAYES,RD,-,1
2,2011-01-10,19,Murder,13D20,20R,8400-8499,GLENSCOT,-,-,1
3,2011-04-03,23,Murder,19G50,18N,10700-10799,BELLFORT,ST,W,1
4,2011-04-17,23,Murder,11H20,20A,2500-2599,BROADWAY,ST,-,1


## Rename columns

In [9]:
df1.rename(columns={
    '# Of Offenses': '# offenses',
    'Block Range': 'BlockRange',
    'Street Name': 'StreetName',}, inplace=True)

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127659 entries, 0 to 127658
Data columns (total 10 columns):
Date            127659 non-null datetime64[ns]
Hour            127659 non-null int64
Offense Type    127659 non-null object
Beat            127659 non-null object
Premise         127659 non-null object
BlockRange      127659 non-null object
StreetName      127659 non-null object
Type            127659 non-null object
Suffix          127659 non-null object
# offenses      127659 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 9.7+ MB


In [11]:
df1.head(5)

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,BlockRange,StreetName,Type,Suffix,# offenses
0,2011-04-13,1,Murder,17E40,20A,6400-6499,BANKSIDE,DR,-,1
1,2011-04-06,14,Murder,20G30,20A,2900-2999,HAYES,RD,-,1
2,2011-01-10,19,Murder,13D20,20R,8400-8499,GLENSCOT,-,-,1
3,2011-04-03,23,Murder,19G50,18N,10700-10799,BELLFORT,ST,W,1
4,2011-04-17,23,Murder,11H20,20A,2500-2599,BROADWAY,ST,-,1


## create a subdataframe with the columns that we want

In [12]:
df11 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [13]:
df11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127659 entries, 0 to 127658
Data columns (total 8 columns):
Date            127659 non-null datetime64[ns]
Beat            127659 non-null object
BlockRange      127659 non-null object
StreetName      127659 non-null object
Offense Type    127659 non-null object
Premise         127659 non-null object
# offenses      127659 non-null int64
Hour            127659 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.8+ MB


In [14]:
df11.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
127654,2011-09-30,20G50,11300-11399,KATY,Theft,18R,1,17
127655,2011-09-30,12D70,18100-18199,GULF,Theft,18T,1,22
127656,2011-09-30,2A60,1500-1599,NORTH LP W,Theft,070,1,23
127657,2011-09-30,2A20,900-999,CORDELL,Theft,070,1,23
127658,2011-09-30,18F20,5000-5099,WESTHEIMER,Theft,18A,1,21


## Now we can inspect df

In [15]:
df11.Date.unique() ## timestamp

array(['2011-04-13T00:00:00.000000000', '2011-04-06T00:00:00.000000000',
       '2011-01-10T00:00:00.000000000', '2011-04-03T00:00:00.000000000',
       '2011-04-17T00:00:00.000000000', '2011-04-12T00:00:00.000000000',
       '2011-04-02T00:00:00.000000000', '2011-04-21T00:00:00.000000000',
       '2011-03-20T00:00:00.000000000', '2011-04-05T00:00:00.000000000',
       '2011-04-23T00:00:00.000000000', '2011-02-11T00:00:00.000000000',
       '2011-04-22T00:00:00.000000000', '2011-04-08T00:00:00.000000000',
       '2011-03-28T00:00:00.000000000', '2011-04-10T00:00:00.000000000',
       '2007-04-16T00:00:00.000000000', '2009-10-30T00:00:00.000000000',
       '2010-12-01T00:00:00.000000000', '2011-04-07T00:00:00.000000000',
       '2011-04-26T00:00:00.000000000', '2011-04-01T00:00:00.000000000',
       '2011-04-20T00:00:00.000000000', '2011-04-28T00:00:00.000000000',
       '2007-01-01T00:00:00.000000000', '2011-04-19T00:00:00.000000000',
       '2010-06-01T00:00:00.000000000', '2011-04-24

In [16]:
df11.Beat.unique()  ##

array(['17E40', '20G30', '13D20', '19G50', '11H20', '7C30', '16E30',
       '17E10', '24C10', '18F30', '2A10', '19G30', '5F20', '2A60', 'UNK',
       '14D40', '12D40', '18F60', '18F20', '7C10', '7C20', '18F50',
       '6B30', '16E10', '15E20', '19G40', '3B40', '5F10', '12D10',
       '14D20', '2A20', '9C40', '10H30', '5F40', '13D40', '6B50', '17E30',
       '17E20', '19G10', '20G70', '10H50', '10H40', '11H50', '1A20',
       '24C20', '6B20', '1A10', '8C60', '11H40', '10H80', '16E40', '4F30',
       '14D10', '13D10', '18F10', '10H60', '14D30', '9C30', '20G50',
       '20G40', '3B10', '24C50', '8C40', '19G20', '11H30', '2A50', '4F10',
       '7C40', '1A50', '5F30', '12D60', '12D50', '20G10', '15E40',
       '15E10', '20G20', '18F40', '6B10', '11H10', '2A30', '4F20', '6B40',
       '8C50', '16E20', '9C20', '6B60', '1A30', '12D30', '12D70', '15E30',
       '20G80', '7C50', '8C30', '10H70', '12D20', '8C10', '14D50',
       '20G60', '10H10', '3B50', '24C30', '8C20', '10H20', '3B30', '9C10',


In [17]:
df11.Beat.value_counts(dropna=False)  # UNK : 86

19G10    3029
12D10    2761
6B60     2717
3B10     2578
13D20    2390
14D20    2318
18F30    2248
5F30     2208
17E10    2201
18F20    2176
1A20     2175
18F40    2172
20G30    2056
6B10     1976
20G50    1953
18F50    1920
7C20     1891
1A30     1882
15E40    1867
9C40     1859
5F40     1816
19G40    1779
1A50     1767
2A50     1766
3B50     1747
1A10     1707
17E40    1701
6B30     1630
20G10    1606
18F60    1584
         ... 
8C20      471
12D40     431
10H20     426
1A40      397
11H50     370
14D50     370
12D50     342
24C10     310
8C40      303
24C20     298
24C50     298
24C30     286
11H40     281
9C10      265
18F10     256
21I50     148
13D30     119
24C40     117
23J50     107
21I30     106
21I10      86
UNK        86
21I20      48
21I60      47
21I70      23
21I40      11
23J40       7
24C60       6
23J30       4
23J20       1
Name: Beat, Length: 121, dtype: int64

In [18]:
df11.BlockRange.unique()  #

array(['6400-6499', '2900-2999', '8400-8499', '10700-10799', '2500-2599',
       '6800-6899', '8900-8999', '5500-5599', '5900-5999', '1000-1099',
       '4500-4599', '12300-12399', '9400-9499', '7900-7999', '8500-8599',
       '12500-12599', '1500-1599', '300-399', '3800-3899', '12200-12299',
       '7500-7599', '5400-5499', '10900-10999', '4300-4399', '6600-6699',
       '6100-6199', '9200-9299', '7700-7799', '13100-13199', '9500-9599',
       '10400-10499', '700-799', '1700-1799', '100-199', '7200-7299',
       '3100-3199', '1200-1299', '400-499', '2400-2499', '7400-7499',
       '7600-7699', '8300-8399', '200-299', '10100-10199', '9300-9399',
       '11800-11899', '8600-8699', '12700-12799', '2800-2899',
       '8100-8199', '11500-11599', '1300-1399', '12000-12099',
       '10000-10099', '5600-5699', '5200-5299', '4700-4799', '5700-5799',
       '900-999', '8000-8099', '7300-7399', '8700-8799', '7000-7099',
       '6700-6799', '6900-6999', '12600-12699', '500-599', '6500-6599',
    

In [19]:
df11.BlockRange.value_counts(dropna=False)  #

900-999        1731
100-199        1638
800-899        1631
9400-9499      1603
700-799        1601
7900-7999      1597
4400-4499      1550
200-299        1467
9500-9599      1437
7500-7599      1410
1000-1099      1372
300-399        1360
2700-2799      1346
500-599        1334
2400-2499      1333
1100-1199      1320
1300-1399      1262
1500-1599      1257
600-699        1251
1200-1299      1248
5800-5899      1219
1400-1499      1182
2500-2599      1181
2300-2399      1178
6000-6099      1167
2100-2199      1164
5900-5999      1151
5100-5199      1140
2600-2699      1138
400-499        1136
               ... 
22300-22399       2
18900-18999       2
22800-22899       2
21400-21499       2
29800-29899       2
55000-55099       1
24800-24899       1
23800-23899       1
66200-66299       1
77300-77399       1
20700-20799       1
21000-21099       1
20100-20199       1
26000-26099       1
31300-31399       1
27000-27099       1
61400-61499       1
20800-20899       1
28400-28499       1


In [20]:
df11.StreetName.unique()  #

array(['BANKSIDE', 'HAYES', 'GLENSCOT', ..., 'RIVERLILLY', 'AVE OF OAKS',
       'LAKEMERE'], dtype=object)

In [21]:
df11.StreetName.value_counts(dropna=False)  # 

WESTHEIMER             3587
GESSNER                1846
GULF                   1801
NORTH                  1763
RICHMOND               1713
WEST SAM HOUSTON       1336
SOUTHWEST              1273
BISSONNET              1270
BELLAIRE               1226
MAIN                   1202
NORTHWEST              1194
FM 1960                1103
KATY                   1080
GREENS                 1034
BEECHNUT                969
BELLFORT                949
LITTLE YORK             868
EAST                    817
POST OAK                807
FONDREN                 804
TIDWELL                 789
BROADWAY                771
AIRLINE                 702
KIRBY                   658
WILCREST                653
SHEPHERD                621
HILLCROFT               610
FUQUA                   549
IMPERIAL VALLEY         549
TELEPHONE               540
                       ... 
SALISBURY                 1
STONEGATE                 1
HOLLYGROVE                1
LIGHTCLIFFE               1
DIAMOND GROVE       

In [22]:
df11['Offense Type'].unique()  # good

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [23]:
df11['Offense Type'].value_counts(dropna=False) # good, that its clean not that there were 73591 thefts!! 

Theft                 68596
Burglary              27459
Auto Theft            11893
Aggravated Assault    10695
Robbery                8054
Rape                    770
Murder                  192
Name: Offense Type, dtype: int64

In [24]:
df11.Premise.unique()  # lots of weird values

array(['20A', '20R', '18N', '20D', '13R', '11R', '250', '18O', '20P',
       '22H', '20W', '09H', '22M', '140', '13S', '18A', '240', '03B',
       '100', '20V', '20C', '09R', '01P', '120', '18C', '13H', '18B',
       '070', '18G', '05Z', '080', '20G', '23S', '05M', '18T', '02B',
       '18L', '05O', '05X', '18R', '13B', '24E', '18U', '05C', '210',
       '09D', '05B', '20M', '18P', '18W', '24C', '18S', '02S', '160',
       '01K', '05E', '040', '24P', '05D', '05P', '18M', '05Q', '170',
       '22P', '05L', '18H', '03S', '22E', '05R', '11G', '25R', '05H',
       '13A', '150', '05S', '01B', '22D', '12V', '060', '24V', '05V',
       '190', '05G', '25V', '05F', '05N', '05Y', '09P', '19V', '24T',
       '11L', '05W', '04V', '11F', '22C', '11S', '05T', '24J', '24F',
       '20L', '21V', '24A', '---', '20N', '24S', '22U', '11P', '22V',
       '05U', '01A', '01R', '05A', '18D', '11C', '13T', '24G', '20H',
       '14V', '11V', '23C', '02C', '01T', '24B', '09V', '24M', '06', 'N'],
      dtype=obj

In [25]:
df11.Premise.value_counts(dropna=False)  # srip extra spaces

20A    17514
20R    17271
18A    13946
13R     8758
20D     7690
080     6573
18O     6104
250     4686
18R     3785
18P     3557
120     2559
18T     2506
070     1955
210     1685
18M     1481
18G     1400
18N     1358
03B     1342
23S     1158
18C     1049
05C     1044
05Z     1026
140      990
18U      976
20V      852
20C      825
240      823
05O      764
20G      753
20P      734
       ...  
11F       27
18L       27
09R       27
22C       26
05Y       25
24G       22
24V       22
21V       18
24B       16
20H       16
24T       15
150       15
02S       15
04V       14
01K       11
160       10
01T        9
14V        9
12V        9
22V        8
11V        7
13T        6
02C        5
25R        4
09V        4
06         2
24M        1
23C        1
01R        1
N          1
Name: Premise, Length: 127, dtype: int64

In [26]:
df11['# offenses'].unique()

array([ 1,  2,  3,  5,  4,  6, 13])

In [27]:
df11['# offenses'].value_counts(dropna=False)  # 2 locations with 10 offenses?

1     126367
2       1055
3        215
4         14
5          5
6          2
13         1
Name: # offenses, dtype: int64

In [28]:
df11.Hour.unique()

array([ 1, 14, 19, 23, 22,  4, 15,  2, 10,  0, 11, 21, 17,  8, 20, 12,  7,
       18,  9,  3,  5, 16,  6, 13])

In [29]:
df11.Hour.value_counts(dropna=False)

18    7895
0     7855
12    7261
19    7167
20    7103
22    7097
17    7090
21    6717
15    6242
16    6099
23    6038
14    5660
13    5398
10    5111
11    5058
8     5057
9     4673
7     4191
1     3561
2     3234
6     2855
3     2493
5     1946
4     1858
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces :not needed

In [30]:
len(df11.Premise.unique())

127

In [31]:
df11['Premise'] = df11['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
len(df11.Premise.unique())

127

In [33]:
df11.Premise.value_counts(dropna=False)

20A    17514
20R    17271
18A    13946
13R     8758
20D     7690
080     6573
18O     6104
250     4686
18R     3785
18P     3557
120     2559
18T     2506
070     1955
210     1685
18M     1481
18G     1400
18N     1358
03B     1342
23S     1158
18C     1049
05C     1044
05Z     1026
140      990
18U      976
20V      852
20C      825
240      823
05O      764
20G      753
20P      734
       ...  
11F       27
18L       27
09R       27
22C       26
05Y       25
24G       22
24V       22
21V       18
24B       16
20H       16
24T       15
150       15
02S       15
04V       14
01K       11
160       10
01T        9
14V        9
12V        9
22V        8
11V        7
13T        6
02C        5
25R        4
09V        4
06         2
24M        1
23C        1
01R        1
N          1
Name: Premise, Length: 127, dtype: int64

# Cleanup
## Offense Type Column

preatty clean!


In [34]:
df11['Offense Type'].value_counts(dropna=False)

Theft                 68596
Burglary              27459
Auto Theft            11893
Aggravated Assault    10695
Robbery                8054
Rape                    770
Murder                  192
Name: Offense Type, dtype: int64

In [35]:
df11['Offense Type'].unique()

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft'], dtype=object)

In [36]:
len(df11['Offense Type'].unique())

7

In [37]:
df11['Offense Type'] = df11['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df11['Offense Type'].value_counts(dropna=False)

Theft                 68596
Burglary              27459
Auto Theft            11893
Aggravated Assault    10695
Robbery                8054
Rape                    770
Murder                  192
Name: Offense Type, dtype: int64

In [39]:
len(df11['Offense Type'].unique())

7

# Cleanup
## StreetName Column

preatty clean

In [40]:
df11.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3587
GESSNER       1846
GULF          1801
NORTH         1763
RICHMOND      1713
Name: StreetName, dtype: int64

In [41]:
len(df11.StreetName.unique())

7537

In [42]:
df11['StreetName'] = df11['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
len(df11.StreetName.unique())

7537

In [44]:
df11[df11.StreetName.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [45]:
df11.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3587
GESSNER       1846
GULF          1801
NORTH         1763
RICHMOND      1713
Name: StreetName, dtype: int64

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [46]:
df11.BlockRange.value_counts(dropna=False).head()  # find UNK

900-999      1731
100-199      1638
800-899      1631
9400-9499    1603
700-799      1601
Name: BlockRange, dtype: int64

In [47]:
unk = df11.BlockRange == 'UNK'  # boolean mask

In [48]:
df11[unk]  # 0 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [49]:
df11[df11.BlockRange.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


# Cleanup
## Beat Column

preatty clean

In [50]:
df11.Beat.unique()

array(['17E40', '20G30', '13D20', '19G50', '11H20', '7C30', '16E30',
       '17E10', '24C10', '18F30', '2A10', '19G30', '5F20', '2A60', 'UNK',
       '14D40', '12D40', '18F60', '18F20', '7C10', '7C20', '18F50',
       '6B30', '16E10', '15E20', '19G40', '3B40', '5F10', '12D10',
       '14D20', '2A20', '9C40', '10H30', '5F40', '13D40', '6B50', '17E30',
       '17E20', '19G10', '20G70', '10H50', '10H40', '11H50', '1A20',
       '24C20', '6B20', '1A10', '8C60', '11H40', '10H80', '16E40', '4F30',
       '14D10', '13D10', '18F10', '10H60', '14D30', '9C30', '20G50',
       '20G40', '3B10', '24C50', '8C40', '19G20', '11H30', '2A50', '4F10',
       '7C40', '1A50', '5F30', '12D60', '12D50', '20G10', '15E40',
       '15E10', '20G20', '18F40', '6B10', '11H10', '2A30', '4F20', '6B40',
       '8C50', '16E20', '9C20', '6B60', '1A30', '12D30', '12D70', '15E30',
       '20G80', '7C50', '8C30', '10H70', '12D20', '8C10', '14D50',
       '20G60', '10H10', '3B50', '24C30', '8C20', '10H20', '3B30', '9C10',


In [51]:
len(df11.Beat.unique())

121

In [52]:
df11['Beat'] = df11['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
len(df11.Beat.unique())

121

In [54]:
df11[df11.Beat.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [55]:
df11.Beat.value_counts(dropna=False).head()

19G10    3029
12D10    2761
6B60     2717
3B10     2578
13D20    2390
Name: Beat, dtype: int64

In [56]:
df11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127659 entries, 0 to 127658
Data columns (total 8 columns):
Date            127659 non-null datetime64[ns]
Beat            127659 non-null object
BlockRange      127659 non-null object
StreetName      127659 non-null object
Offense Type    127659 non-null object
Premise         127659 non-null object
# offenses      127659 non-null int64
Hour            127659 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 7.8+ MB


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [57]:
df11.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2011-04-13,17E40,6400-6499,BANKSIDE,Murder,20A,1,1
1,2011-04-06,20G30,2900-2999,HAYES,Murder,20A,1,14
2,2011-01-10,13D20,8400-8499,GLENSCOT,Murder,20R,1,19
3,2011-04-03,19G50,10700-10799,BELLFORT,Murder,18N,1,23
4,2011-04-17,11H20,2500-2599,BROADWAY,Murder,20A,1,23


In [58]:
df11['Date'] = pd.to_datetime(df11['Date'])

df11 = df11.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [59]:
df11.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964-10-09,1A40,3700-3799,WESTHEIMER,Aggravated Assault,18O,1,5
1965-03-20,6B60,12400-12499,GREENSPOINT,Theft,09H,1,23
1966-10-27,6B50,10300-10399,NORTH,Theft,210,1,0
1970-11-27,19G50,10200-10299,LANDS END,Theft,250,1,13
1971-08-16,14D20,9000-9099,BRANDON,Theft,20D,1,22


In [60]:
df11.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 127659 entries, 1964-10-09 to 2011-12-31
Data columns (total 7 columns):
Beat            127659 non-null object
BlockRange      127659 non-null object
StreetName      127659 non-null object
Offense Type    127659 non-null object
Premise         127659 non-null object
# offenses      127659 non-null int64
Hour            127659 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.8+ MB


## Odd dates
`DatetimeIndex: 127659 entries, 1964-10-09 to 2011-12-31
`
- some values are not from this year, lets look

In [61]:
df2011 = df11.loc['2011-01-01':'2011-12-31']  # rows with date from 01,01,11 - 12,31,11

In [62]:
df2011_wrong_date = df11[:"2010"]  # rows with year 0  upto 2011
df2011_wrong_date.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1882 entries, 1964-10-09 to 2010-12-31
Data columns (total 7 columns):
Beat            1882 non-null object
BlockRange      1882 non-null object
StreetName      1882 non-null object
Offense Type    1882 non-null object
Premise         1882 non-null object
# offenses      1882 non-null int64
Hour            1882 non-null int64
dtypes: int64(2), object(5)
memory usage: 117.6+ KB


In [63]:
df2011.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 125777 entries, 2011-01-01 to 2011-12-31
Data columns (total 7 columns):
Beat            125777 non-null object
BlockRange      125777 non-null object
StreetName      125777 non-null object
Offense Type    125777 non-null object
Premise         125777 non-null object
# offenses      125777 non-null int64
Hour            125777 non-null int64
dtypes: int64(2), object(5)
memory usage: 7.7+ MB


# NAN values

- Beat: 0
- BlockRange: 0
- StreetName: 0
- Offense Type: 0
- Premise: 0
- Hour: 0

In [64]:
beat_nan = df2011.Beat.isnull()
block_nan = df2011.BlockRange.isnull()
str_nan = df2011.StreetName.isnull()
off_nan = df2011['Offense Type'].isnull()
premise_nan = df2011.Premise.isnull()  #
hour_nan = df2011.Hour.isnull()  #

In [65]:
df2011[beat_nan]
df2011[block_nan]
df2011[str_nan]
df2011[off_nan]
df2011[hour_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


## Save clean data to  to csv

In [66]:
df2011.to_csv('crime_data_clean/crime11_clean.csv')

In [67]:
ls crime_data_clean/

[0m[01;32mcrime11_clean.csv[0m*  [01;32mcrime13_clean.csv[0m*  [01;32mcrime15_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*
[01;32mcrime12_clean.csv[0m*  [01;32mcrime14_clean.csv[0m*  [01;32mcrime16_clean.csv[0m*
