In [1]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt

## Source
- [Houston Police Department Crime Statistics](http://www.houstontx.gov/police/cs/crime-stats-archives.htm)
	- years: 2008 - 2017
	- format: Access or Excel

In [2]:
ls crime_data/2010

[0m[01;32mapr10.xls[0m*  [01;32mdec10.xls[0m*  [01;32mjan10.xls[0m*  [01;32mjun10.xls[0m*  [01;32mmay10.xls[0m*  [01;32moct10.xls[0m*
[01;32maug10.xls[0m*  [01;32mfeb10.xls[0m*  [01;32mjul10.xls[0m*  [01;32mmar10.xls[0m*  [01;32mnov10.xls[0m*  [01;32msep10.xls[0m*


## combine all files into one dataframe

In [3]:
path = 'crime_data/2010'
all_files = glob.glob(os.path.join(path, "*.xls")) 

df_from_each_file = (pd.read_excel(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)



In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136263 entries, 0 to 136262
Data columns (total 12 columns):
# Of Offenses    136262 non-null float64
Beat             136262 non-null object
Block Range      136262 non-null object
Date             136262 non-null datetime64[ns]
Field11          0 non-null float64
Hour             136262 non-null float64
Offense Type     136262 non-null object
Premise          136262 non-null object
Street Name      136262 non-null object
Suffix           136262 non-null object
Type             136262 non-null object
Unnamed: 1       0 non-null float64
dtypes: datetime64[ns](1), float64(4), object(7)
memory usage: 12.5+ MB


In [5]:
df.head(5)

Unnamed: 0,# Of Offenses,Beat,Block Range,Date,Field11,Hour,Offense Type,Premise,Street Name,Suffix,Type,Unnamed: 1
0,1.0,13D20,6600-6699,2010-04-17,,0.0,Murder,05W,HEFFERNAN,-,-,
1,1.0,11H40,10100-10199,2010-04-08,,20.0,Murder,20R,LUCORE,-,-,
2,2.0,19G20,11400-11499,2010-04-01,,22.0,Murder,13R,CARVEL,-,LN,
3,1.0,10H60,3700-3799,2010-04-17,,1.0,Murder,13R,WHEELER,-,-,
4,1.0,14D30,5100-5199,2010-04-08,,23.0,Murder,20R,MYRTLEWOOD,-,DR,


## Lets create a copy

In [6]:
df1 = df.copy()

In [7]:
df1.columns

Index(['# Of Offenses', 'Beat', 'Block Range', 'Date', 'Field11', 'Hour',
       'Offense Type', 'Premise', 'Street Name', 'Suffix', 'Type',
       'Unnamed: 1'],
      dtype='object')

In [None]:
df1.head(5)

In [11]:
df1['Unnamed: 1'].value_counts(dropna=False)

NaN    136263
Name: Unnamed: 1, dtype: int64

In [12]:
df1['Field11'].value_counts(dropna=False)

NaN    136263
Name: Field11, dtype: int64

## Rename columns

In [13]:
df1.rename(columns={
    '# Of Offenses': '# offenses',
    'Block Range': 'BlockRange',
    'Street Name': 'StreetName',}, inplace=True)

In [14]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136263 entries, 0 to 136262
Data columns (total 12 columns):
# offenses      136262 non-null float64
Beat            136262 non-null object
BlockRange      136262 non-null object
Date            136262 non-null datetime64[ns]
Field11         0 non-null float64
Hour            136262 non-null float64
Offense Type    136262 non-null object
Premise         136262 non-null object
StreetName      136262 non-null object
Suffix          136262 non-null object
Type            136262 non-null object
Unnamed: 1      0 non-null float64
dtypes: datetime64[ns](1), float64(4), object(7)
memory usage: 12.5+ MB


In [15]:
df1.head(5)

Unnamed: 0,# offenses,Beat,BlockRange,Date,Field11,Hour,Offense Type,Premise,StreetName,Suffix,Type,Unnamed: 1
0,1.0,13D20,6600-6699,2010-04-17,,0.0,Murder,05W,HEFFERNAN,-,-,
1,1.0,11H40,10100-10199,2010-04-08,,20.0,Murder,20R,LUCORE,-,-,
2,2.0,19G20,11400-11499,2010-04-01,,22.0,Murder,13R,CARVEL,-,LN,
3,1.0,10H60,3700-3799,2010-04-17,,1.0,Murder,13R,WHEELER,-,-,
4,1.0,14D30,5100-5199,2010-04-08,,23.0,Murder,20R,MYRTLEWOOD,-,DR,


## create a subdataframe with the columns that we want

In [16]:
df10 = df1[['Date','Beat','BlockRange','StreetName','Offense Type','Premise','# offenses','Hour']]

In [17]:
df10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136263 entries, 0 to 136262
Data columns (total 8 columns):
Date            136262 non-null datetime64[ns]
Beat            136262 non-null object
BlockRange      136262 non-null object
StreetName      136262 non-null object
Offense Type    136262 non-null object
Premise         136262 non-null object
# offenses      136262 non-null float64
Hour            136262 non-null float64
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 8.3+ MB


In [18]:
df10.tail()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
136258,2010-09-30,12D10,9500-9599,ROWLETT,Theft,120,1.0,23.0
136259,2010-09-30,20G30,11000-11099,WESTHEIMER,Theft,120,1.0,23.0
136260,2010-09-30,18F50,6500-6599,SOUTHWEST,Theft,18C,1.0,23.0
136261,2010-09-30,20G10,2900-2999,ELMSIDE,Theft,250,1.0,16.0
136262,NaT,,,,,,,


## Now we can inspect df

In [19]:
df10.Date.unique() ## timestamp

array(['2010-04-17T00:00:00.000000000', '2010-04-08T00:00:00.000000000',
       '2010-04-01T00:00:00.000000000', '2010-04-20T00:00:00.000000000',
       '2010-04-25T00:00:00.000000000', '2010-04-21T00:00:00.000000000',
       '2010-04-11T00:00:00.000000000', '2009-02-07T00:00:00.000000000',
       '2010-04-22T00:00:00.000000000', '2010-04-18T00:00:00.000000000',
       '2010-04-30T00:00:00.000000000', '1995-11-29T00:00:00.000000000',
       '2010-04-07T00:00:00.000000000', '2010-04-05T00:00:00.000000000',
       '2010-03-16T00:00:00.000000000', '2010-02-28T00:00:00.000000000',
       '2006-08-15T00:00:00.000000000', '2010-04-26T00:00:00.000000000',
       '2009-11-01T00:00:00.000000000', '2010-04-27T00:00:00.000000000',
       '2008-04-07T00:00:00.000000000', '2010-04-06T00:00:00.000000000',
       '2010-04-12T00:00:00.000000000', '2010-04-13T00:00:00.000000000',
       '2010-04-23T00:00:00.000000000', '2010-04-29T00:00:00.000000000',
       '2010-04-24T00:00:00.000000000', '2010-04-09

In [20]:
df10.Beat.unique()  ##

array(['13D20', '11H40', '19G20', '10H60', '14D30', '17E30', '20G40',
       '18F40', '10H20', '9C20', '11H10', '19G30', '16E40', '19G10',
       '16E20', '19G50', '15E10', '13D40', '14D10', 'UNK', '9C40',
       '10H80', '5F30', '18F50', '7C30', '3B50', '12D10', '20G10',
       '14D20', '17E20', '10H40', '6B60', '3B10', '8C20', '5F20', '7C50',
       '15E30', '16E30', '12D40', '17E40', '18F30', '7C20', '8C60',
       '8C30', '11H50', '20G60', '20G70', '17E10', '3B30', '11H30',
       '2A60', '9C10', '6B10', '6B30', '6B20', '10H50', '12D20', '12D70',
       '6B50', '1A10', '20G30', '6B40', '5F10', '2A20', '13D10', '4F10',
       '9C30', '18F60', '1A30', '19G40', '1A20', '2A10', '7C10', '3B40',
       '7C40', '1A50', '24C30', '14D50', '10H10', '15E20', '10H70',
       '8C10', '20G50', '2A50', '20G80', '15E40', '24C50', '5F40',
       '12D60', '2A30', '4F20', '8C50', '4F30', '18F20', '14D40', '24C10',
       '18F10', '11H20', '10H30', '16E10', '24C40', '12D30', '20G20',
       '12D50', '

In [22]:
df10.Beat.value_counts(dropna=False)  # UNK : 229

19G10    3010
12D10    2889
6B60     2732
3B10     2702
1A20     2612
13D20    2506
18F20    2500
20G50    2466
1A30     2426
17E10    2417
18F30    2298
5F30     2291
1A10     2246
6B10     2122
5F40     2111
15E40    2104
14D20    2101
18F40    2064
20G30    2039
6B30     1999
17E40    1973
18F50    1973
1A50     1948
7C20     1943
19G40    1873
2A50     1793
20G10    1740
9C40     1700
17E30    1637
2A30     1633
         ... 
12D40     450
8C40      417
14D50     399
11H50     395
24C10     390
24C30     369
9C10      358
11H40     357
1A40      353
24C50     344
24C20     335
12D50     312
18F10     269
UNK       229
21I50     157
24C40     156
13D30     115
21I30     102
23J50     101
21I10      68
21I60      66
21I20      49
21I70      38
21I40      17
23J40       6
23J10       2
7C60        1
23J20       1
23J30       1
NaN         1
Name: Beat, Length: 123, dtype: int64

In [23]:
df10.BlockRange.unique()  #

array(['6600-6699', '10100-10199', '11400-11499', '3700-3799',
       '5100-5199', '11500-11599', '6400-6499', '2300-2399', '7900-7999',
       '8000-8099', '1100-1199', '7300-7399', '6700-6799', '7400-7499',
       '7100-7199', '16300-16399', '15800-15899', '10000-10099',
       '8100-8199', '3100-3199', '10400-10499', '8900-8999', '6300-6399',
       '12600-12699', '1900-1999', '5000-5099', '5800-5899', '5900-5999',
       '2700-2799', '1200-1299', '9800-9899', '4300-4399', '7600-7699',
       '3000-3099', '7200-7299', '1300-1399', '4200-4299', '9000-9099',
       '100-199', '14800-14899', '13300-13399', '0-99', '4800-4899',
       '2200-2299', '11800-11899', '3800-3899', '8300-8399', '7500-7599',
       '12700-12799', '6100-6199', '10200-10299', '14400-14499',
       '8800-8899', '4100-4199', '15500-15599', '9200-9299', '5700-5799',
       '6000-6099', '2000-2099', '6200-6299', '9400-9499', '13200-13299',
       '7800-7899', '5500-5599', '2400-2499', '12800-12899', '1700-1799',
    

In [24]:
df10.BlockRange.value_counts(dropna=False)  #

900-999        2079
7900-7999      1741
100-199        1734
800-899        1679
9400-9499      1669
700-799        1611
1200-1299      1518
200-299        1517
1000-1099      1499
1500-1599      1464
1300-1399      1463
2400-2499      1463
300-399        1428
500-599        1427
7500-7599      1402
1100-1199      1400
400-499        1382
9500-9599      1369
600-699        1318
2700-2799      1306
5100-5199      1292
6000-6099      1286
1400-1499      1275
5800-5899      1275
2000-2099      1272
2300-2399      1268
5900-5999      1255
5000-5099      1252
2500-2599      1245
6400-6499      1197
               ... 
24800-24899       3
23800-23899       3
24300-24399       2
22000-22099       2
21800-21899       2
25000-25099       2
21600-21699       2
21000-21099       2
19800-19899       2
24600-24699       2
20500-20599       2
20000-20099       2
21700-21799       2
20200-20299       2
83000-83099       1
25100-25199       1
99900-99999       1
96000-96099       1
21500-21599       1


In [25]:
df10.StreetName.unique()  #

array(['HEFFERNAN', 'LUCORE', 'CARVEL', ..., 'GUILFORD', 'ALPHA', nan],
      dtype=object)

In [26]:
df10.StreetName.value_counts(dropna=False)  # 

WESTHEIMER            3716
GESSNER               2226
GULF                  1931
RICHMOND              1683
SOUTHWEST             1515
NORTH                 1457
KATY                  1421
WEST SAM HOUSTON      1403
NORTHWEST             1291
BISSONNET             1272
FM 1960               1260
MAIN                  1225
BELLFORT              1090
BELLAIRE              1088
GREENS                1007
POST OAK              1005
FONDREN                994
LITTLE YORK            975
BEECHNUT               912
TIDWELL                862
BROADWAY               792
AIRLINE                764
SHEPHERD               713
KIRBY                  701
EAST                   691
HILLCROFT              676
ALABAMA                645
ANTOINE                628
TELEPHONE              622
FANNIN                 617
                      ... 
BAYOU RIVER              1
NAMORA                   1
HOLT                     1
ADOLPH                   1
POST OAK GREEN           1
MAPLECREEK               1
T

In [27]:
df10['Offense Type'].unique() 

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft', nan], dtype=object)

In [28]:
df10['Offense Type'].value_counts(dropna=False) # good, that its clean not that there were 73591 thefts!! 

Theft                 74581
Burglary              27924
Auto Theft            12424
Aggravated Assault    10915
Robbery                9449
Rape                    709
Murder                  260
NaN                       1
Name: Offense Type, dtype: int64

In [29]:
df10.Premise.unique()  # lots of weird values

array(['05W', '20R', '13R', '20A', '18A', '100', '05E', '20D', '18T',
       '250', '040', '13A', '20P', '22M', '140', '070', '14V', '05V',
       '18R', '18C', '18G', '18O', '18M', '18S', '13B', '05D', '22E',
       '240', '18U', '13S', '13H', '210', '120', '080', '24A', '24J',
       '24E', '03B', '05Q', '05B', '05Z', '18N', '05L', '05C', '24F',
       '09D', '24P', '23S', '18P', '18H', '01K', '20G', '05P', '05F',
       '02B', '09P', '05R', '05O', '22P', '11R', '170', '20M', '20L',
       '05M', '18B', '09H', '20V', '22D', '18L', '05X', '13T', '20W',
       '03S', '18D', '20C', '150', '18W', '22H', '20N', '22C', '05Y',
       '20H', '25V', '05A', '190', '24G', '24V', '24C', '05G', '12V',
       '09R', '21V', '060', '19V', '05H', '05U', '11S', '05N', '11V',
       '24S', '11G', '160', '05T', '01T', '01A', '05S', '01B', '22U',
       '24B', '---', '24T', '11L', '02C', '11P', '24M', '02S', '11F',
       '09V', '22V', '01P', '25R', '11C', '04V', '23C', '01R', 'RES',
       '13C', '02V',

In [30]:
df10.Premise.value_counts(dropna=False)  # srip extra spaces

20A    17848
20R    17426
18A    15614
13R    10029
20D     9793
080     6821
18O     6263
250     4690
18R     4009
18P     3405
120     2680
18T     2469
18M     2305
210     2077
070     2041
18N     1542
18G     1461
03B     1408
23S     1242
05C     1095
18U     1084
18C     1069
05Z      946
20G      939
140      916
05O      870
240      828
20C      772
20V      766
20P      708
       ...  
24A       29
20H       29
24G       28
05U       26
24B       25
24T       25
11C       24
21V       23
150       23
02S       17
02C       16
05Y       15
160       14
04V       11
14V       10
01K        9
11V        8
13T        8
01T        7
12V        6
22V        5
09V        5
25R        5
24M        4
23C        3
02V        1
13C        1
RES        1
01R        1
NaN        1
Name: Premise, Length: 129, dtype: int64

In [31]:
df10['# offenses'].unique()

array([ 1.,  2.,  3.,  4.,  6., 16.,  5.,  9., nan])

In [33]:
df10['# offenses'].value_counts(dropna=False)  #

 1.0     134987
 2.0       1046
 3.0        212
 4.0          9
 5.0          3
 6.0          2
 9.0          2
NaN           1
 16.0         1
Name: # offenses, dtype: int64

In [34]:
df10.Hour.unique()

array([ 0., 20., 22.,  1., 23., 12.,  8., 16., 17., 14.,  2., 19.,  6.,
        7.,  5., 18.,  9.,  4., 11., 13., 21., 10.,  3., 15., nan])

In [35]:
df10.Hour.value_counts(dropna=False)

 18.0    8492
 0.0     8491
 22.0    8097
 19.0    7870
 17.0    7707
 20.0    7662
 12.0    7467
 21.0    7288
 15.0    6568
 23.0    6480
 16.0    6447
 14.0    6065
 13.0    5819
 8.0     5496
 11.0    5164
 10.0    5138
 9.0     4672
 7.0     4596
 1.0     3896
 2.0     3332
 6.0     2985
 3.0     2636
 5.0     1994
 4.0     1900
NaN         1
Name: Hour, dtype: int64

# Cleanup
## Premise Column

- strip empty spaces :not needed

In [36]:
len(df10.Premise.unique())

129

In [37]:
df10['Premise'] = df10['Premise'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
len(df10.Premise.unique())

129

In [39]:
df10.Premise.value_counts(dropna=False)

20A    17848
20R    17426
18A    15614
13R    10029
20D     9793
080     6821
18O     6263
250     4690
18R     4009
18P     3405
120     2680
18T     2469
18M     2305
210     2077
070     2041
18N     1542
18G     1461
03B     1408
23S     1242
05C     1095
18U     1084
18C     1069
05Z      946
20G      939
140      916
05O      870
240      828
20C      772
20V      766
20P      708
       ...  
24A       29
20H       29
24G       28
05U       26
24B       25
24T       25
11C       24
21V       23
150       23
02S       17
02C       16
05Y       15
160       14
04V       11
14V       10
01K        9
11V        8
13T        8
01T        7
12V        6
22V        5
09V        5
25R        5
24M        4
23C        3
02V        1
13C        1
RES        1
01R        1
NaN        1
Name: Premise, Length: 129, dtype: int64

# Cleanup
## Offense Type Column

preatty clean!


In [40]:
df10['Offense Type'].value_counts(dropna=False)

Theft                 74581
Burglary              27924
Auto Theft            12424
Aggravated Assault    10915
Robbery                9449
Rape                    709
Murder                  260
NaN                       1
Name: Offense Type, dtype: int64

In [41]:
df10['Offense Type'].unique()

array(['Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary',
       'Auto Theft', 'Theft', nan], dtype=object)

In [42]:
len(df10['Offense Type'].unique())

8

In [44]:
df10['Offense Type'] = df10['Offense Type'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [45]:
df10['Offense Type'].value_counts(dropna=False)

Theft                 74581
Burglary              27924
Auto Theft            12424
Aggravated Assault    10915
Robbery                9449
Rape                    709
Murder                  260
NaN                       1
Name: Offense Type, dtype: int64

In [46]:
len(df10['Offense Type'].unique())

8

# Cleanup
## StreetName Column

preatty clean

In [47]:
df10.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3716
GESSNER       2226
GULF          1931
RICHMOND      1683
SOUTHWEST     1515
Name: StreetName, dtype: int64

In [48]:
len(df10.StreetName.unique())

7833

In [49]:
df10['StreetName'] = df10['StreetName'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [50]:
len(df10.StreetName.unique())

7833

In [52]:
df10[df10.StreetName.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
136262,NaT,,,,,,,


In [53]:
df10.StreetName.value_counts(dropna=False).head()

WESTHEIMER    3716
GESSNER       2226
GULF          1931
RICHMOND      1683
SOUTHWEST     1515
Name: StreetName, dtype: int64

# Cleanup
## BlockRange Column

- create mask to find 'UNK' values
- match with similar beat value (Needs to be done)

In [54]:
df10.BlockRange.value_counts(dropna=False).head()  # find UNK

900-999      2079
7900-7999    1741
100-199      1734
800-899      1679
9400-9499    1669
Name: BlockRange, dtype: int64

In [55]:
unk = df10.BlockRange == 'UNK'  # boolean mask

In [56]:
df10[unk]  # 0 rows

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour


In [57]:
df10[df10.BlockRange.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
136262,NaT,,,,,,,


# Cleanup
## Beat Column

preatty clean

In [58]:
df10.Beat.unique()

array(['13D20', '11H40', '19G20', '10H60', '14D30', '17E30', '20G40',
       '18F40', '10H20', '9C20', '11H10', '19G30', '16E40', '19G10',
       '16E20', '19G50', '15E10', '13D40', '14D10', 'UNK', '9C40',
       '10H80', '5F30', '18F50', '7C30', '3B50', '12D10', '20G10',
       '14D20', '17E20', '10H40', '6B60', '3B10', '8C20', '5F20', '7C50',
       '15E30', '16E30', '12D40', '17E40', '18F30', '7C20', '8C60',
       '8C30', '11H50', '20G60', '20G70', '17E10', '3B30', '11H30',
       '2A60', '9C10', '6B10', '6B30', '6B20', '10H50', '12D20', '12D70',
       '6B50', '1A10', '20G30', '6B40', '5F10', '2A20', '13D10', '4F10',
       '9C30', '18F60', '1A30', '19G40', '1A20', '2A10', '7C10', '3B40',
       '7C40', '1A50', '24C30', '14D50', '10H10', '15E20', '10H70',
       '8C10', '20G50', '2A50', '20G80', '15E40', '24C50', '5F40',
       '12D60', '2A30', '4F20', '8C50', '4F30', '18F20', '14D40', '24C10',
       '18F10', '11H20', '10H30', '16E10', '24C40', '12D30', '20G20',
       '12D50', '

In [59]:
len(df10.Beat.unique())

123

In [60]:
df10['Beat'] = df10['Beat'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
len(df10.Beat.unique())

123

In [62]:
df10[df10.Beat.isnull()].head()

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
136262,NaT,,,,,,,


In [63]:
df10.Beat.value_counts(dropna=False).head()

19G10    3010
12D10    2889
6B60     2732
3B10     2702
1A20     2612
Name: Beat, dtype: int64

In [64]:
df10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136263 entries, 0 to 136262
Data columns (total 8 columns):
Date            136262 non-null datetime64[ns]
Beat            136262 non-null object
BlockRange      136262 non-null object
StreetName      136262 non-null object
Offense Type    136262 non-null object
Premise         136262 non-null object
# offenses      136262 non-null float64
Hour            136262 non-null float64
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 8.3+ MB


## Cleanup

### Date column
- convert to datetime
- index date colimn
- sort index

In [65]:
df10.head(5)

Unnamed: 0,Date,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
0,2010-04-17,13D20,6600-6699,HEFFERNAN,Murder,05W,1.0,0.0
1,2010-04-08,11H40,10100-10199,LUCORE,Murder,20R,1.0,20.0
2,2010-04-01,19G20,11400-11499,CARVEL,Murder,13R,2.0,22.0
3,2010-04-17,10H60,3700-3799,WHEELER,Murder,13R,1.0,1.0
4,2010-04-08,14D30,5100-5199,MYRTLEWOOD,Murder,20R,1.0,23.0


In [67]:
df10['Date'] = pd.to_datetime(df10['Date'])

df10 = df10.set_index('Date').sort_index(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
df10.head(5)

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1959-06-06,18F20,5100-5199,RICHMOND,Theft,080,1.0,14.0
1966-01-01,10H50,3300-3399,ALABAMA,Rape,20A,1.0,0.0
1966-07-18,13D40,10200-10299,TELEPHONE,Auto Theft,18A,1.0,13.0
1969-07-22,2A30,1500-1599,NICHOLSON,Theft,20R,1.0,0.0
1969-11-26,3B50,7100-7199,IRVINGTON,Theft,13R,1.0,15.0


In [69]:
df10.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 136263 entries, 1959-06-06 to NaT
Data columns (total 7 columns):
Beat            136262 non-null object
BlockRange      136262 non-null object
StreetName      136262 non-null object
Offense Type    136262 non-null object
Premise         136262 non-null object
# offenses      136262 non-null float64
Hour            136262 non-null float64
dtypes: float64(2), object(5)
memory usage: 8.3+ MB


## Odd dates
`DatetimeIndex: 136263 entries, 1959-06-06 to NaT
`
- some values are not from this year, lets look

In [70]:
df2010 = df10.loc['2010-01-01':'2010-12-31']  # rows with date from 01,01,10 - 12,31,10

In [71]:
df2010_wrong_date = df10[:"2009"]  # rows with year 0  upto 2010
df2010_wrong_date.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1717 entries, 1959-06-06 to 2009-12-31
Data columns (total 7 columns):
Beat            1717 non-null object
BlockRange      1717 non-null object
StreetName      1717 non-null object
Offense Type    1717 non-null object
Premise         1717 non-null object
# offenses      1717 non-null float64
Hour            1717 non-null float64
dtypes: float64(2), object(5)
memory usage: 107.3+ KB


In [72]:
df2010.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 134543 entries, 2010-01-01 to 2010-12-31
Data columns (total 7 columns):
Beat            134543 non-null object
BlockRange      134543 non-null object
StreetName      134543 non-null object
Offense Type    134543 non-null object
Premise         134543 non-null object
# offenses      134543 non-null float64
Hour            134543 non-null float64
dtypes: float64(2), object(5)
memory usage: 8.2+ MB


# NAN values

- Beat: 0
- BlockRange: 0
- StreetName: 0
- Offense Type: 0
- Premise: 0
- Hour: 0

In [74]:
beat_nan = df2010.Beat.isnull()
block_nan = df2010.BlockRange.isnull()
str_nan = df2010.StreetName.isnull()
off_nan = df2010['Offense Type'].isnull()
premise_nan = df2010.Premise.isnull()  #
hour_nan = df2010.Hour.isnull()  #

In [76]:
df2010[beat_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [77]:
df2010[block_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [78]:
df2010[str_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [79]:
df2010[off_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [75]:
df2010[hour_nan]

Unnamed: 0_level_0,Beat,BlockRange,StreetName,Offense Type,Premise,# offenses,Hour
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


## Save clean data to  to csv

In [80]:
df2010.to_csv('clean_data/crime_data/crime10_clean.csv')

In [81]:
ls clean_data/crime_data/

[0m[01;32mcrime10_clean.csv[0m*  [01;32mcrime12_clean.csv[0m*  [01;32mcrime14_clean.csv[0m*  [01;32mcrime16_clean.csv[0m*
[01;32mcrime11_clean.csv[0m*  [01;32mcrime13_clean.csv[0m*  [01;32mcrime15_clean.csv[0m*  [01;32mcrime17_clean.csv[0m*


## DROP nan
drop nan values of StreetName

In [None]:
df2017.head()

In [None]:
df2017 = df2017.dropna(subset=['StreetName'])  # drop nan values from StreetName, 8 rows

In [None]:
df2017.info()

In [None]:
df2017['Premise'] = df2017['Premise'].fillna('unk')

In [None]:
df2017.info()

In [None]:
df2017['Offense Type'].value_counts(dropna=False)

## Fillna
will use this since Dont know what to do... :/

In [None]:
df2017['Offense Type'].fillna(method='ffill', inplace=True)

In [None]:
df2017.info()

## Save clean data to  to csv

In [None]:
df2017.to_csv('clean_data/crime_data/crime17_clean.csv')

In [None]:
ls clean_data/crime_data/