In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings about chained / in-place
# assignment raised by later cells -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

## Data directory

In [3]:
# Name of the sub-folder under ../data/clean_data whose CSV files are merged below.
data_folder = 'crime'

In [4]:
# Input folder: ../data/clean_data/<data_folder>.
# Every path component is passed separately so os.path.join picks the separator
# instead of relying on a hard-coded '/' inside one of the pieces.
data_directory = os.path.join('..', 'data', 'clean_data', data_folder)
# Output folder: ../data/clean_data/merge_data/.
# The empty last component keeps the trailing separator, because the save cell
# builds the file name by appending directly to this string.
data_directory_saves = os.path.join('..', 'data', 'clean_data', 'merge_data', '')

In [5]:
# Combine every CSV in the input folder into one DataFrame.
# glob returns paths in arbitrary order, so sort them: the concatenated row order
# (and therefore the integer labels produced by ignore_index=True) stays the same
# from run to run.
all_files = sorted(glob.glob(os.path.join(data_directory, "*.csv")))
if not all_files:
    # pd.concat on an empty iterable fails with an unhelpful message; fail early instead.
    raise FileNotFoundError('No CSV files found in {}'.format(data_directory))
df_from_each_file = (pd.read_csv(f) for f in all_files)
df = pd.concat(df_from_each_file, ignore_index=True)
# 'Unnamed: 0' is presumably the index saved by an earlier to_csv -- TODO confirm.
# Rebind instead of dropping in place so re-running the cell is safe.
df = df.drop(columns=['Unnamed: 0'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006231 entries, 0 to 1006230
Data columns (total 9 columns):
date            1006231 non-null object
hour            1006231 non-null int64
beat            1006231 non-null object
offense_type    1006231 non-null object
block_range     1006231 non-null object
street_name     1006229 non-null object
premise         1006231 non-null object
num_offenses    1006231 non-null int64
type            1006231 non-null object
dtypes: int64(2), object(7)
memory usage: 69.1+ MB


In [7]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,2010-04-17,0,13D20,Murder,6600-6699,HEFFERNAN,WAREHOUSE,1,-
1,2010-04-01,0,14D20,Burglary,3400-3499,CORDER,WAREHOUSE,1,ST
2,2010-04-08,23,6B40,Burglary,10200-10299,NORTH,WAREHOUSE,1,FWY
3,2010-04-23,19,13D10,Burglary,6100-6199,SOUTH LP E,WAREHOUSE,1,SER
4,2010-04-06,17,3B10,Burglary,5900-5999,CENTRALCREST,WAREHOUSE,1,ST


In [8]:
# Boolean mask marking the rows whose date field is the literal string 'UNK'.
unk_date = df['date'].eq('UNK')

In [9]:
df[unk_date]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
593244,UNK,15,13D40,Theft,8500-8599,SAM HOUSTON,UNK,1,PKWY
595358,UNK,14,18F40,Robbery,2700-2799,DUNVALE,UNK,1,RD
603375,UNK,8,14D40,Burglary,5600-5699,SELINSKY,UNK,1,RD
608019,UNK,11,7C20,Theft,3900-3999,CAVALCADE,UNK,1,-


In [10]:
# NOTE(review): 719933 is a hard-coded row label. The frame was built with
# ignore_index=True, so labels are positions in the concatenated result and
# depend on the order in which glob returned the files -- confirm that this is
# still the row that was meant to be inspected.
df.loc[719933]

date                        2015-03-08
hour                                 7
beat                             20G30
offense_type                     Theft
block_range                10400-10499
street_name                   RICHMOND
premise         Service or Gas Station
num_offenses                         1
type                               AVE
Name: 719933, dtype: object

## drop UNK dates

In [11]:
# Keep only the rows with a usable date. Take an explicit copy: later cells
# assign into columns of this frame, and writing into a filtered view is what
# triggers pandas' SettingWithCopyWarning.
df = df.loc[df['date'] != 'UNK'].copy()

## set date as datetime index

In [12]:
%%time
df.date = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index(ascending=True)

CPU times: user 266 ms, sys: 172 ms, total: 438 ms
Wall time: 481 ms


In [13]:
df.head()

Unnamed: 0_level_0,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,Restaurant or Cafeteria,1,PKWY
1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,Miscellaneous Business (Non-Specific),1,BLVD
1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,UNK,1,FWY
1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,Other Parking Lot,1,-
1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,Apartment Parking Lot,1,DR


## Display null values

In [14]:
# Count missing values per column. The vectorised isnull().sum() gives the same
# per-column integer counts as applying Python's sum() to each column, without
# iterating over roughly a million booleans in the interpreter.
df.isnull().sum()

hour            0
beat            0
offense_type    0
block_range     0
street_name     2
premise         0
num_offenses    0
type            0
dtype: int64

## Inspect beat codes

In [15]:
# Number of distinct beat codes before cleaning (missing values, if any, counted too).
df['beat'].nunique(dropna=False)

238

In [16]:
df.beat.unique()

array(['24C60', '18F60', '12D20', '3B10', '5F10', '1A20', '11H10',
       '19G20', '9C40', '9C30', '6B30', '10H70', '4F20', '18F20', '20G30',
       '19G40', '20G20', '17E20', '9C20', '19G10', '14D20', '8C20',
       '8C30', '17E40', '6B50', '12D70', 'UNK', '1A40', '7C10', '1A50',
       '10H40', '13D20', '18F10', '6B60', '10H50', '13D40', '2A30',
       '3B50', '19G50', '14D50', '1A10', '15E30', '12D50', '14D30',
       '8C60', '1A30', '18F40', '4F30', '20G50', '7C20', '3B30', '12D10',
       '15E10', '18F50', '14D10', '11H20', '5F30', '15E20', '23J50',
       '17E10', '6B10', '19G30', '18F30', '20G10', '2A40', '24C40',
       '4F10', '14D40', '10H60', '3B40', '10H80', '5F20', '16E40',
       '12D60', '6B40', '7C40', '13D30', '20G80', '10H20', '7C50',
       '15E40', '2A50', '8C50', '24C10', '20G40', '13D10', '20G70',
       '2A20', '7C30', '16E10', '12D30', '9C10', '16E30', '6B20', '2A10',
       '8C10', '21I20', '11H30', '24C20', '17E30', '10H30', '5F40',
       '2A60', '21I10', '10

## Remove non-word characters from beat codes

In [17]:
# Strip every character matched by the regex \W (non-word characters) from the
# beat codes so that variants of the same code collapse into one value.
# The result is assigned back to the column explicitly: calling replace(...,
# inplace=True) on an attribute-accessed column is a chained in-place mutation
# that pandas does not guarantee to write back to df (it is a no-op under
# Copy-on-Write).
df['beat'] = df['beat'].replace(to_replace=r'\W', value='', regex=True)

In [18]:
# Number of distinct beat codes after cleaning (missing values, if any, counted too).
df['beat'].nunique(dropna=False)

127

In [19]:
df.beat.unique()

array(['24C60', '18F60', '12D20', '3B10', '5F10', '1A20', '11H10',
       '19G20', '9C40', '9C30', '6B30', '10H70', '4F20', '18F20', '20G30',
       '19G40', '20G20', '17E20', '9C20', '19G10', '14D20', '8C20',
       '8C30', '17E40', '6B50', '12D70', 'UNK', '1A40', '7C10', '1A50',
       '10H40', '13D20', '18F10', '6B60', '10H50', '13D40', '2A30',
       '3B50', '19G50', '14D50', '1A10', '15E30', '12D50', '14D30',
       '8C60', '1A30', '18F40', '4F30', '20G50', '7C20', '3B30', '12D10',
       '15E10', '18F50', '14D10', '11H20', '5F30', '15E20', '23J50',
       '17E10', '6B10', '19G30', '18F30', '20G10', '2A40', '24C40',
       '4F10', '14D40', '10H60', '3B40', '10H80', '5F20', '16E40',
       '12D60', '6B40', '7C40', '13D30', '20G80', '10H20', '7C50',
       '15E40', '2A50', '8C50', '24C10', '20G40', '13D10', '20G70',
       '2A20', '7C30', '16E10', '12D30', '9C10', '16E30', '6B20', '2A10',
       '8C10', '21I20', '11H30', '24C20', '17E30', '10H30', '5F40',
       '2A60', '21I10', '10

## display null values

In [20]:
# Re-check missing values per column after the beat clean-up. The vectorised
# isnull().sum() returns the same per-column counts as applying Python's sum()
# column by column, but without the interpreter-level loop.
df.isnull().sum()

hour            0
beat            0
offense_type    0
block_range     0
street_name     2
premise         0
num_offenses    0
type            0
dtype: int64

## Inspect columns

In [21]:
df.columns

Index(['hour', 'beat', 'offense_type', 'block_range', 'street_name', 'premise',
       'num_offenses', 'type'],
      dtype='object')

In [22]:
# NOTE(review): the saved output lists 24 in addition to 0-23; this notebook does
# not remap or drop it -- confirm how an hour of 24 should be interpreted.
df.hour.unique()

array([ 7,  3, 19, 22, 23, 10, 13, 16, 15, 17, 12,  9, 14, 11, 18,  5,  8,
        6,  0, 20,  2, 21,  1,  4, 24])

In [23]:
df.beat.unique()

array(['24C60', '18F60', '12D20', '3B10', '5F10', '1A20', '11H10',
       '19G20', '9C40', '9C30', '6B30', '10H70', '4F20', '18F20', '20G30',
       '19G40', '20G20', '17E20', '9C20', '19G10', '14D20', '8C20',
       '8C30', '17E40', '6B50', '12D70', 'UNK', '1A40', '7C10', '1A50',
       '10H40', '13D20', '18F10', '6B60', '10H50', '13D40', '2A30',
       '3B50', '19G50', '14D50', '1A10', '15E30', '12D50', '14D30',
       '8C60', '1A30', '18F40', '4F30', '20G50', '7C20', '3B30', '12D10',
       '15E10', '18F50', '14D10', '11H20', '5F30', '15E20', '23J50',
       '17E10', '6B10', '19G30', '18F30', '20G10', '2A40', '24C40',
       '4F10', '14D40', '10H60', '3B40', '10H80', '5F20', '16E40',
       '12D60', '6B40', '7C40', '13D30', '20G80', '10H20', '7C50',
       '15E40', '2A50', '8C50', '24C10', '20G40', '13D10', '20G70',
       '2A20', '7C30', '16E10', '12D30', '9C10', '16E30', '6B20', '2A10',
       '8C10', '21I20', '11H30', '24C20', '17E30', '10H30', '5F40',
       '2A60', '21I10', '10

In [24]:
# NOTE(review): the saved output contains a stray '1' category next to the seven
# named offenses, and it is written out unchanged by this notebook -- confirm
# whether those rows should be corrected or removed.
df.offense_type.unique()

array(['Burglary', 'Auto Theft', 'Theft', 'Rape', 'Aggravated Assault',
       'Robbery', 'Murder', '1'], dtype=object)

In [25]:
df.num_offenses.unique()

array([ 1,  2,  4,  3, 16,  5,  9,  6, 13,  7,  8, 11, 29, 12, 17, 33, 10])

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1006227 entries, 1914-09-08 to 2033-04-21
Data columns (total 8 columns):
hour            1006227 non-null int64
beat            1006227 non-null object
offense_type    1006227 non-null object
block_range     1006227 non-null object
street_name     1006225 non-null object
premise         1006227 non-null object
num_offenses    1006227 non-null int64
type            1006227 non-null object
dtypes: int64(2), object(6)
memory usage: 69.1+ MB


## save

In [27]:
# Write the cleaned frame (date index included) to the merge_data folder.
# Create the folder first so a fresh checkout does not fail on a missing directory,
# and build the file path with os.path.join rather than string concatenation so it
# does not matter whether data_directory_saves ends with a separator.
os.makedirs(data_directory_saves, exist_ok=True)
df.to_csv(os.path.join(data_directory_saves, 'crime_clean_01.csv'))