In [1]:
import warnings
# Silence every warning raised from here on.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning
# in the cells below — confirm that suppressing them is intended.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns
import re

In [3]:
# Name of the sub-folder under ../data/clean_data that holds the input CSV
data_folder = 'merge_data'

In [4]:
# Input CSV and output folder, both under ../data/clean_data/<data_folder>.
# Every path component is handed to os.path.join separately so no separator
# is hard-coded, and the save folder reuses `data_folder` instead of repeating
# the literal folder name.
clean_data_root = os.path.join('..', 'data', 'clean_data', data_folder)
data_directory = os.path.join(clean_data_root, 'crime_beats_02.csv')
# Joining with '' keeps the trailing separator the original save path had.
data_directory_saves = os.path.join(clean_data_root, '')

In [5]:
# Load the CSV at data_directory (crime_beats_02.csv) into a DataFrame
df = pd.read_csv(data_directory)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85578 entries, 0 to 85577
Data columns (total 10 columns):
Unnamed: 0      85578 non-null int64
date            85578 non-null object
hour            85578 non-null int64
beat            85578 non-null object
offense_type    85578 non-null object
block_range     85578 non-null object
street_name     85578 non-null object
premise         85578 non-null object
num_offenses    85578 non-null int64
type            85578 non-null object
dtypes: int64(3), object(7)
memory usage: 6.5+ MB


In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,0,1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,house,1,-
1,1,1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,other_parking,1,ST
2,2,1963-02-02,13,10H40,Theft,3800-3899,MAIN,rehab_center,1,ST
3,3,1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,apartment,1,ST
4,4,1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,house,1,-


# Combine `street_name` and `type` (the street type) into `full_street_name`


In [8]:
def combine_stname_sttype(row):
    '''Return the street name with its street type appended when needed.

    row: one record (e.g. the pandas Series passed by
        ``df.apply(..., axis=1)``) providing
        ``street_name`` - e.g. 'LIBERTY ROAD', 'ARDMORE'
        ``type``        - e.g. 'RD', 'ST', 'BLVD', or '-' when absent

    return: ``street_name`` unchanged when the type is '-' or when the
        last word of the name already is that type; otherwise
        '<street_name> <type>'.
    '''
    street_name = row['street_name']
    street_type = row['type']  # RD, BLVD, -, ..
    if street_type == '-':
        return street_name  # no type recorded, nothing to append

    last_word = street_name.split(' ')[-1]  # looking for RD, ST, BLVD..
    # Exact comparison: a substring test would treat a name ending in 'S'
    # (or in an empty token) as if it already carried the type 'ST'.
    if last_word == street_type:
        return street_name  # type already present

    # Append to the whole name, not just its last word, so multi-word
    # names such as 'LIVE OAK' keep every word.
    return street_name + ' ' + street_type

In [9]:
# Row-wise (axis=1) apply: one combined street name per record
df['full_street_name'] = df.apply(combine_stname_sttype, axis=1)

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type,full_street_name
0,0,1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,house,1,-,LIBERTY ROAD
1,1,1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,other_parking,1,ST,ARDMORE ST
2,2,1963-02-02,13,10H40,Theft,3800-3899,MAIN,rehab_center,1,ST,MAIN ST
3,3,1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,apartment,1,ST,ALABAMA ST
4,4,1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,house,1,-,TRAVIS


## combine block_range with full_street_name


In [11]:
def full_street_address(row):
    '''Build a street address from a block range and a street name.

    The ``block_range`` text (e.g. '7500-7599') is split on '-', the
    median of its two bounds is rounded up to a whole number, and that
    number is placed in front of ``full_street_name``.

    return: full street address, e.g. '7550 ARDMORE ST'
    '''
    bounds = row.block_range.split('-')
    lower = int(bounds[0])
    upper = int(bounds[1])
    # median of two values is their midpoint; ceil pushes x.5 up
    house_number = int(np.ceil(np.median([upper, lower])))
    return ' '.join([str(house_number), row['full_street_name']])

In [12]:
# One block_range value is written in scientific notation, which int() in
# full_street_address() cannot parse; swap it for a parseable range.
# NOTE(review): '1001-1010' does not match the magnitude of the original
# value — confirm the replacement is the intended one.
block_range_fixes = {'1.1103e+006-1.1104e+006': '1001-1010'}
df = df.replace({'block_range': block_range_fixes})

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type,full_street_name
0,0,1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,house,1,-,LIBERTY ROAD
1,1,1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,other_parking,1,ST,ARDMORE ST
2,2,1963-02-02,13,10H40,Theft,3800-3899,MAIN,rehab_center,1,ST,MAIN ST
3,3,1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,apartment,1,ST,ALABAMA ST
4,4,1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,house,1,-,TRAVIS


In [14]:
# Boolean mask: True for rows whose block_range is the literal 'UNK'
rd = df.block_range == 'UNK'

In [15]:
# Keep only the rows not flagged by `rd` (block_range other than 'UNK').
# .copy() makes the result an independent frame, so adding columns to it
# afterwards is not an assignment on a slice of the original
# (the situation pandas reports as SettingWithCopyWarning).
df = df[~rd].copy()

In [16]:
%%time
# Row-wise (axis=1) apply: '<house number> <street name>' for every row;
# timed because it calls a Python function once per row.
df['full_street_address'] = df.apply(full_street_address, axis=1)

CPU times: user 5.17 s, sys: 109 ms, total: 5.28 s
Wall time: 5.33 s


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84528 entries, 1 to 85577
Data columns (total 12 columns):
Unnamed: 0             84528 non-null int64
date                   84528 non-null object
hour                   84528 non-null int64
beat                   84528 non-null object
offense_type           84528 non-null object
block_range            84528 non-null object
street_name            84528 non-null object
premise                84528 non-null object
num_offenses           84528 non-null int64
type                   84528 non-null object
full_street_name       84528 non-null object
full_street_address    84528 non-null object
dtypes: int64(3), object(9)
memory usage: 8.4+ MB


In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type,full_street_name,full_street_address
1,1,1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,other_parking,1,ST,ARDMORE ST,7550 ARDMORE ST
2,2,1963-02-02,13,10H40,Theft,3800-3899,MAIN,rehab_center,1,ST,MAIN ST,3850 MAIN ST
3,3,1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,apartment,1,ST,ALABAMA ST,3350 ALABAMA ST
4,4,1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,house,1,-,TRAVIS,1250 TRAVIS
5,5,1977-10-14,15,10H50,Theft,3200-3299,TRUXILLO,apartment_parking,1,-,TRUXILLO,3250 TRUXILLO


In [19]:
# Drop the 'Unnamed: 0' column and the address parts that are now folded
# into full_street_address.
obsolete_columns = ['Unnamed: 0', 'type', 'full_street_name', 'block_range', 'street_name']
df = df.drop(obsolete_columns, axis=1)

In [20]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,premise,num_offenses,full_street_address
1,1917-02-20,16,10H70,Theft,other_parking,1,7550 ARDMORE ST
2,1963-02-02,13,10H40,Theft,rehab_center,1,3850 MAIN ST
3,1966-01-01,0,10H50,Rape,apartment,1,3350 ALABAMA ST
4,1971-02-03,6,1A10,Theft,house,1,1250 TRAVIS
5,1977-10-14,15,10H50,Theft,apartment_parking,1,3250 TRUXILLO


## Select only records from 2010 through 2017

In [21]:
# Parse the date strings, index the frame by date, and order it chronologically
df = (
    df.assign(date=pd.to_datetime(df['date']))
      .set_index('date')
      .sort_index()
)

In [22]:
df.head()

Unnamed: 0_level_0,hour,beat,offense_type,premise,num_offenses,full_street_address
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1917-02-20,16,10H70,Theft,other_parking,1,7550 ARDMORE ST
1963-02-02,13,10H40,Theft,rehab_center,1,3850 MAIN ST
1966-01-01,0,10H50,Rape,apartment,1,3350 ALABAMA ST
1971-02-03,6,1A10,Theft,house,1,1250 TRAVIS
1977-10-14,15,10H50,Theft,apartment_parking,1,3250 TRUXILLO


In [23]:
# Label-based slice on the DatetimeIndex: keeps 2010-01-01 through 2017-12-31
# (both end years are included).
df = df.loc['2010':'2017']

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 84211 entries, 2010-01-01 to 2017-12-31
Data columns (total 6 columns):
hour                   84211 non-null int64
beat                   84211 non-null object
offense_type           84211 non-null object
premise                84211 non-null object
num_offenses           84211 non-null int64
full_street_address    84211 non-null object
dtypes: int64(2), object(4)
memory usage: 4.5+ MB


In [25]:
df.head()

Unnamed: 0_level_0,hour,beat,offense_type,premise,num_offenses,full_street_address
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,21,10H50,Aggravated Assault,multiplex_home,1,3450 ROSALIE
2010-01-01,14,10H60,Theft,apartment,1,5550 LIVE OAK
2010-01-01,23,1A10,Burglary,"theatres,dinner theaters,auditor.",1,650 TEXAS ST
2010-01-01,6,1A10,Auto Theft,auto_repair,1,1250 TRAVIS ST
2010-01-01,6,10H60,Theft,apartment,1,3750 ODIN CT
