In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

## Data directory

In [3]:
data_folder = 'merge_data'

In [4]:
# data folder path
data_directory = os.path.join('..','data','clean_data/{}/crime_clean_02.csv'.format(data_folder))
data_directory_saves = os.path.join( '..','data','clean_data','merge_data/')

In [5]:
df = pd.read_csv(data_directory)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006227 entries, 0 to 1006226
Data columns (total 9 columns):
date            1006227 non-null object
hour            1006227 non-null int64
beat            1006227 non-null object
offense_type    1006227 non-null object
block_range     1006227 non-null object
street_name     1006225 non-null object
premise         1006227 non-null object
num_offenses    1006227 non-null int64
type            1006227 non-null object
dtypes: int64(2), object(7)
memory usage: 69.1+ MB


In [7]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,restaurant,1,PKWY
1,1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,business,1,BLVD
2,1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,unknown,1,FWY
3,1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,other_parking,1,-
4,1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,apartment_parking,1,DR


In [8]:
df.apply(lambda x: sum(x.isnull()))

date            0
hour            0
beat            0
offense_type    0
block_range     0
street_name     2
premise         0
num_offenses    0
type            0
dtype: int64

In [9]:
df[df.isnull().any(axis=1)]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
928138,2017-05-04,22,8C10,Theft,9400-9499,,gas_station,1,-
931210,2017-05-13,9,20G10,Aggravated Assault,10300-10399,,restaurant,1,-


## replace null streets with string `unknown street`

In [10]:
df.street_name.fillna('unknown street',inplace=True)

In [11]:
df[df.isnull().any(axis=1)]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type


## display unk block_rage

In [12]:
unk_br = df.block_range == 'UNK'

In [13]:
len(df[unk_br])

5467

In [14]:
ukdf = df[unk_br]
ukdf.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
11,1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,house,1,-
95,1986-10-29,21,2A40,Rape,UNK,RIESNER,house,1,-
249,2001-03-07,15,16E30,Theft,UNK,DARLINGHURST,house,1,DR
399,2003-01-24,15,UNK,Murder,UNK,00705 32ND 1,house,1,-
415,2003-05-18,6,18F40,Theft,UNK,WESTHEIMER RD,apartment,1,-


## will just pass street name  and not drop values 

# combine `street_name` and `street_type` for full address


In [15]:
def combine_stname_sttype(row):
    '''combines street_name and street_type
    checks if it contains the value
    otherwise it appends it'''
    stname = row.street_name.split(' ')[-1] # last item, looking for RD,ST,BLVD..
    sttype =row.type  # RD, BLVD,-,..
    if row.type == '-':
            return row.street_name  # unchanged
    elif stname in sttype:
        return row.street_name  #  unchanged
    else:
        return stname + ' ' + sttype  # street_name + street_type

In [16]:
%%time
df['full_street_name'] = df.apply(combine_stname_sttype, axis=1)  # 25 seconds

CPU times: user 26.6 s, sys: 406 ms, total: 27 s
Wall time: 27.2 s


In [17]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type,full_street_name
0,1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,restaurant,1,PKWY,HOUSTON PKWY
1,1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,business,1,BLVD,BELLAIRE BLVD
2,1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,unknown,1,FWY,GULF FWY
3,1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,other_parking,1,-,MANGUM RD 180
4,1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,apartment_parking,1,DR,WESTVIEW DR


### replace large int  in blockrange

In [18]:
df.loc[968433]

date                             2017-09-01
hour                                     21
beat                                   1A10
offense_type                          Theft
block_range         1.1103e+006-1.1104e+006
street_name         AVENIDA DE LAS AMERICAS
premise                   convention_center
num_offenses                              1
type                                      -
full_street_name    AVENIDA DE LAS AMERICAS
Name: 968433, dtype: object

In [19]:
df.replace({'block_range': {'1.1103e+006-1.1104e+006':'1001-1010'}},inplace=True)

In [20]:
df.loc[968433]

date                             2017-09-01
hour                                     21
beat                                   1A10
offense_type                          Theft
block_range                       1001-1010
street_name         AVENIDA DE LAS AMERICAS
premise                   convention_center
num_offenses                              1
type                                      -
full_street_name    AVENIDA DE LAS AMERICAS
Name: 968433, dtype: object

## combine block_range with full_street_name


In [21]:
def full_street_address(row):
    '''input col with block_range values
    its split, get med value then appended
    to street name
    return: full street address'''
    if row.block_range == 'UNK':
        return row.street_name
    else:
        st1 =  int(row.block_range.split('-')[0])
        st2 = int(row.block_range.split('-')[1])
        med =  np.median([st2,st1])
        st_full =  int(np.ceil(med))
        return str(st_full) + ' ' + row['full_street_name']

In [22]:
%%time
df['full_street_address'] = df.apply(full_street_address, axis=1)   # 1 minute

CPU times: user 1min 2s, sys: 1.56 s, total: 1min 4s
Wall time: 1min 4s


In [23]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type,full_street_name,full_street_address
0,1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,restaurant,1,PKWY,HOUSTON PKWY,12750 HOUSTON PKWY
1,1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,business,1,BLVD,BELLAIRE BLVD,8850 BELLAIRE BLVD
2,1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,unknown,1,FWY,GULF FWY,12850 GULF FWY
3,1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,other_parking,1,-,MANGUM RD 180,3250 MANGUM RD 180
4,1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,apartment_parking,1,DR,WESTVIEW DR,7050 WESTVIEW DR


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006227 entries, 0 to 1006226
Data columns (total 11 columns):
date                   1006227 non-null object
hour                   1006227 non-null int64
beat                   1006227 non-null object
offense_type           1006227 non-null object
block_range            1006227 non-null object
street_name            1006227 non-null object
premise                1006227 non-null object
num_offenses           1006227 non-null int64
type                   1006227 non-null object
full_street_name       1006227 non-null object
full_street_address    1006227 non-null object
dtypes: int64(2), object(9)
memory usage: 84.4+ MB


## drop unused columns

In [25]:
df.drop(['block_range','street_name','type','full_street_name'],axis=1,inplace=True)

In [26]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,premise,num_offenses,full_street_address
0,1914-09-08,7,24C60,Burglary,restaurant,1,12750 HOUSTON PKWY
1,1914-11-02,3,18F60,Burglary,business,1,8850 BELLAIRE BLVD
2,1914-12-03,19,12D20,Auto Theft,unknown,1,12850 GULF FWY
3,1915-01-05,22,3B10,Theft,other_parking,1,3250 MANGUM RD 180
4,1915-01-14,23,5F10,Auto Theft,apartment_parking,1,7050 WESTVIEW DR


## save

In [27]:
df.to_csv(data_directory_saves+"crime_clean_03.csv")