In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

## Data directory

In [3]:
data_folder = 'crime'

In [4]:
# data folder path
data_directory = os.path.join('..','data','clean_data/{}'.format(data_folder))
data_directory_saves = os.path.join( '..','data','clean_data','merge_data/')

In [5]:
# combine all files into one df
all_files = glob.glob(os.path.join(data_directory, "*.csv")) 
df_from_each_file = (pd.read_csv(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006231 entries, 0 to 1006230
Data columns (total 8 columns):
date            1006231 non-null object
hour            1006231 non-null int64
beat            1006231 non-null object
offense_type    1006231 non-null object
block_range     1006231 non-null object
street_name     1006229 non-null object
premise         1006231 non-null object
num_offenses    1006231 non-null int64
dtypes: int64(2), object(6)
memory usage: 61.4+ MB


In [7]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses
0,2017-04-10,15,10H10,Burglary,200-299,CLIFTON,Residence or House,1
1,2017-04-11,15,10H10,Theft,2300-2399,CANAL,Restaurant or Cafeteria Parking Lot,1
2,2017-04-11,17,10H10,Theft,2300-2399,CANAL,Restaurant or Cafeteria Parking Lot,1
3,2017-04-12,9,10H10,Burglary,4600-4699,CANAL,Miscellaneous Business (Non-Specific),1
4,2017-04-12,19,10H10,Theft,100-199,ADAM,"Other, Unknown, or Not Listed",1


In [8]:
unk_date = df.date == 'UNK'

In [9]:
df[unk_date]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses
705158,UNK,15,13D40,Theft,8500-8599,SAM HOUSTON,UNK,1
707272,UNK,14,18F40,Robbery,2700-2799,DUNVALE,UNK,1
715289,UNK,8,14D40,Burglary,5600-5699,SELINSKY,UNK,1
719933,UNK,11,7C20,Theft,3900-3999,CAVALCADE,UNK,1


In [10]:
df.loc[719933]

date                  UNK
hour                   11
beat                 7C20
offense_type        Theft
block_range     3900-3999
street_name     CAVALCADE
premise               UNK
num_offenses            1
Name: 719933, dtype: object

## drop UNK dates

In [11]:
df = df[df.date != 'UNK']

## set date as datetime index

In [12]:
%%time
df.date = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index(ascending=True)

CPU times: user 432 ms, sys: 272 ms, total: 704 ms
Wall time: 735 ms


In [13]:
df.head()

Unnamed: 0_level_0,hour,beat,offense_type,block_range,street_name,premise,num_offenses
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,Restaurant or Cafeteria,1
1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,Miscellaneous Business (Non-Specific),1
1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,UNK,1
1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,Other Parking Lot,1
1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,Apartment Parking Lot,1


## Display null values

In [14]:
df.apply(lambda x: sum(x.isnull()))

hour            0
beat            0
offense_type    0
block_range     0
street_name     2
premise         0
num_offenses    0
dtype: int64

# select beats

In [15]:
len(df.beat.unique())

238

In [16]:
df.beat.unique()

array(['24C60', '18F60', '12D20', '3B10', '5F10', '1A20', '11H10',
       '19G20', '9C40', '9C30', '6B30', '10H70', '4F20', '18F20', '19G40',
       '20G30', '20G20', '17E20', '9C20', '19G10', '14D20', '8C20',
       '8C30', '17E40', '6B50', '12D70', 'UNK', '1A40', '7C10', '1A50',
       '10H40', '13D20', '6B60', '18F10', '10H50', '13D40', '2A30',
       '3B50', '19G50', '14D50', '1A10', '15E30', '12D50', '14D30',
       '8C60', '1A30', '18F40', '4F30', '20G50', '7C20', '3B30', '12D10',
       '15E10', '18F50', '14D10', '11H20', '5F30', '15E20', '23J50',
       '17E10', '6B10', '19G30', '18F30', '20G10', '2A40', '24C40',
       '4F10', '14D40', '10H60', '3B40', '10H80', '5F20', '16E40',
       '12D60', '6B40', '7C40', '13D30', '20G80', '10H20', '7C50',
       '15E40', '2A50', '8C50', '24C10', '20G40', '13D10', '7C30',
       '20G70', '2A20', '16E10', '12D30', '9C10', '16E30', '6B20', '2A10',
       '8C10', '21I20', '11H30', '24C20', '17E30', '10H30', '5F40',
       '2A60', '21I10', '10

## remove extra chars

In [17]:
df.beat.replace(regex=True,inplace=True,to_replace=r'\W',value=r'')

In [18]:
len(df.beat.unique())

127

In [19]:
df.beat.unique()

array(['24C60', '18F60', '12D20', '3B10', '5F10', '1A20', '11H10',
       '19G20', '9C40', '9C30', '6B30', '10H70', '4F20', '18F20', '19G40',
       '20G30', '20G20', '17E20', '9C20', '19G10', '14D20', '8C20',
       '8C30', '17E40', '6B50', '12D70', 'UNK', '1A40', '7C10', '1A50',
       '10H40', '13D20', '6B60', '18F10', '10H50', '13D40', '2A30',
       '3B50', '19G50', '14D50', '1A10', '15E30', '12D50', '14D30',
       '8C60', '1A30', '18F40', '4F30', '20G50', '7C20', '3B30', '12D10',
       '15E10', '18F50', '14D10', '11H20', '5F30', '15E20', '23J50',
       '17E10', '6B10', '19G30', '18F30', '20G10', '2A40', '24C40',
       '4F10', '14D40', '10H60', '3B40', '10H80', '5F20', '16E40',
       '12D60', '6B40', '7C40', '13D30', '20G80', '10H20', '7C50',
       '15E40', '2A50', '8C50', '24C10', '20G40', '13D10', '7C30',
       '20G70', '2A20', '16E10', '12D30', '9C10', '16E30', '6B20', '2A10',
       '8C10', '21I20', '11H30', '24C20', '17E30', '10H30', '5F40',
       '2A60', '21I10', '10

In [20]:
beats = ['10H10','10H30', '10H40', '10H50', '10H60','10H70', '10H80', '15E40', '1A10']

In [21]:
selected_beats = df.beat.isin(beats)

In [22]:
data = df[selected_beats]

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 85578 entries, 1916-05-23 to 2021-01-03
Data columns (total 7 columns):
hour            85578 non-null int64
beat            85578 non-null object
offense_type    85578 non-null object
block_range     85578 non-null object
street_name     85578 non-null object
premise         85578 non-null object
num_offenses    85578 non-null int64
dtypes: int64(2), object(5)
memory usage: 5.2+ MB


In [24]:
data.head()

Unnamed: 0_level_0,hour,beat,offense_type,block_range,street_name,premise,num_offenses
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,Residence or House,1
1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,Other Parking Lot,1
1963-02-02,13,10H40,Theft,3800-3899,MAIN,REHABILITATION CENTER,1
1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,APARTMENT,1
1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,Residence or House,1


## display null values

In [25]:
data.apply(lambda x: sum(x.isnull()))

hour            0
beat            0
offense_type    0
block_range     0
street_name     0
premise         0
num_offenses    0
dtype: int64

## Inspect cols

In [26]:
data.columns

Index(['hour', 'beat', 'offense_type', 'block_range', 'street_name', 'premise',
       'num_offenses'],
      dtype='object')

In [27]:
data.hour.unique()

array([19, 16, 13,  0,  6, 15, 12, 20, 22, 14, 18,  8,  2,  7, 11, 21, 10,
        1, 17, 23,  4,  3,  9,  5, 24])

In [28]:
data.beat.unique()

array(['10H70', '10H40', '10H50', '1A10', '10H60', '10H80', '15E40',
       '10H30', '10H10'], dtype=object)

In [29]:
data.offense_type.unique()

array(['Aggravated Assault', 'Theft', 'Rape', 'Burglary', 'Robbery',
       'Auto Theft', 'Murder'], dtype=object)

In [32]:
data.num_offenses.unique()

array([ 1,  2,  3, 16,  6,  4,  5,  8,  7, 13])

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 85578 entries, 1916-05-23 to 2021-01-03
Data columns (total 7 columns):
hour            85578 non-null int64
beat            85578 non-null object
offense_type    85578 non-null object
block_range     85578 non-null object
street_name     85578 non-null object
premise         85578 non-null object
num_offenses    85578 non-null int64
dtypes: int64(2), object(5)
memory usage: 5.2+ MB


## save

In [34]:
data.to_csv(data_directory_saves+'crime_beats.csv')

## inspect premise

In [31]:
len(data.premise.unique())

335

In [32]:
data.premise.value_counts(dropna=False)[:10]

Road, Street, or Sidewalk        7325
ROAD/STREET/SIDEWALK             7063
APARTMENT                        3982
APARTMENT PARKING LOT            3681
RESIDENCE/HOUSE                  3605
UNK                              3598
Residence or House               3229
OTHER PARKING LOT                3101
COMMERCIAL PARKING LOT/GARAGE    2541
Apartment Parking Lot            2492
Name: premise, dtype: int64

## lowercase all values

In [33]:
data.premise = data.premise.str.lower()

In [34]:
len(data.premise.unique())

278

In [35]:
data.premise.value_counts(dropna=False)[:10]

road, street, or sidewalk        7325
road/street/sidewalk             7063
apartment parking lot            6173
apartment                        6159
other parking lot                4891
residence/house                  3605
unk                              3598
residence or house               3229
commercial parking lot/garage    2541
hospital                         2162
Name: premise, dtype: int64

In [36]:
data.premise.value_counts(dropna=False)[-10:]

vacant jail or prison                                                                                                                                     1
vacant school or college/university                                                                                                                       1
car wash                                                                                                                                                  1
marine veh. sales,boats,sailboats                                                                                                                         1
vacant industrial or manufacturing blding                                                                                                                 1
mall parking lot                                                                                                                                          1
university or college                                           

## combine similar

In [37]:
pat1 = r"^road(.*)"

In [38]:
road = data.premise.str.contains(pat1,regex=True)

In [39]:
data[road].premise.value_counts()

road, street, or sidewalk                                                                                                                                 7325
road/street/sidewalk                                                                                                                                      7063
road, street, or sidewalk                                                                                                                                  220
Name: premise, dtype: int64

In [40]:
data.premise = data.premise.str.replace(pat1,'road_street_sidewalk',regex=True)

In [41]:
data.premise.value_counts(dropna=False)[:10]

road_street_sidewalk             14608
apartment parking lot             6173
apartment                         6159
other parking lot                 4891
residence/house                   3605
unk                               3598
residence or house                3229
commercial parking lot/garage     2541
hospital                          2162
other/unknown                     1967
Name: premise, dtype: int64

In [42]:
pat2 = r'^apartment\s\bparking(.*)'  # apartment-space-parking
#pat2 = r'^apartment\s'

In [43]:
apt = data.premise.str.contains(pat2,regex=True)

In [44]:
data[apt].premise.value_counts()

apartment parking lot                                                                                                                                     6173
apartment parking lot                                                                                                                                       75
Name: premise, dtype: int64

In [45]:
data.premise = data.premise.str.replace(pat2,'apartment_parking',regex=True)

In [46]:
data.premise.value_counts(dropna=False)[:10]

road_street_sidewalk             14608
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
residence/house                   3605
unk                               3598
residence or house                3229
commercial parking lot/garage     2541
hospital                          2162
other/unknown                     1967
Name: premise, dtype: int64

## find house

In [47]:
pat3 = r'\bresidence(.*)'  #

In [48]:
house  = data.premise.str.contains(pat3,regex=True)

In [49]:
data[house].premise.value_counts()

residence/house                                                                                                                                           3605
residence or house                                                                                                                                        3229
vacant single occupancy residence (houses,townhouses,duplexes, etc.)                                                                                       187
residence or house                                                                                                                                          73
Name: premise, dtype: int64

In [50]:
data.premise = data.premise.str.replace(pat3,'house',regex=True)

In [51]:
data.premise.value_counts(dropna=False)[:10]

road_street_sidewalk             14608
house                             6907
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
unk                               3598
commercial parking lot/garage     2541
hospital                          2162
other/unknown                     1967
driveway                          1857
Name: premise, dtype: int64

## find parking

In [55]:
pat4 = r"commercial\s\bparking\s+(.*)"

In [56]:
pk  = data.premise.str.contains(pat4,regex=True)

In [57]:
data[pk].premise.value_counts()

commercial parking lot/garage                                                                                                                             2541
commercial parking lot or garage                                                                                                                          1794
commercial parking lot or garage                                                                                                                            86
Name: premise, dtype: int64

In [58]:
data.premise = data.premise.str.replace(pat4,'commercial_parking_garage',regex=True)

In [98]:
data.premise.value_counts(dropna=False)[:20]

road_street_sidewalk                   14608
house                                   6907
apartment_parking                       6248
apartment                               6159
other parking lot                       4891
commercial_parking_garage               4421
unk                                     3598
hospital                                2162
other/unknown                           1967
driveway                                1857
restaurant/cafeteria parking lot        1077
restaurant or cafeteria parking lot     1060
bar/night club                          1036
department/discount store               1027
grocery/supermarket                      954
convenience store                        911
office building                          827
commercial building                      789
restaurant/cafeteria                     781
other, unknown, or not listed            778
Name: premise, dtype: int64

In [61]:
len(data.premise.value_counts(dropna=False))

271

## restaurant

In [138]:
pat5 = r"^(?=.*\brestaurant\b)(?=.*\bparking\b).*$"

In [139]:
rest  = data.premise.str.contains(pat5,regex=True)

In [140]:
data[rest].premise.value_counts()

restaurant/cafeteria parking lot                                                                                                                          1077
restaurant or cafeteria parking lot                                                                                                                       1060
restaurant or cafeteria parking lot                                                                                                                         21
Name: premise, dtype: int64

In [141]:
data.premise = data.premise.str.replace(pat5,'restaurant_parking_lot',regex=True)

In [142]:
data.premise.value_counts(dropna=False)[:20]

road_street_sidewalk             14608
house                             6907
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
commercial_parking_garage         4421
unk                               3598
hospital                          2162
restaurant_parking_lot            2158
other/unknown                     1967
driveway                          1857
bar/night club                    1036
department/discount store         1027
grocery/supermarket                954
convenience store                  911
office building                    827
commercial building                789
restaurant/cafeteria               781
other, unknown, or not listed      778
service or gas station             763
Name: premise, dtype: int64

In [150]:
pat6 = r"^(?=.*\bbar\b)(?=.*\bnight\b)(?=.*\bparking\b).*$"

In [151]:
bar  = data.premise.str.contains(pat6,regex=True)

In [154]:
data[bar].premise.value_counts()

bar_club_parking_lot    1262
Name: premise, dtype: int64

In [153]:
data.premise = data.premise.str.replace(pat6,'bar_club_parking_lot',regex=True)

In [155]:
data.premise.value_counts(dropna=False)[:20]

road_street_sidewalk             14608
house                             6907
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
commercial_parking_garage         4421
unk                               3598
hospital                          2162
restaurant_parking_lot            2158
other/unknown                     1967
driveway                          1857
bar_club_parking_lot              1262
bar/night club                    1036
department/discount store         1027
grocery/supermarket                954
convenience store                  911
office building                    827
commercial building                789
restaurant/cafeteria               781
other, unknown, or not listed      778
Name: premise, dtype: int64

In [156]:
pat7 = r"^(?=.*\bbar\b)(?=.*\bnight\b)(?=.*\bclub\b).*$"

In [157]:
bar2  = data.premise.str.contains(pat7,regex=True)

In [158]:
data[bar2].premise.value_counts()

bar/night club                                                                                                                                            1036
bar or night club                                                                                                                                          707
bar or night club                                                                                                                                           18
Name: premise, dtype: int64

In [159]:
data.premise = data.premise.str.replace(pat7,'bar_club',regex=True)

In [160]:
data.premise.value_counts(dropna=False)[:20]

road_street_sidewalk             14608
house                             6907
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
commercial_parking_garage         4421
unk                               3598
hospital                          2162
restaurant_parking_lot            2158
other/unknown                     1967
driveway                          1857
bar_club                          1761
bar_club_parking_lot              1262
department/discount store         1027
grocery/supermarket                954
convenience store                  911
office building                    827
commercial building                789
restaurant/cafeteria               781
other, unknown, or not listed      778
Name: premise, dtype: int64

In [164]:
pat8 = r"^(?=.*\bgrocery\b)(?=.*\bparking\b).*$"

In [165]:
store  = data.premise.str.contains(pat8,regex=True)

In [166]:
data[store].premise.value_counts()

grocery/supermarket parking lot                                                                                                                           226
grocery store or supermarket parking lot                                                                                                                  208
grocery store or supermarket parking lot                                                                                                                    4
Name: premise, dtype: int64

In [167]:
data.premise = data.premise.str.replace(pat8,'supermarket_parking',regex=True)

In [168]:
data.premise.value_counts(dropna=False)[:20]

road_street_sidewalk             14608
house                             6907
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
commercial_parking_garage         4421
unk                               3598
hospital                          2162
restaurant_parking_lot            2158
other/unknown                     1967
driveway                          1857
bar_club                          1761
bar_club_parking_lot              1262
department/discount store         1027
grocery/supermarket                954
convenience store                  911
office building                    827
commercial building                789
restaurant/cafeteria               781
other, unknown, or not listed      778
Name: premise, dtype: int64

In [169]:
pat9 = r"^(?=.*\bgrocery\b)(?=.*\bsupermarket\b).*$"

In [170]:
store2  = data.premise.str.contains(pat9,regex=True)

In [171]:
data[store2].premise.value_counts()

grocery/supermarket                                                                                                                                       954
grocery store or supermarket                                                                                                                              515
vacant grocery store or supermarket                                                                                                                        10
grocery store or supermarket                                                                                                                                6
vacant grocery/supermarket                                                                                                                                  1
Name: premise, dtype: int64

In [172]:
data.premise = data.premise.str.replace(pat9,'supermarket',regex=True)

In [173]:
data.premise.value_counts(dropna=False)[:20]

road_street_sidewalk             14608
house                             6907
apartment_parking                 6248
apartment                         6159
other parking lot                 4891
commercial_parking_garage         4421
unk                               3598
hospital                          2162
restaurant_parking_lot            2158
other/unknown                     1967
driveway                          1857
bar_club                          1761
supermarket                       1486
bar_club_parking_lot              1262
department/discount store         1027
convenience store                  911
office building                    827
commercial building                789
restaurant/cafeteria               781
other, unknown, or not listed      778
Name: premise, dtype: int64

In [174]:
pat10 = r"^(?=.*\bdepartment\b)(?=.*\bdiscount\b).*$"

In [175]:
store3  = data.premise.str.contains(pat10,regex=True)

In [177]:
data[store3].premise.value_counts()

department/discount store                                                                                                                                 1027
department or discount store                                                                                                                               336
department or discount store                                                                                                                                10
Name: premise, dtype: int64

In [178]:
data.premise = data.premise.str.replace(pat10,'department_store',regex=True)

In [184]:
data.premise.value_counts(dropna=False)[20:40]

service or gas station                      763
hotel/motel parking lot                     682
miscellaneous business (non-specific)       680
strip business center parking lot           632
hotel/motel/etc.                            623
restaurant or cafeteria                     623
convenience store parking lot               568
hotel, motel, inn, etc.                     562
service/gas station                         556
hospital parking lot                        545
construction site                           532
hotel or motel parking lot                  528
parks and recreation, zoo, swimming pool    514
parks & recreation, zoo, swim pool          504
condominium                                 478
garage or carport                           445
supermarket_parking                         438
bus station                                 427
bus stop                                    425
multi-plex hme(duplex,triplex etc)          422
Name: premise, dtype: int64

In [180]:
len(data.premise.value_counts(dropna=False))

257

In [188]:
pat11 = r"^(?=.*\bhotel\b)(?=.*\bmotel\b)(?=.*\bparking\b).*$"

In [189]:
bu  = data.premise.str.contains(pat11,regex=True)

In [190]:
data[bu].premise.value_counts()

hotel/motel parking lot                                                                                                                                   682
hotel or motel parking lot                                                                                                                                528
hotel or motel parking lot                                                                                                                                 25
Name: premise, dtype: int64

In [191]:
data.premise = data.premise.str.replace(pat11,'hotel_motel_parking',regex=True)

In [192]:
data.premise.value_counts(dropna=False)[20:40]

other, unknown, or not listed               778
service or gas station                      763
miscellaneous business (non-specific)       680
strip business center parking lot           632
hotel/motel/etc.                            623
restaurant or cafeteria                     623
convenience store parking lot               568
hotel, motel, inn, etc.                     562
service/gas station                         556
hospital parking lot                        545
construction site                           532
parks and recreation, zoo, swimming pool    514
parks & recreation, zoo, swim pool          504
condominium                                 478
garage or carport                           445
supermarket_parking                         438
bus station                                 427
bus stop                                    425
multi-plex hme(duplex,triplex etc)          422
specialty store (non-specific)              391
Name: premise, dtype: int64

In [193]:
pat12 = r"^(?=.*\bhotel\b)(?=.*\bmotel\b).*$"

In [194]:
bu2  = data.premise.str.contains(pat12,regex=True)

In [195]:
data[bu2].premise.value_counts()

hotel/motel/etc.                                                                                                                                          623
hotel, motel, inn, etc.                                                                                                                                   562
hotel, motel, inn, etc.                                                                                                                                    14
vacant hotel/motel/etc.                                                                                                                                     6
vacant hotel, motel, etc.                                                                                                                                   2
Name: premise, dtype: int64

In [196]:
data.premise = data.premise.str.replace(pat12,'hotel_motel',regex=True)

In [201]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
department_store                             1373
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207
convenience store                             911
office building                               827
commercial building                           789


In [198]:
pat13 = r"^(?=.*\brestaurant\b)(?=.*\bcafeteria\b).*$"

In [199]:
bu3  = data.premise.str.contains(pat13,regex=True)

In [200]:
data[bu3].premise.value_counts()

restaurant/cafeteria                                                                                                                                      781
restaurant or cafeteria                                                                                                                                   623
restaurant or cafeteria                                                                                                                                    13
Name: premise, dtype: int64

In [202]:
data.premise = data.premise.str.replace(pat13,'restaurant',regex=True)

In [203]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207
convenience store                             911
office building                               827


In [207]:
pat14 = r"^(?=.*\bgas\b)(?=.*\bstation\b).*$"

In [208]:
bu4  = data.premise.str.contains(pat14,regex=True)

In [209]:
data[bu4].premise.value_counts()

service or gas station                                                                                                                                    763
service/gas station                                                                                                                                       556
service or gas station                                                                                                                                     15
Name: premise, dtype: int64

In [210]:
data.premise = data.premise.str.replace(pat14,'gas_station',regex=True)

In [211]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
gas_station                                  1334
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207
convenience store                             911


In [214]:
pat15 = r"^(?=.*\bhospital\b)(?=.*\bparking\b).*$"

In [215]:
bu5  = data.premise.str.contains(pat15,regex=True)

In [216]:
data[bu5].premise.value_counts()

hospital parking lot                                                                                                                                      545
hospital parking lot                                                                                                                                        7
Name: premise, dtype: int64

In [217]:
data.premise = data.premise.str.replace(pat15,'hospital_parking',regex=True)

In [218]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
gas_station                                  1334
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207
convenience store                             911


In [222]:
pat16 = r"^(?=.*\bconvenience\b)(?=.*\bparking\b)(?=.*\bstore\b).*$"

In [223]:
bu6  = data.premise.str.contains(pat16,regex=True)

In [224]:
data[bu6].premise.value_counts()

convenience store parking lot                                                                                                                             568
convenience store parking lot                                                                                                                               3
Name: premise, dtype: int64

In [225]:
data.premise = data.premise.str.replace(pat16,'convenience_store_parking',regex=True)

In [226]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
gas_station                                  1334
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207
convenience store                             911


In [227]:
pat17 = r"^(?=.*\bconvenience\b)(?=.*\bstore\b).*$"

In [228]:
bu7  = data.premise.str.contains(pat17,regex=True)

In [229]:
data[bu7].premise.value_counts()

convenience store                                                                                                                                         911
convenience store                                                                                                                                           7
Name: premise, dtype: int64

In [230]:
data.premise = data.premise.str.replace(pat17,'convenience_store',regex=True)

In [231]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
gas_station                                  1334
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207
convenience_store                             918


In [232]:
pat18 = r"^(?=.*\bbuilding\b).*$"

In [233]:
bu8  = data.premise.str.contains(pat18,regex=True)

In [234]:
data[bu8].premise.value_counts()

office building                                                                                                                                           827
commercial building                                                                                                                                       789
government/public building                                                                                                                                124
vacant building (commercial)                                                                                                                              121
government or public building                                                                                                                              96
maintenance/building services                                                                                                                              18
maintenance or building services                    

In [235]:
data.premise = data.premise.str.replace(pat18,'building',regex=True)

In [236]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
building                                     2017
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
gas_station                                  1334
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207


In [239]:
pat19 = r"^(?=.*\bconstruction\b).*$"

In [240]:
bu8  = data.premise.str.contains(pat19,regex=True)

In [241]:
data[bu8].premise.value_counts()

construction site                                                                                                                                         532
vacant other structure (out buildings,monuments,buildings under construction,etc.)                                                                         23
construction site                                                                                                                                          12
Name: premise, dtype: int64

In [242]:
data.premise = data.premise.str.replace(pat19,'construction_site',regex=True)

In [243]:
data.premise.value_counts(dropna=False)[:40]

road_street_sidewalk                        14608
house                                        6907
apartment_parking                            6248
apartment                                    6159
other parking lot                            4891
commercial_parking_garage                    4421
unk                                          3598
hospital                                     2162
restaurant_parking_lot                       2158
building                                     2017
other/unknown                                1967
driveway                                     1857
bar_club                                     1761
supermarket                                  1486
restaurant                                   1417
department_store                             1373
gas_station                                  1334
bar_club_parking_lot                         1262
hotel_motel_parking                          1235
hotel_motel                                  1207


In [244]:
len(data.premise.value_counts(dropna=False))

231

In [245]:
def clean_premise(pat,df):
    fil = df.premise.str.contains(pat,regex=True)
    result = df[fil].premise.value_counts()
    return result

In [246]:
p = r"^(?=.*\bbus\b).*$"


test = clean_premise(p,data)

In [247]:
test

bus station                                                                                                                                               427
bus stop                                                                                                                                                  425
bus stop                                                                                                                                                    7
bus station                                                                                                                                                 4
Name: premise, dtype: int64