In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns
import re

In [3]:
data_folder = 'merge_data'

In [4]:
# Data paths, relative to the notebook's working directory.
# Each path component is passed separately so os.path.join supplies the
# platform's separator instead of a hard-coded '/' inside a component.
# Input file: ../data/clean_data/<data_folder>/crime_beats.csv
data_directory = os.path.join('..', 'data', 'clean_data', data_folder, 'crime_beats.csv')
# Output directory: the empty final component makes the result end with a
# path separator (../data/clean_data/merge_data/).
data_directory_saves = os.path.join('..', 'data', 'clean_data', 'merge_data', '')

In [5]:
df = pd.read_csv(data_directory)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85578 entries, 0 to 85577
Data columns (total 9 columns):
date            85578 non-null object
hour            85578 non-null int64
beat            85578 non-null object
offense_type    85578 non-null object
block_range     85578 non-null object
street_name     85578 non-null object
premise         85578 non-null object
num_offenses    85578 non-null int64
type            85578 non-null object
dtypes: int64(2), object(7)
memory usage: 5.9+ MB


In [7]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,Residence or House,1,-
1,1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,Other Parking Lot,1,ST
2,1963-02-02,13,10H40,Theft,3800-3899,MAIN,REHABILITATION CENTER,1,ST
3,1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,APARTMENT,1,ST
4,1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,Residence or House,1,-


## cleanup premise column

In [8]:
len(df.premise.unique())

335

In [9]:
df.premise.value_counts()[:10]

Road, Street, or Sidewalk        7325
ROAD/STREET/SIDEWALK             7063
APARTMENT                        3982
APARTMENT PARKING LOT            3681
RESIDENCE/HOUSE                  3605
UNK                              3598
Residence or House               3229
OTHER PARKING LOT                3101
COMMERCIAL PARKING LOT/GARAGE    2541
Apartment Parking Lot            2492
Name: premise, dtype: int64

## lowercase all

In [10]:
df.premise = df.premise.str.lower()

In [11]:
len(df.premise.unique())

278

In [12]:
df.premise.value_counts()[:10]

road, street, or sidewalk        7325
road/street/sidewalk             7063
apartment parking lot            6173
apartment                        6159
other parking lot                4891
residence/house                  3605
unk                              3598
residence or house               3229
commercial parking lot/garage    2541
hospital                         2162
Name: premise, dtype: int64

In [13]:
# Trim leading/trailing whitespace so labels that differ only by padding
# collapse into a single premise value.
df.premise = df.premise.str.strip()

In [14]:
len(df.premise.unique())

197

In [15]:
df.premise.value_counts()[:10]

road, street, or sidewalk        7545
road/street/sidewalk             7063
apartment parking lot            6248
apartment                        6223
other parking lot                4950
residence/house                  3605
unk                              3598
residence or house               3302
commercial parking lot/garage    2541
hospital                         2192
Name: premise, dtype: int64

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85578 entries, 0 to 85577
Data columns (total 9 columns):
date            85578 non-null object
hour            85578 non-null int64
beat            85578 non-null object
offense_type    85578 non-null object
block_range     85578 non-null object
street_name     85578 non-null object
premise         85578 non-null object
num_offenses    85578 non-null int64
type            85578 non-null object
dtypes: int64(2), object(7)
memory usage: 5.9+ MB


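## Create two helper functions
- `clean_premise`: display the premise values (with their counts) that match a regex pattern
- `change_premise`: replace every premise value that matches a regex pattern with a new value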

In [17]:
def clean_premise(pat,df):
    """Show which premise values match a regex, with their row counts.

    Despite the name, nothing is modified: the function only reports the
    matching values, so it can be used to preview a pattern.

    Parameters
    ----------
    pat : str
        Regular expression tested against each premise value with
        ``Series.str.contains``.
    df : pandas.DataFrame
        Frame that has a ``premise`` column.

    Returns
    -------
    pandas.Series
        ``value_counts()`` of the premise values that match ``pat``.
    """
    # na=False: a missing premise counts as "no match". Without it the mask
    # would contain NaN and boolean indexing would raise.
    matches = df['premise'].str.contains(pat, regex=True, na=False)
    return df.loc[matches, 'premise'].value_counts()

In [18]:
def change_premise(pat,df,new_premise):
    """Relabel premise values that match a regex, modifying ``df`` in place.

    ``pat`` is applied through ``Series.replace(..., regex=True)``, so the
    text matched by the pattern is substituted with ``new_premise``.
    Nothing is returned; the ``premise`` column of ``df`` is overwritten.
    """
    relabelled = df.premise.replace(to_replace=pat, value=new_premise, regex=True)
    df.premise = relabelled

In [19]:
word1 = 'vacant'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

vacant single occ resd(house,townhs,dplex)                                            188
vacant single occupancy residence (houses,townhouses,duplexes, etc.)                  187
vacant building (commercial)                                                          121
vacant other residential (apartment,dorms)                                             47
vacant other residential (apartment,inn,dorms,boarding house)                          30
vacant other structure (out buildings,monuments,buildings under construction,etc.)     23
vacant restaurant                                                                      21
vacant other out build/monument/underconst                                             21
vacant school or college/university                                                    18
vacant storage fac (barn,garage,warehouse)                                             13
vacant hospital                                                                        11
vacant gro

In [20]:
change_premise(pat,df,'vacant_structure')

## road

In [21]:
word1 = 'road'
word2 = 'street'

In [22]:
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

road, street, or sidewalk    7545
road/street/sidewalk         7063
Name: premise, dtype: int64

In [23]:
change_premise(pat,df,'road_street_sidewalk')

In [24]:
df.premise.value_counts()[:10]

road_street_sidewalk             14608
apartment parking lot             6248
apartment                         6223
other parking lot                 4950
residence/house                   3605
unk                               3598
residence or house                3302
commercial parking lot/garage     2541
hospital                          2192
other/unknown                     1967
Name: premise, dtype: int64

## apartment parking

In [25]:
word1 = 'apartment'
word2 = 'parking'

In [26]:
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

apartment parking lot    6248
Name: premise, dtype: int64

In [27]:
change_premise(pat,df,'apartment_parking')

In [28]:
df.premise.value_counts()[:10]

road_street_sidewalk             14608
apartment_parking                 6248
apartment                         6223
other parking lot                 4950
residence/house                   3605
unk                               3598
residence or house                3302
commercial parking lot/garage     2541
hospital                          2192
other/unknown                     1967
Name: premise, dtype: int64

## residence

In [29]:
word1 = 'residence'
word2 = 'house'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

residence/house       3605
residence or house    3302
Name: premise, dtype: int64

In [30]:
change_premise(pat,df,'house')

In [31]:
df.premise.value_counts()[:10]

road_street_sidewalk                14608
house                                6907
apartment_parking                    6248
apartment                            6223
other parking lot                    4950
unk                                  3598
commercial parking lot/garage        2541
hospital                             2192
other/unknown                        1967
commercial parking lot or garage     1880
Name: premise, dtype: int64

## commercial parking

In [32]:
word1 = 'commercial'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

commercial parking lot/garage       2541
commercial parking lot or garage    1880
Name: premise, dtype: int64

In [33]:
change_premise(pat,df,'commercial_parking')

In [34]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
other/unknown            1967
driveway                 1876
Name: premise, dtype: int64

## restaurant parking

In [35]:
word1 = 'restaurant'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

restaurant or cafeteria parking lot    1081
restaurant/cafeteria parking lot       1077
Name: premise, dtype: int64

In [36]:
change_premise(pat,df,'restaurant_parking')

In [37]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [38]:
word1 = 'club'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

bar/night club parking lot       690
bar or night club parking lot    572
Name: premise, dtype: int64

In [39]:
change_premise(pat,df,'bar_club_parking')

In [40]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [41]:
word1 = 'club'
word2 = 'bar'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

bar/night club       1036
bar or night club     725
Name: premise, dtype: int64

In [42]:
change_premise(pat,df,'bar_club')

In [43]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [44]:
word1 = 'grocery'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

grocery/supermarket parking lot             226
grocery store or supermarket parking lot    212
Name: premise, dtype: int64

In [45]:
change_premise(pat,df,'supermarket_parking')

In [46]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [47]:
word1 = 'grocery'
word2 = 'supermarket'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

grocery/supermarket             954
grocery store or supermarket    521
Name: premise, dtype: int64

In [48]:
change_premise(pat,df,'supermarket')

In [49]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [50]:



word1 = 'department'
word2 = 'discount'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

department/discount store       1027
department or discount store     346
Name: premise, dtype: int64

In [51]:
change_premise(pat,df,'department_store')

In [52]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [53]:
word1 = 'hotel'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

hotel/motel parking lot       682
hotel or motel parking lot    553
Name: premise, dtype: int64

In [54]:
change_premise(pat,df,'hotel_motel_parking')

In [55]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [56]:
word1 = 'hotel'
word2 = 'motel'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

hotel/motel/etc.           623
hotel, motel, inn, etc.    576
Name: premise, dtype: int64

In [57]:
change_premise(pat,df,'hotel_motel')

In [58]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [59]:
word1 = 'restaurant'
word2 = 'cafeteria'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

restaurant/cafeteria       781
restaurant or cafeteria    636
Name: premise, dtype: int64

In [60]:
change_premise(pat,df,'restaurant')

In [61]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [62]:
word1 = 'gas'
word2 = 'station'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

service or gas station    778
service/gas station       556
Name: premise, dtype: int64

In [63]:
change_premise(pat,df,'gas_station')

In [64]:
df.premise.value_counts()[:10]

road_street_sidewalk    14608
house                    6907
apartment_parking        6248
apartment                6223
other parking lot        4950
commercial_parking       4421
unk                      3598
hospital                 2192
restaurant_parking       2158
other/unknown            1967
Name: premise, dtype: int64

In [65]:
word1 = 'hospital'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

hospital parking lot    552
Name: premise, dtype: int64

In [66]:
change_premise(pat,df,'hospital_parking')

In [67]:
df.premise.value_counts()[10:20]

driveway               1876
bar_club               1761
supermarket            1475
restaurant             1417
department_store       1373
gas_station            1334
bar_club_parking       1262
hotel_motel_parking    1235
hotel_motel            1199
convenience store       918
Name: premise, dtype: int64

In [68]:
word1 = 'convenience'
word2 = 'store'
word3 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2,word3)
clean_premise(pat,df)

convenience store parking lot    571
Name: premise, dtype: int64

In [69]:
change_premise(pat,df,'convenience_store_parking')

In [70]:
word1 = 'convenience'
word2 = 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

convenience store    918
Name: premise, dtype: int64

In [71]:
change_premise(pat,df,'convenience_store')

In [72]:
df.premise.value_counts()[20:30]

office building                             832
other, unknown, or not listed               800
commercial building                         796
vacant_structure                            739
miscellaneous business (non-specific)       702
strip business center parking lot           638
convenience_store_parking                   571
hospital_parking                            552
construction site                           544
parks and recreation, zoo, swimming pool    539
Name: premise, dtype: int64

In [73]:
word1 = 'building'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

office building                     832
commercial building                 796
government/public building          124
government or public building        98
maintenance/building services        18
maintenance or building services     16
Name: premise, dtype: int64

In [74]:
# NOTE(review): the single-word 'building' pattern also matches the
# 'maintenance/building services' and 'maintenance or building services'
# labels shown in the preview, so those rows become 'building' as well.
# Confirm that is intended.
change_premise(pat,df,'building')

In [75]:
df.premise.value_counts()[20:30]

convenience_store                           918
other, unknown, or not listed               800
vacant_structure                            739
miscellaneous business (non-specific)       702
strip business center parking lot           638
convenience_store_parking                   571
hospital_parking                            552
construction site                           544
parks and recreation, zoo, swimming pool    539
parks & recreation, zoo, swim pool          504
Name: premise, dtype: int64

In [76]:
word1 = 'construction'
word2 = 'site'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

construction site    544
Name: premise, dtype: int64

In [77]:
change_premise(pat,df,'construction_site')

In [78]:
df.premise.value_counts()[20:30]

convenience_store                           918
other, unknown, or not listed               800
vacant_structure                            739
miscellaneous business (non-specific)       702
strip business center parking lot           638
convenience_store_parking                   571
hospital_parking                            552
construction_site                           544
parks and recreation, zoo, swimming pool    539
parks & recreation, zoo, swim pool          504
Name: premise, dtype: int64

In [79]:
word1 = 'bus'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

bus stop       432
bus station    431
Name: premise, dtype: int64

In [80]:
change_premise(pat,df,'bus_stop_station')

In [81]:
df.premise.value_counts()[20:30]

convenience_store                           918
bus_stop_station                            863
other, unknown, or not listed               800
vacant_structure                            739
miscellaneous business (non-specific)       702
strip business center parking lot           638
convenience_store_parking                   571
hospital_parking                            552
construction_site                           544
parks and recreation, zoo, swimming pool    539
Name: premise, dtype: int64

In [82]:
word1 = 'parks'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

parks and recreation, zoo, swimming pool    539
parks & recreation, zoo, swim pool          504
Name: premise, dtype: int64

In [83]:
change_premise(pat,df,'park_rec_pool')

In [84]:
df.premise.value_counts()[20:30]

park_rec_pool                            1043
convenience_store                         918
bus_stop_station                          863
other, unknown, or not listed             800
vacant_structure                          739
miscellaneous business (non-specific)     702
strip business center parking lot         638
convenience_store_parking                 571
hospital_parking                          552
construction_site                         544
Name: premise, dtype: int64

In [85]:
word1 = 'sexually'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

sexually oriented club                    28
sexually oriented business parking lot    24
Name: premise, dtype: int64

In [86]:
# NOTE(review): this merges both 'sexually oriented club' and
# 'sexually oriented business parking lot' into one label, whereas other
# venues get a separate *_parking category (e.g. bar_club_parking).
# Confirm the parking-lot rows should not be kept apart.
change_premise(pat,df,'sexually_oriented_business')

In [87]:
word1 = 'business'
word2 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

strip business center parking lot    638
Name: premise, dtype: int64

In [88]:
change_premise(pat,df,'business_center_parking')

In [89]:
word1 = 'business'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

miscellaneous business (non-specific)    702
misc. business (non-specific)            349
Name: premise, dtype: int64

In [90]:
change_premise(pat,df,'business')

In [91]:
df.premise.value_counts()[30:40]

condominium                           480
garage or carport                     462
supermarket_parking                   438
multi-plex hme(duplex,triplex etc)    422
specialty store (non-specific)        393
stadium, sports arena, race track     365
stadium/sprts arena/race track        328
garage/carport                        322
drug store/medical supply             316
clothing store                        290
Name: premise, dtype: int64

In [92]:
word1 = 'stadium'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

stadium, sports arena, race track    365
stadium/sprts arena/race track       328
Name: premise, dtype: int64

In [93]:
change_premise(pat,df,'stadium_arena_track')

In [94]:
df.premise.value_counts()[30:40]

construction_site                            544
condominium                                  480
garage or carport                            462
supermarket_parking                          438
multi-plex hme(duplex,triplex etc)           422
specialty store (non-specific)               393
garage/carport                               322
drug store/medical supply                    316
clothing store                               290
vehicle/auto sales/lease/auto parts store    257
Name: premise, dtype: int64

In [95]:
word1 = 'drug'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

drug store/medical supply       316
drug store or medical supply    165
Name: premise, dtype: int64

In [96]:
change_premise(pat,df,'drug_store')

In [97]:
df.premise.value_counts()[30:40]

construction_site                            544
drug_store                                   481
condominium                                  480
garage or carport                            462
supermarket_parking                          438
multi-plex hme(duplex,triplex etc)           422
specialty store (non-specific)               393
garage/carport                               322
clothing store                               290
vehicle/auto sales/lease/auto parts store    257
Name: premise, dtype: int64

In [98]:
word1 = 'liquor'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

liquor store                70
liquor store parking lot    26
Name: premise, dtype: int64

In [99]:
# NOTE(review): 'liquor store parking lot' is folded into 'liquor_store'
# here, unlike convenience store and hospital parking, which get their own
# *_parking labels. Confirm the inconsistency is intended.
change_premise(pat,df,'liquor_store')

In [100]:
df.premise.value_counts()[30:40]

construction_site                            544
drug_store                                   481
condominium                                  480
garage or carport                            462
supermarket_parking                          438
multi-plex hme(duplex,triplex etc)           422
specialty store (non-specific)               393
garage/carport                               322
clothing store                               290
vehicle/auto sales/lease/auto parts store    257
Name: premise, dtype: int64

In [101]:
word1 = 'auto'
word2 = 'repair'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

auto repair    116
Name: premise, dtype: int64

In [102]:
change_premise(pat,df,'auto_repair')

In [103]:
word1 = 'auto'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

vehicle/auto sales/lease/auto parts store    257
Name: premise, dtype: int64

In [104]:
change_premise(pat,df,'auto_sale_parts_store')

In [105]:
df.premise.value_counts()[30:40]

construction_site                     544
drug_store                            481
condominium                           480
garage or carport                     462
supermarket_parking                   438
multi-plex hme(duplex,triplex etc)    422
specialty store (non-specific)        393
garage/carport                        322
clothing store                        290
auto_sale_parts_store                 257
Name: premise, dtype: int64

In [106]:
word1 = 'school'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

high school                      142
elementary school                 50
private school                    36
middle school                     23
commercial or training school     20
Name: premise, dtype: int64

In [107]:
change_premise(pat,df,'school')

In [108]:
word1 = 'libraries'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

libraries, museums    218
Name: premise, dtype: int64

In [109]:
change_premise(pat,df,'libraries_museums')

In [110]:
df.premise.value_counts()[40:50]

auto_sale_parts_store                         257
libraries_museums                             218
church/synagogue/temple                       198
rental storage facility                       180
warehouse                                     172
gym,recreat,club hse,indr pool,spa            164
university/college                            146
gym, recreat, club house, indoor pool, spa    140
convention center or exhibit halls            136
convention center/exhibit halls               136
Name: premise, dtype: int64

In [111]:
word1 = 'church'
word2 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

church/synagogue/temple parking lot         130
church, synagogue, or temple parking lot    106
Name: premise, dtype: int64

In [112]:
change_premise(pat,df,'church_temple_parking')

In [113]:
word1 = 'church'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

church/synagogue/temple         198
church, synagogue, or temple     87
Name: premise, dtype: int64

In [114]:
change_premise(pat,df,'church_temple')

In [115]:
df.premise.value_counts()[40:50]

school                                        271
auto_sale_parts_store                         257
church_temple_parking                         236
libraries_museums                             218
rental storage facility                       180
warehouse                                     172
gym,recreat,club hse,indr pool,spa            164
university/college                            146
gym, recreat, club house, indoor pool, spa    140
convention center or exhibit halls            136
Name: premise, dtype: int64

In [116]:
word1 = 'gym'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

gym,recreat,club hse,indr pool,spa            164
gym, recreat, club house, indoor pool, spa    140
gym, recreat, club house, indoor pool          47
Name: premise, dtype: int64

In [117]:
change_premise(pat,df,'gym_club_house')

In [118]:
df.premise.value_counts()[40:50]

church_temple                         285
school                                271
auto_sale_parts_store                 257
church_temple_parking                 236
libraries_museums                     218
rental storage facility               180
warehouse                             172
university/college                    146
convention center/exhibit halls       136
convention center or exhibit halls    136
Name: premise, dtype: int64

In [119]:
word1 = 'storage'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

rental storage facility    180
Name: premise, dtype: int64

In [120]:
change_premise(pat,df,'storage_facility')

In [121]:
df.premise.value_counts()[40:50]

church_temple                         285
school                                271
auto_sale_parts_store                 257
church_temple_parking                 236
libraries_museums                     218
storage_facility                      180
warehouse                             172
university/college                    146
convention center/exhibit halls       136
convention center or exhibit halls    136
Name: premise, dtype: int64

In [122]:
word1 = 'convention'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

convention center or exhibit halls    136
convention center/exhibit halls       136
Name: premise, dtype: int64

In [123]:
change_premise(pat,df,'convention_center')

In [124]:
df.premise.value_counts()[50:60]

light rail (metro rail) vehicle          117
auto_repair                              116
multi-plex home (duplex,triplex etc.)    114
bank                                     114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
field, woods, forest, park                90
park and ride terminal                    83
Name: premise, dtype: int64

In [125]:
word1 = 'bank'
word2 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

bank/saving institution parking lot        77
bank or savings institution parking lot    44
Name: premise, dtype: int64

In [126]:
change_premise(pat,df,'bank_parking')

In [127]:
word1 = 'bank'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

bank    114
Name: premise, dtype: int64

In [128]:
change_premise(pat,df,'bank')

In [129]:
df.premise.value_counts()[50:60]

bank_parking                             121
light rail (metro rail) vehicle          117
auto_repair                              116
multi-plex home (duplex,triplex etc.)    114
bank                                     114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
field, woods, forest, park                90
Name: premise, dtype: int64

In [130]:
word1 = 'rail'
word2 = 'vehicle'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

light rail (metro rail) vehicle    117
light rail vehicle                   6
Name: premise, dtype: int64

In [131]:
change_premise(pat,df,'rail_vehicle')

In [132]:
word1 = 'rail'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

light rail platform    129
Name: premise, dtype: int64

In [133]:
change_premise(pat,df,'rail_platform')

In [134]:
df.premise.value_counts()[50:60]

rail_vehicle                             123
bank_parking                             121
auto_repair                              116
bank                                     114
multi-plex home (duplex,triplex etc.)    114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
field, woods, forest, park                90
Name: premise, dtype: int64

In [135]:
word1 = 'unknown'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

other/unknown                    1967
other, unknown, or not listed     800
Name: premise, dtype: int64

In [136]:
change_premise(pat,df,'unknown')

In [137]:
word1 = 'unk'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

unk    3598
Name: premise, dtype: int64

In [138]:
change_premise(pat,df,'unknown')

In [139]:
df.premise.value_counts()[50:60]

auto_repair                              116
multi-plex home (duplex,triplex etc.)    114
bank                                     114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
field, woods, forest, park                90
park and ride terminal                    83
physician, doctor, dentist's office       80
Name: premise, dtype: int64

In [140]:
word1 = 'parking'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

other parking lot                      4950
mall parking lot                         48
laundry or dry cleaners parking lot      17
laundry/dry cleaners parking lot         10
Name: premise, dtype: int64

In [141]:
change_premise(pat,df,'other_parking')

In [142]:
df.premise.value_counts()[50:60]

auto_repair                              116
bank                                     114
multi-plex home (duplex,triplex etc.)    114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
field, woods, forest, park                90
park and ride terminal                    83
physician, doctor, dentist's office       80
Name: premise, dtype: int64

In [143]:
# Preview premise labels that contain the word 'garage'.
word1 = 'garage'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

garage or carport    462
garage/carport       322
Name: premise, dtype: int64

In [144]:
# Merge both garage/carport spellings into 'garage_carport'.
change_premise(pat, df, 'garage_carport')

In [145]:
# Re-check labels ranked 51-60 after the garage merge.
df['premise'].value_counts().iloc[50:60]

bank                                     114
multi-plex home (duplex,triplex etc.)    114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
field, woods, forest, park                90
park and ride terminal                    83
physician, doctor, dentist's office       80
field/woods                               77
Name: premise, dtype: int64

In [146]:
# Preview premise labels that contain the word 'condominium'.
word1 = 'condominium'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

condominium    480
Name: premise, dtype: int64

In [147]:
# Treat condominiums as part of the 'apartment' category.
change_premise(pat, df, 'apartment')

In [148]:
# Re-check labels ranked 51-60 after folding condominiums into apartments.
df['premise'].value_counts().iloc[50:60]

bank                                   114
highway/freeway                        112
physician's office                     107
barber and beauty shops                101
liquor_store                            96
field, woods, forest, park              90
park and ride terminal                  83
physician, doctor, dentist's office     80
field/woods                             77
police station                          76
Name: premise, dtype: int64

In [149]:
# Preview premise labels that contain both whole words 'adult' and 'store'.
word1, word2 = 'adult', 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

adult book store/newsstand       1
adult book store or newsstand    1
Name: premise, dtype: int64

In [150]:
# File adult book stores under 'sexually_oriented_business'.
change_premise(pat, df, 'sexually_oriented_business')

In [151]:
# Preview premise labels that contain both 'specialty' and 'store'.
word1, word2 = 'specialty', 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

specialty store (non-specific)    393
Name: premise, dtype: int64

In [152]:
# Normalise the specialty store label to 'specialty_store'.
change_premise(pat, df, 'specialty_store')

In [153]:
# Preview premise labels that contain both 'clothing' and 'store'.
word1, word2 = 'clothing', 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

clothing store    290
Name: premise, dtype: int64

In [154]:
# Normalise the clothing store label to 'clothing_store'.
change_premise(pat, df, 'clothing_store')

In [155]:
# Preview premise labels that contain both 'electronics' and 'store'.
word1, word2 = 'electronics', 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

electronics store, electrical sup.        55
electronics store, electrical supplies    33
Name: premise, dtype: int64

In [156]:
# Merge both electronics-store spellings into 'electronic_store'.
change_premise(pat, df, 'electronic_store')

In [157]:
# Preview whatever labels still contain the bare word 'store'. The already
# relabelled '*_store' values do not match because '_' is a word character.
# NOTE(review): the variant 'furniture, appliances, radios, tv' (no 'store')
# is not matched here and still shows up unmerged in the later tail listing.
word1 = 'store'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

furniture, appliances, radios, tv store    5
Name: premise, dtype: int64

In [158]:
# Fold the furniture/appliances/tv store label into 'electronic_store'.
change_premise(pat, df, 'electronic_store')

In [159]:
# Re-check labels ranked 51-60 after the store merges.
df['premise'].value_counts().iloc[50:60]

multi-plex home (duplex,triplex etc.)    114
highway/freeway                          112
physician's office                       107
barber and beauty shops                  101
liquor_store                              96
electronic_store                          93
field, woods, forest, park                90
park and ride terminal                    83
physician, doctor, dentist's office       80
field/woods                               77
Name: premise, dtype: int64

In [160]:
# Preview premise labels containing the token 'plex' (as in 'multi-plex').
word1 = 'plex'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

multi-plex hme(duplex,triplex etc)       422
multi-plex home (duplex,triplex etc.)    114
Name: premise, dtype: int64

In [161]:
# Merge both multi-plex home spellings into 'multiplex_home'.
change_premise(pat, df, 'multiplex_home')

In [162]:
# Re-check labels ranked 51-60 after the multiplex merge.
df['premise'].value_counts().iloc[50:60]

highway/freeway                        112
physician's office                     107
barber and beauty shops                101
liquor_store                            96
electronic_store                        93
field, woods, forest, park              90
park and ride terminal                  83
physician, doctor, dentist's office     80
field/woods                             77
university or college                   76
Name: premise, dtype: int64

In [163]:
# NOTE(review): disabled exploratory check for 'vacant' premises; no relabel
# step follows it, so this cell is a no-op. Re-enable or delete it.
# word1 = 'vacant'
# pat = r"^(?=.*\b{}\b).*$".format(word1)
# clean_premise(pat,df)

In [164]:
# Labels ranked 51-60 by frequency (nothing has changed since the previous listing).
df['premise'].value_counts().iloc[50:60]

highway/freeway                        112
physician's office                     107
barber and beauty shops                101
liquor_store                            96
electronic_store                        93
field, woods, forest, park              90
park and ride terminal                  83
physician, doctor, dentist's office     80
field/woods                             77
university or college                   76
Name: premise, dtype: int64

In [165]:
# Preview premise labels that contain the word 'warehouse'.
word1 = 'warehouse'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

warehouse    172
Name: premise, dtype: int64

In [166]:
# The only matched label is already 'warehouse', so this keeps the value as is.
change_premise(pat, df, 'warehouse')

In [167]:
# Preview premise labels that contain the word 'college'.
word1 = 'college'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

university/college       146
university or college     76
Name: premise, dtype: int64

In [168]:
# Merge both university/college spellings into 'college'.
change_premise(pat, df, 'college')

In [169]:
# Preview premise labels that contain the word 'highway'.
word1 = 'highway'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

highway/freeway       112
highway or freeway     59
Name: premise, dtype: int64

In [170]:
# Merge both highway/freeway spellings into 'highway_freeway'.
change_premise(pat, df, 'highway_freeway')

In [171]:
# Preview premise labels that contain both 'rental' and 'office'.
word1, word2 = 'rental', 'office'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

apartment/rental office    64
apartment rental office    63
Name: premise, dtype: int64

In [172]:
# Merge both apartment rental office spellings into 'rental_office'.
change_premise(pat, df, 'rental_office')

In [173]:
# Preview premise labels that contain the word 'doctor'.
word1 = 'doctor'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

physician, doctor, dentist's office    80
Name: premise, dtype: int64

In [174]:
# Relabel the physician/doctor/dentist office entry as 'doctor_office'.
change_premise(pat, df, 'doctor_office')

In [175]:
# Inspection only: list what still contains 'office'. Nothing is relabelled
# from this pattern; the narrower two-word patterns that follow handle it.
word1 = 'office'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

physician's office                           107
book,record,stationary,office sup.            13
book, record, stationary, office supplies      3
Name: premise, dtype: int64

In [176]:
# Preview premise labels that contain both 'book' and 'office'.
word1, word2 = 'book', 'office'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

book,record,stationary,office sup.           13
book, record, stationary, office supplies     3
Name: premise, dtype: int64

In [177]:
# Merge both book/stationery/office-supply spellings into 'office_supplies'.
change_premise(pat, df, 'office_supplies')

In [178]:
# Preview premise labels that contain both 'physician' and 'office'.
word1, word2 = 'physician', 'office'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

physician's office    107
Name: premise, dtype: int64

In [179]:
# Fold "physician's office" into the 'doctor_office' label as well.
change_premise(pat, df, 'doctor_office')

In [180]:
# Move on to the labels ranked 61-70 by frequency.
df['premise'].value_counts().iloc[60:70]

social services/public charities             71
laundry/dry cleaners/washaterias             66
alley                                        66
freeway service road                         62
amusement park, bowling alley, skate rink    59
sexually_oriented_business                   54
high rise                                    53
car wash                                     51
theatres,dinner theaters,auditor.            50
---                                          48
Name: premise, dtype: int64

In [181]:
# Preview premise labels that contain the word 'field'.
word1 = 'field'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

field, woods, forest, park    90
field/woods                   77
Name: premise, dtype: int64

In [182]:
# Merge both field/woods spellings into 'field_woods_park'.
change_premise(pat, df, 'field_woods_park')

In [183]:
# Review the 60 most frequent premise labels after the merges so far.
df['premise'].value_counts().iloc[:60]

road_street_sidewalk                14608
house                                6907
apartment                            6703
unknown                              6365
apartment_parking                    6248
other_parking                        5025
commercial_parking                   4421
hospital                             2192
restaurant_parking                   2158
building                             1884
driveway                             1876
bar_club                             1761
supermarket                          1475
restaurant                           1417
department_store                     1373
gas_station                          1334
bar_club_parking                     1262
hotel_motel_parking                  1235
hotel_motel                          1199
business                             1051
park_rec_pool                        1043
convenience_store                     918
bus_stop_station                      863
garage_carport                    

In [184]:
# Preview premise labels that contain the word 'cleaners'.
word1 = 'cleaners'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

laundry/dry cleaners/washaterias      66
laundry, dry cleaners, washaterias    39
Name: premise, dtype: int64

In [185]:
# Merge both laundry/dry cleaners spellings into 'dry_cleaners'.
change_premise(pat, df, 'dry_cleaners')

In [186]:
# Preview premise labels that contain both 'park' and 'ride'.
word1, word2 = 'park', 'ride'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1, word2)
clean_premise(pat, df)

park and ride terminal    83
park & ride terminal      25
Name: premise, dtype: int64

In [187]:
# Merge both park-and-ride terminal spellings into 'park_ride'.
change_premise(pat, df, 'park_ride')

In [188]:
# Preview premise labels that contain the word 'police'.
word1 = 'police'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

police station    76
Name: premise, dtype: int64

In [189]:
# Normalise the police station label to 'police_station'.
change_premise(pat, df, 'police_station')

In [190]:
# Preview premise labels that contain the word 'credit'.
word1 = 'credit'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

credit union    7
Name: premise, dtype: int64

In [191]:
# Count credit unions as 'bank'.
change_premise(pat, df, 'bank')

In [192]:
# Preview premise labels that contain the word 'package'.
word1 = 'package'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

package facility (fedex,ups,dhl)    5
Name: premise, dtype: int64

In [193]:
# Normalise the package facility label to 'package_facility'.
change_premise(pat, df, 'package_facility')

In [194]:
# Preview premise labels that contain the word 'barber'.
word1 = 'barber'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

barber and beauty shops    101
Name: premise, dtype: int64

In [195]:
# Normalise the barber and beauty shops label to 'barber_shop'.
change_premise(pat, df, 'barber_shop')

In [196]:
# Preview premise labels containing the standalone word 'pool'
# ('park_rec_pool' is not matched: '_' is a word character).
word1 = 'pool'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

pool hall/game room       9
pool hall or game room    3
Name: premise, dtype: int64

In [197]:
# Merge both pool hall/game room spellings into 'pool_hall'.
change_premise(pat, df, 'pool_hall')

In [198]:
# Preview premise labels that contain the word 'video'.
word1 = 'video'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

video rental & sales    3
Name: premise, dtype: int64

In [199]:
# Normalise the video rental & sales label to 'video_rental'.
change_premise(pat, df, 'video_rental')

In [200]:
# Preview premise labels containing the standalone word 'care'
# (the daycare labels match through 'child care').
word1 = 'care'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

daycare/child care/kindergarten         34
daycare, child care, or kindergarten    22
Name: premise, dtype: int64

In [201]:
# Merge both daycare/child care/kindergarten spellings into 'daycare'.
change_premise(pat, df, 'daycare')

In [202]:
# Number of distinct premise labels remaining.
df['premise'].nunique()

115

In [203]:
# Look at the 30 rarest premise labels for further merge candidates.
df['premise'].value_counts().iloc[-30:]

pawn/resale shop/flea market             20
jail/prison                              19
utility company,electric,gas,water       18
office_supplies                          16
pawn, resale shop, or flea market        16
toys,arts & craft,musical,bike,pet       14
body shop                                13
pool_hall                                12
jail or prison                           11
train (not metro) terminal               10
railroad track/right of way              10
lake/waterway/bayou                       9
mobile home                               9
savings and loan institutions             8
railroad track or right of way            7
furniture, appliances, radios, tv         7
sporting goods/gun shops                  7
not listed                                5
package_facility                          5
garden supply, nursery, florist           5
lake, pond, waterway, bayou, river        5
garden supply, nursery, or florist        4
n                               

In [204]:
# Preview premise labels that contain the word 'prison'.
word1 = 'prison'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

jail/prison       19
jail or prison    11
Name: premise, dtype: int64

In [205]:
# Merge both jail/prison spellings into 'jail_prison'.
change_premise(pat, df, 'jail_prison')

In [206]:
# Preview premise labels that contain the word 'loan'.
word1 = 'loan'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

savings and loan institutions    8
Name: premise, dtype: int64

In [207]:
# Count savings and loan institutions as 'bank'.
change_premise(pat, df, 'bank')

In [208]:
# Preview premise labels that contain the word 'pawn'.
word1 = 'pawn'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

pawn/resale shop/flea market         20
pawn, resale shop, or flea market    16
Name: premise, dtype: int64

In [209]:
# Merge both pawn/resale shop/flea market spellings into 'pawn_shop'.
change_premise(pat, df, 'pawn_shop')

In [210]:
# Preview premise labels that contain the word 'mobile'.
word1 = 'mobile'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

mobile home    9
Name: premise, dtype: int64

In [211]:
# Normalise the mobile home label to 'mobile_home'.
change_premise(pat, df, 'mobile_home')

In [212]:
# Preview labels containing the standalone word 'shop'. Order matters: the
# pawn labels ("... resale shop ...") must already be relabelled, otherwise
# they would match here too.
word1 = 'shop'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

body shop    13
Name: premise, dtype: int64

In [213]:
# Normalise the body shop label to 'body_shop'.
change_premise(pat, df, 'body_shop')

In [214]:
# List every label from rank 61 onward to see what is still unmerged.
df['premise'].value_counts().iloc[60:]

social services/public charities             71
alley                                        66
freeway service road                         62
amusement park, bowling alley, skate rink    59
daycare                                      56
sexually_oriented_business                   54
high rise                                    53
car wash                                     51
theatres,dinner theaters,auditor.            50
---                                          48
nursing home                                 48
theatres, dinner theaters, auditoriums       46
amuse. park,bowl. alley,skate rink           43
rehabilitation center                        42
social services or public charities          39
pawn_shop                                    36
mall common area                             34
factory/manufacturing/industrial             32
apartment laundry                            31
jail_prison                                  30
airport terminal                        

In [215]:
# Preview premise labels that contain the word 'railroad'.
word1 = 'railroad'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

railroad track/right of way       10
railroad track or right of way     7
Name: premise, dtype: int64

In [216]:
# Merge both railroad track/right-of-way spellings into 'rail_road'.
change_premise(pat, df, 'rail_road')

In [217]:
# Preview premise labels that contain the word 'rise'.
word1 = 'rise'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

high rise    53
Name: premise, dtype: int64

In [218]:
# Normalise the high rise label to 'high_rise'.
change_premise(pat, df, 'high_rise')

In [219]:
# Preview premise labels that contain the word 'check'.
word1 = 'check'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

check cashing places    20
Name: premise, dtype: int64

In [220]:
# Normalise the check cashing places label to 'check_cash'.
change_premise(pat, df, 'check_cash')

In [221]:
# Preview premise labels that contain the standalone word 'center'.
word1 = 'center'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat, df)

rehabilitation center    42
Name: premise, dtype: int64

In [222]:
# Normalise the rehabilitation center label to 'rehab_center'.
change_premise(pat, df, 'rehab_center')

In [223]:
# Final count of distinct premise labels after this round of merges.
df['premise'].nunique()

111

In [224]:
# Spot-check the first five rows with the cleaned premise column.
df.head(n=5)

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,1916-05-23,19,10H70,Aggravated Assault,UNK,LIBERTY ROAD,house,1,-
1,1917-02-20,16,10H70,Theft,7500-7599,ARDMORE,other_parking,1,ST
2,1963-02-02,13,10H40,Theft,3800-3899,MAIN,rehab_center,1,ST
3,1966-01-01,0,10H50,Rape,3300-3399,ALABAMA,apartment,1,ST
4,1971-02-03,6,1A10,Theft,1200-1299,TRAVIS,house,1,-


In [225]:
# Persist the frame with the cleaned premise labels for the next stage.
# index=False keeps the RangeIndex out of the file, so a plain pd.read_csv does
# not pick up a spurious 'Unnamed: 0' column; the input crime_beats.csv is
# loaded the same way and has no such column.
# NOTE(review): several near-duplicate labels (e.g. the social services,
# theatres and amusement park variants) are still unmerged when this is saved.
df.to_csv(os.path.join(data_directory_saves, "crime_beats_02.csv"), index=False)