In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns
import re

In [3]:
data_folder = 'merge_data'

In [4]:
# data folder path
data_directory = os.path.join('..','data','clean_data/{}/crime_clean_01.csv'.format(data_folder))
data_directory_saves = os.path.join( '..','data','clean_data','merge_data/')

In [5]:
df = pd.read_csv(data_directory)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006227 entries, 0 to 1006226
Data columns (total 9 columns):
date            1006227 non-null object
hour            1006227 non-null int64
beat            1006227 non-null object
offense_type    1006227 non-null object
block_range     1006227 non-null object
street_name     1006225 non-null object
premise         1006227 non-null object
num_offenses    1006227 non-null int64
type            1006227 non-null object
dtypes: int64(2), object(7)
memory usage: 69.1+ MB


In [7]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,Restaurant or Cafeteria,1,PKWY
1,1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,Miscellaneous Business (Non-Specific),1,BLVD
2,1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,UNK,1,FWY
3,1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,Other Parking Lot,1,-
4,1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,Apartment Parking Lot,1,DR


## cleanup premise column

In [8]:
len(df.premise.unique())

384

In [9]:
df.premise.value_counts()[:10]

APARTMENT                    74377
RESIDENCE/HOUSE              74346
APARTMENT PARKING LOT        65458
Residence or House           54141
Apartment Parking Lot        44505
ROAD/STREET/SIDEWALK         41659
Road, Street, or Sidewalk    38983
Apartment                    38773
DRIVEWAY                     37014
UNK                          35433
Name: premise, dtype: int64

## lowercase all

In [10]:
df.premise = df.premise.str.lower()

In [11]:
len(df.premise.unique())

326

In [12]:
df.premise.value_counts()[:10]

apartment                    113150
apartment parking lot        109963
residence/house               74346
residence or house            54141
driveway                      52267
other parking lot             44210
road/street/sidewalk          41659
road, street, or sidewalk     38983
unk                           35433
department/discount store     24960
Name: premise, dtype: int64

In [13]:
df.premise = df.premise.str.strip()

In [14]:
len(df.premise.unique())

206

In [15]:
df.premise.value_counts()[:10]

apartment                    114203
apartment parking lot        111134
residence/house               74346
residence or house            55371
driveway                      52736
other parking lot             44765
road/street/sidewalk          41659
road, street, or sidewalk     40095
unk                           35433
department/discount store     24960
Name: premise, dtype: int64

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006227 entries, 0 to 1006226
Data columns (total 9 columns):
date            1006227 non-null object
hour            1006227 non-null int64
beat            1006227 non-null object
offense_type    1006227 non-null object
block_range     1006227 non-null object
street_name     1006225 non-null object
premise         1006227 non-null object
num_offenses    1006227 non-null int64
type            1006227 non-null object
dtypes: int64(2), object(7)
memory usage: 69.1+ MB


## Create two function
- display results based on regex pattern
- change results based on regex pattern to new value

In [17]:
def clean_premise(pat,df):
    fil = df.premise.str.contains(pat,regex=True)
    result = df[fil].premise.value_counts()
    return result

In [18]:
def change_premise(pat,df,new_premise):
    df.premise = df.premise.replace(pat,new_premise,regex=True)

In [19]:
word1 = 'vacant'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

vacant single occupancy residence (houses,townhouses,duplexes, etc.)                  2104
vacant single occ resd(house,townhs,dplex)                                            1547
vacant building (commercial)                                                           836
vacant other residential (apartment,dorms)                                             420
vacant other residential (apartment,inn,dorms,boarding house)                          382
vacant storage fac (barn,garage,warehouse)                                             342
vacant grocery store or supermarket                                                    172
vacant storage facility (barns,garages,warehouses,etc.)                                159
vacant other out build/monument/underconst                                             157
vacant restaurant                                                                      151
vacant other structure (out buildings,monuments,buildings under construction,etc.)     140

In [20]:
change_premise(pat,df,'vacant_structure')

## road

In [21]:
word1 = 'road'
word2 = 'street'

In [22]:
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

road/street/sidewalk         41659
road, street, or sidewalk    40095
Name: premise, dtype: int64

In [23]:
change_premise(pat,df,'road_street_sidewalk')

In [24]:
df.premise.value_counts()[:10]

apartment                            114203
apartment parking lot                111134
road_street_sidewalk                  81754
residence/house                       74346
residence or house                    55371
driveway                              52736
other parking lot                     44765
unk                                   35433
department/discount store             24960
strip business center parking lot     18597
Name: premise, dtype: int64

## apartment parking

In [25]:
word1 = 'apartment'
word2 = 'parking'

In [26]:
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

apartment parking lot    111134
Name: premise, dtype: int64

In [27]:
change_premise(pat,df,'apartment_parking')

In [28]:
df.premise.value_counts()[:10]

apartment                            114203
apartment_parking                    111134
road_street_sidewalk                  81754
residence/house                       74346
residence or house                    55371
driveway                              52736
other parking lot                     44765
unk                                   35433
department/discount store             24960
strip business center parking lot     18597
Name: premise, dtype: int64

## residence

In [29]:
word1 = 'residence'
word2 = 'house'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

residence/house       74346
residence or house    55371
Name: premise, dtype: int64

In [30]:
change_premise(pat,df,'house')

In [31]:
df.premise.value_counts()[:10]

house                                129717
apartment                            114203
apartment_parking                    111134
road_street_sidewalk                  81754
driveway                              52736
other parking lot                     44765
unk                                   35433
department/discount store             24960
strip business center parking lot     18597
restaurant/cafeteria parking lot      16952
Name: premise, dtype: int64

## commercial parking

In [32]:
word1 = 'commercial'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

commercial parking lot/garage       13878
commercial parking lot or garage    10451
Name: premise, dtype: int64

In [33]:
change_premise(pat,df,'commercial_parking')

In [34]:
df.premise.value_counts()[:10]

house                                129717
apartment                            114203
apartment_parking                    111134
road_street_sidewalk                  81754
driveway                              52736
other parking lot                     44765
unk                                   35433
department/discount store             24960
commercial_parking                    24329
strip business center parking lot     18597
Name: premise, dtype: int64

## restaurant parking

In [35]:
word1 = 'restaurant'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

restaurant/cafeteria parking lot       16952
restaurant or cafeteria parking lot    15517
Name: premise, dtype: int64

In [36]:
change_premise(pat,df,'restaurant_parking')

In [37]:
df.premise.value_counts()[:10]

house                        129717
apartment                    114203
apartment_parking            111134
road_street_sidewalk          81754
driveway                      52736
other parking lot             44765
unk                           35433
restaurant_parking            32469
department/discount store     24960
commercial_parking            24329
Name: premise, dtype: int64

In [38]:
word1 = 'club'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

bar/night club parking lot       6172
bar or night club parking lot    3927
Name: premise, dtype: int64

In [39]:
change_premise(pat,df,'bar_club_parking')

In [40]:
df.premise.value_counts()[:10]

house                        129717
apartment                    114203
apartment_parking            111134
road_street_sidewalk          81754
driveway                      52736
other parking lot             44765
unk                           35433
restaurant_parking            32469
department/discount store     24960
commercial_parking            24329
Name: premise, dtype: int64

In [41]:
word1 = 'club'
word2 = 'bar'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

bar/night club       5603
bar or night club    3403
Name: premise, dtype: int64

In [42]:
change_premise(pat,df,'bar_club')

In [43]:
df.premise.value_counts()[:10]

house                        129717
apartment                    114203
apartment_parking            111134
road_street_sidewalk          81754
driveway                      52736
other parking lot             44765
unk                           35433
restaurant_parking            32469
department/discount store     24960
commercial_parking            24329
Name: premise, dtype: int64

In [44]:
word1 = 'grocery'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

grocery/supermarket parking lot             6596
grocery store or supermarket parking lot    5298
Name: premise, dtype: int64

In [45]:
change_premise(pat,df,'supermarket_parking')

In [46]:
df.premise.value_counts()[:10]

house                        129717
apartment                    114203
apartment_parking            111134
road_street_sidewalk          81754
driveway                      52736
other parking lot             44765
unk                           35433
restaurant_parking            32469
department/discount store     24960
commercial_parking            24329
Name: premise, dtype: int64

In [47]:
word1 = 'grocery'
word2 = 'supermarket'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

grocery/supermarket             13173
grocery store or supermarket     7931
Name: premise, dtype: int64

In [48]:
change_premise(pat,df,'supermarket')

In [49]:
df.premise.value_counts()[:10]

house                        129717
apartment                    114203
apartment_parking            111134
road_street_sidewalk          81754
driveway                      52736
other parking lot             44765
unk                           35433
restaurant_parking            32469
department/discount store     24960
commercial_parking            24329
Name: premise, dtype: int64

In [50]:



word1 = 'department'
word2 = 'discount'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

department/discount store       24960
department or discount store    13194
Name: premise, dtype: int64

In [51]:
change_premise(pat,df,'department_store')

In [52]:
df.premise.value_counts()[:10]

house                   129717
apartment               114203
apartment_parking       111134
road_street_sidewalk     81754
driveway                 52736
other parking lot        44765
department_store         38154
unk                      35433
restaurant_parking       32469
commercial_parking       24329
Name: premise, dtype: int64

In [53]:
word1 = 'hotel'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

hotel/motel parking lot       7476
hotel or motel parking lot    5923
Name: premise, dtype: int64

In [54]:
change_premise(pat,df,'hotel_motel_parking')

In [55]:
df.premise.value_counts()[:10]

house                   129717
apartment               114203
apartment_parking       111134
road_street_sidewalk     81754
driveway                 52736
other parking lot        44765
department_store         38154
unk                      35433
restaurant_parking       32469
commercial_parking       24329
Name: premise, dtype: int64

In [56]:
word1 = 'hotel'
word2 = 'motel'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

hotel/motel/etc.           4630
hotel, motel, inn, etc.    4150
Name: premise, dtype: int64

In [57]:
change_premise(pat,df,'hotel_motel')

In [58]:
df.premise.value_counts()[:10]

house                   129717
apartment               114203
apartment_parking       111134
road_street_sidewalk     81754
driveway                 52736
other parking lot        44765
department_store         38154
unk                      35433
restaurant_parking       32469
commercial_parking       24329
Name: premise, dtype: int64

In [59]:
word1 = 'restaurant'
word2 = 'cafeteria'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

restaurant/cafeteria       9108
restaurant or cafeteria    6198
Name: premise, dtype: int64

In [60]:
change_premise(pat,df,'restaurant')

In [61]:
df.premise.value_counts()[:10]

house                   129717
apartment               114203
apartment_parking       111134
road_street_sidewalk     81754
driveway                 52736
other parking lot        44765
department_store         38154
unk                      35433
restaurant_parking       32469
commercial_parking       24329
Name: premise, dtype: int64

In [62]:
word1 = 'gas'
word2 = 'station'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

service or gas station    9570
service/gas station       7680
Name: premise, dtype: int64

In [63]:
change_premise(pat,df,'gas_station')

In [64]:
df.premise.value_counts()[:10]

house                   129717
apartment               114203
apartment_parking       111134
road_street_sidewalk     81754
driveway                 52736
other parking lot        44765
department_store         38154
unk                      35433
restaurant_parking       32469
commercial_parking       24329
Name: premise, dtype: int64

In [65]:
word1 = 'hospital'
word2 = 'parking'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

hospital parking lot    2071
Name: premise, dtype: int64

In [66]:
change_premise(pat,df,'hospital_parking')

In [67]:
df.premise.value_counts()[10:20]

supermarket                              21104
strip business center parking lot        18597
gas_station                              17250
other/unknown                            16634
restaurant                               15306
hotel_motel_parking                      13399
convenience store                        12957
supermarket_parking                      11894
miscellaneous business (non-specific)    10682
bar_club_parking                         10099
Name: premise, dtype: int64

In [68]:
word1 = 'convenience'
word2 = 'store'
word3 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2,word3)
clean_premise(pat,df)

convenience store parking lot    8474
Name: premise, dtype: int64

In [69]:
change_premise(pat,df,'convenience_store_parking')

In [70]:
word1 = 'convenience'
word2 = 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

convenience store    12957
Name: premise, dtype: int64

In [71]:
change_premise(pat,df,'convenience_store')

In [72]:
df.premise.value_counts()[20:30]

bar_club                          9006
hotel_motel                       8780
convenience_store_parking         8474
commercial building               7771
specialty store (non-specific)    7757
mall parking lot                  7445
other, unknown, or not listed     7197
vacant_structure                  6851
office building                   5436
clothing store                    4987
Name: premise, dtype: int64

In [73]:
word1 = 'building'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

commercial building                 7771
office building                     5436
government/public building           497
government or public building        406
maintenance/building services        341
maintenance or building services     238
Name: premise, dtype: int64

In [74]:
change_premise(pat,df,'building')

In [75]:
df.premise.value_counts()[20:30]

bar_club_parking                  10099
bar_club                           9006
hotel_motel                        8780
convenience_store_parking          8474
specialty store (non-specific)     7757
mall parking lot                   7445
other, unknown, or not listed      7197
vacant_structure                   6851
clothing store                     4987
misc. business (non-specific)      4857
Name: premise, dtype: int64

In [76]:
word1 = 'construction'
word2 = 'site'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

construction site    3849
Name: premise, dtype: int64

In [77]:
change_premise(pat,df,'construction_site')

In [78]:
df.premise.value_counts()[20:30]

bar_club_parking                  10099
bar_club                           9006
hotel_motel                        8780
convenience_store_parking          8474
specialty store (non-specific)     7757
mall parking lot                   7445
other, unknown, or not listed      7197
vacant_structure                   6851
clothing store                     4987
misc. business (non-specific)      4857
Name: premise, dtype: int64

In [79]:
word1 = 'bus'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

bus stop       2169
bus station    1006
Name: premise, dtype: int64

In [80]:
change_premise(pat,df,'bus_stop_station')

In [81]:
df.premise.value_counts()[20:30]

bar_club_parking                  10099
bar_club                           9006
hotel_motel                        8780
convenience_store_parking          8474
specialty store (non-specific)     7757
mall parking lot                   7445
other, unknown, or not listed      7197
vacant_structure                   6851
clothing store                     4987
misc. business (non-specific)      4857
Name: premise, dtype: int64

In [82]:
word1 = 'parks'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

parks and recreation, zoo, swimming pool    3336
parks & recreation, zoo, swim pool          3023
Name: premise, dtype: int64

In [83]:
change_premise(pat,df,'park_rec_pool')

In [84]:
df.premise.value_counts()[20:30]

bar_club_parking                  10099
bar_club                           9006
hotel_motel                        8780
convenience_store_parking          8474
specialty store (non-specific)     7757
mall parking lot                   7445
other, unknown, or not listed      7197
vacant_structure                   6851
park_rec_pool                      6359
clothing store                     4987
Name: premise, dtype: int64

In [85]:
word1 = 'sexually'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

sexually oriented club                    758
sexually oriented business parking lot    659
Name: premise, dtype: int64

In [86]:
change_premise(pat,df,'sexually_oriented_business')

In [87]:
word1 = 'business'
word2 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

strip business center parking lot    18597
Name: premise, dtype: int64

In [88]:
change_premise(pat,df,'business_center_parking')

In [89]:
word1 = 'business'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

miscellaneous business (non-specific)    10682
misc. business (non-specific)             4857
Name: premise, dtype: int64

In [90]:
change_premise(pat,df,'business')

In [91]:
df.premise.value_counts()[30:40]

vehicle/auto sales/lease/auto parts store    4447
rental storage facility                      4120
condominium                                  3882
construction_site                            3849
airport terminal                             3781
garage/carport                               3720
hospital                                     3498
multi-plex hme(duplex,triplex etc)           3188
bus_stop_station                             3175
drug store/medical supply                    3126
Name: premise, dtype: int64

In [92]:
word1 = 'stadium'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

stadium, sports arena, race track    449
stadium/sprts arena/race track       423
Name: premise, dtype: int64

In [93]:
change_premise(pat,df,'stadium_arena_track')

In [94]:
df.premise.value_counts()[30:40]

vehicle/auto sales/lease/auto parts store    4447
rental storage facility                      4120
condominium                                  3882
construction_site                            3849
airport terminal                             3781
garage/carport                               3720
hospital                                     3498
multi-plex hme(duplex,triplex etc)           3188
bus_stop_station                             3175
drug store/medical supply                    3126
Name: premise, dtype: int64

In [95]:
word1 = 'drug'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

drug store/medical supply       3126
drug store or medical supply    1739
Name: premise, dtype: int64

In [96]:
change_premise(pat,df,'drug_store')

In [97]:
df.premise.value_counts()[30:40]

drug_store                                   4865
vehicle/auto sales/lease/auto parts store    4447
rental storage facility                      4120
condominium                                  3882
construction_site                            3849
airport terminal                             3781
garage/carport                               3720
hospital                                     3498
multi-plex hme(duplex,triplex etc)           3188
bus_stop_station                             3175
Name: premise, dtype: int64

In [98]:
word1 = 'liquor'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

liquor store                971
liquor store parking lot    289
Name: premise, dtype: int64

In [99]:
change_premise(pat,df,'liquor_store')

In [100]:
df.premise.value_counts()[30:40]

drug_store                                   4865
vehicle/auto sales/lease/auto parts store    4447
rental storage facility                      4120
condominium                                  3882
construction_site                            3849
airport terminal                             3781
garage/carport                               3720
hospital                                     3498
multi-plex hme(duplex,triplex etc)           3188
bus_stop_station                             3175
Name: premise, dtype: int64

In [101]:
word1 = 'auto'
word2 = 'repair'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

auto repair    2816
Name: premise, dtype: int64

In [102]:
change_premise(pat,df,'auto_repair')

In [103]:
word1 = 'auto'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

vehicle/auto sales/lease/auto parts store    4447
Name: premise, dtype: int64

In [104]:
change_premise(pat,df,'auto_sale_parts_store')

In [105]:
df.premise.value_counts()[30:40]

drug_store                            4865
auto_sale_parts_store                 4447
rental storage facility               4120
condominium                           3882
construction_site                     3849
airport terminal                      3781
garage/carport                        3720
hospital                              3498
multi-plex hme(duplex,triplex etc)    3188
bus_stop_station                      3175
Name: premise, dtype: int64

In [106]:
word1 = 'school'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

high school                      2061
middle school                    1199
elementary school                1102
private school                    548
commercial or training school     221
Name: premise, dtype: int64

In [107]:
change_premise(pat,df,'school')

In [108]:
word1 = 'libraries'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

libraries, museums    624
Name: premise, dtype: int64

In [109]:
change_premise(pat,df,'libraries_museums')

In [110]:
df.premise.value_counts()[40:50]

bus_stop_station                       3175
garage or carport                      3038
mall common area                       2937
auto_repair                            2816
warehouse                              2503
barber and beauty shops                2136
church/synagogue/temple                2073
hospital_parking                       2071
church/synagogue/temple parking lot    1871
electronics store, electrical sup.     1787
Name: premise, dtype: int64

In [111]:
word1 = 'church'
word2 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

church/synagogue/temple parking lot         1871
church, synagogue, or temple parking lot    1037
Name: premise, dtype: int64

In [112]:
change_premise(pat,df,'church_temple_parking')

In [113]:
word1 = 'church'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

church/synagogue/temple         2073
church, synagogue, or temple    1185
Name: premise, dtype: int64

In [114]:
change_premise(pat,df,'church_temple')

In [115]:
df.premise.value_counts()[40:50]

multi-plex hme(duplex,triplex etc)    3188
bus_stop_station                      3175
garage or carport                     3038
mall common area                      2937
church_temple_parking                 2908
auto_repair                           2816
warehouse                             2503
barber and beauty shops               2136
hospital_parking                      2071
electronics store, electrical sup.    1787
Name: premise, dtype: int64

In [116]:
word1 = 'gym'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

gym,recreat,club hse,indr pool,spa            1545
gym, recreat, club house, indoor pool, spa    1421
gym, recreat, club house, indoor pool          428
Name: premise, dtype: int64

In [117]:
change_premise(pat,df,'gym_club_house')

In [118]:
df.premise.value_counts()[40:50]

church_temple                         3258
multi-plex hme(duplex,triplex etc)    3188
bus_stop_station                      3175
garage or carport                     3038
mall common area                      2937
church_temple_parking                 2908
auto_repair                           2816
warehouse                             2503
barber and beauty shops               2136
hospital_parking                      2071
Name: premise, dtype: int64

In [119]:
word1 = 'storage'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

rental storage facility    4120
Name: premise, dtype: int64

In [120]:
change_premise(pat,df,'storage_facility')

In [121]:
df.premise.value_counts()[40:50]

church_temple                         3258
multi-plex hme(duplex,triplex etc)    3188
bus_stop_station                      3175
garage or carport                     3038
mall common area                      2937
church_temple_parking                 2908
auto_repair                           2816
warehouse                             2503
barber and beauty shops               2136
hospital_parking                      2071
Name: premise, dtype: int64

In [122]:
word1 = 'convention'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

convention center or exhibit halls    187
convention center/exhibit halls       164
Name: premise, dtype: int64

In [123]:
change_premise(pat,df,'convention_center')

In [124]:
df.premise.value_counts()[50:60]

electronics store, electrical sup.        1787
bank                                      1511
sexually_oriented_business                1417
car wash                                  1274
liquor_store                              1260
bank/saving institution parking lot       1135
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
Name: premise, dtype: int64

In [125]:
word1 = 'bank'
word2 = 'parking'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

bank/saving institution parking lot        1135
bank or savings institution parking lot     779
Name: premise, dtype: int64

In [126]:
change_premise(pat,df,'bank_parking')

In [127]:
word1 = 'bank'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

bank    1511
Name: premise, dtype: int64

In [128]:
change_premise(pat,df,'bank')

In [129]:
df.premise.value_counts()[50:60]

bank_parking                              1914
electronics store, electrical sup.        1787
bank                                      1511
sexually_oriented_business                1417
car wash                                  1274
liquor_store                              1260
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
Name: premise, dtype: int64

In [130]:
word1 = 'rail'
word2 = 'vehicle'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

light rail (metro rail) vehicle    183
light rail vehicle                   6
Name: premise, dtype: int64

In [131]:
change_premise(pat,df,'rail_vehicle')

In [132]:
word1 = 'rail'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

light rail platform    182
Name: premise, dtype: int64

In [133]:
change_premise(pat,df,'rail_platform')

In [134]:
df.premise.value_counts()[50:60]

bank_parking                              1914
electronics store, electrical sup.        1787
bank                                      1511
sexually_oriented_business                1417
car wash                                  1274
liquor_store                              1260
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
Name: premise, dtype: int64

In [135]:
word1 = 'unknown'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

other/unknown                    16634
other, unknown, or not listed     7197
Name: premise, dtype: int64

In [136]:
change_premise(pat,df,'unknown')

In [137]:
word1 = 'unk'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

unk    35433
Name: premise, dtype: int64

In [138]:
change_premise(pat,df,'unknown')

In [139]:
df.premise.value_counts()[50:60]

bank                                      1511
sexually_oriented_business                1417
car wash                                  1274
liquor_store                              1260
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
field, woods, forest, park                1003
apartment rental office                    933
Name: premise, dtype: int64

In [140]:
word1 = 'parking'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

other parking lot                      44765
mall parking lot                        7445
laundry/dry cleaners parking lot         300
laundry or dry cleaners parking lot      238
Name: premise, dtype: int64

In [141]:
change_premise(pat,df,'other_parking')

In [142]:
df.premise.value_counts()[50:60]

sexually_oriented_business                1417
car wash                                  1274
liquor_store                              1260
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
field, woods, forest, park                1003
apartment rental office                    933
highway/freeway                            926
Name: premise, dtype: int64

In [143]:
word1 = 'garage'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

garage/carport       3720
garage or carport    3038
Name: premise, dtype: int64

In [144]:
change_premise(pat,df,'garage_carport')

In [145]:
df.premise.value_counts()[50:60]

car wash                                  1274
liquor_store                              1260
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
field, woods, forest, park                1003
apartment rental office                    933
highway/freeway                            926
freeway service road                       887
Name: premise, dtype: int64

In [146]:
word1 = 'condominium'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

condominium    3882
Name: premise, dtype: int64

In [147]:
change_premise(pat,df,'apartment')

In [148]:
df.premise.value_counts()[50:60]

liquor_store                              1260
apartment/rental office                   1067
electronics store, electrical supplies    1065
laundry/dry cleaners/washaterias          1057
mobile home                               1053
field, woods, forest, park                1003
apartment rental office                    933
highway/freeway                            926
freeway service road                       887
stadium_arena_track                        872
Name: premise, dtype: int64

In [149]:
word1 = 'adult'
word2 = 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

adult book store/newsstand       132
adult book store or newsstand     83
adult novelty store               15
Name: premise, dtype: int64

In [150]:
change_premise(pat,df,'sexually_oriented_business')

In [151]:
word1 = 'specialty'
word2 = 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

specialty store (non-specific)    7757
Name: premise, dtype: int64

In [152]:
change_premise(pat,df,'specialty_store')

In [153]:
word1 = 'clothing'
word2 = 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

clothing store    4987
Name: premise, dtype: int64

In [154]:
change_premise(pat,df,'clothing_store')

In [155]:
word1 = 'electronics'
word2 = 'store'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

electronics store, electrical sup.        1787
electronics store, electrical supplies    1065
Name: premise, dtype: int64

In [156]:
change_premise(pat,df,'electronic_store')

In [157]:
word1 = 'store'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

furniture, appliances, radios, tv store    226
Name: premise, dtype: int64

In [158]:
change_premise(pat,df,'electronic_store')

In [159]:
df.premise.value_counts()[50:60]

liquor_store                             1260
apartment/rental office                  1067
laundry/dry cleaners/washaterias         1057
mobile home                              1053
field, woods, forest, park               1003
apartment rental office                   933
highway/freeway                           926
freeway service road                      887
stadium_arena_track                       872
multi-plex home (duplex,triplex etc.)     860
Name: premise, dtype: int64

In [160]:
word1 = 'plex'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

multi-plex hme(duplex,triplex etc)       3188
multi-plex home (duplex,triplex etc.)     860
Name: premise, dtype: int64

In [161]:
change_premise(pat,df,'multiplex_home')

In [162]:
df.premise.value_counts()[50:60]

liquor_store                         1260
apartment/rental office              1067
laundry/dry cleaners/washaterias     1057
mobile home                          1053
field, woods, forest, park           1003
apartment rental office               933
highway/freeway                       926
freeway service road                  887
stadium_arena_track                   872
pawn, resale shop, or flea market     846
Name: premise, dtype: int64

In [163]:
# word1 = 'vacant'
# pat = r"^(?=.*\b{}\b).*$".format(word1)
# clean_premise(pat,df)

In [164]:
df.premise.value_counts()[50:60]

liquor_store                         1260
apartment/rental office              1067
laundry/dry cleaners/washaterias     1057
mobile home                          1053
field, woods, forest, park           1003
apartment rental office               933
highway/freeway                       926
freeway service road                  887
stadium_arena_track                   872
pawn, resale shop, or flea market     846
Name: premise, dtype: int64

In [165]:
word1 = 'warehouse'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

warehouse    2503
Name: premise, dtype: int64

In [166]:
change_premise(pat,df,'warehouse')

In [167]:
word1 = 'college'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

university/college       289
university or college    138
Name: premise, dtype: int64

In [168]:
change_premise(pat,df,'college')

In [169]:
word1 = 'highway'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

highway/freeway       926
highway or freeway    519
Name: premise, dtype: int64

In [170]:
change_premise(pat,df,'highway_freeway')

In [171]:
word1 = 'rental'
word2 = 'office'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

apartment/rental office    1067
apartment rental office     933
Name: premise, dtype: int64

In [172]:
change_premise(pat,df,'rental_office')

In [173]:
word1 = 'doctor'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

physician, doctor, dentist's office    639
Name: premise, dtype: int64

In [174]:
change_premise(pat,df,'doctor_office')

In [175]:
word1 = 'office'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

physician's office                           672
book,record,stationary,office sup.            94
book, record, stationary, office supplies     55
Name: premise, dtype: int64

In [176]:
word1 = 'book'
word2 = 'office'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

book,record,stationary,office sup.           94
book, record, stationary, office supplies    55
Name: premise, dtype: int64

In [177]:
change_premise(pat,df,'office_supplies')

In [178]:
word1 = 'physician'
word2 = 'office'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

physician's office    672
Name: premise, dtype: int64

In [179]:
change_premise(pat,df,'doctor_office')

In [180]:
df.premise.value_counts()[60:70]

laundry, dry cleaners, washaterias    767
factory/manufacturing/industrial      729
field/woods                           711
nursing home                          703
pawn/resale shop/flea market          700
libraries_museums                     624
check cashing places                  618
apartment laundry                     589
jewelry stores                        567
---                                   492
Name: premise, dtype: int64

In [181]:
word1 = 'field'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

field, woods, forest, park    1003
field/woods                    711
Name: premise, dtype: int64

In [182]:
change_premise(pat,df,'field_woods_park')

In [183]:
df.premise.value_counts()[:60]

house                                129717
apartment                            118085
apartment_parking                    111134
road_street_sidewalk                  81754
unknown                               59264
other_parking                         52748
driveway                              52736
department_store                      38154
restaurant_parking                    32469
commercial_parking                    24329
supermarket                           21104
business_center_parking               18597
gas_station                           17250
business                              15539
restaurant                            15306
building                              14689
hotel_motel_parking                   13399
convenience_store                     12957
supermarket_parking                   11894
bar_club_parking                      10099
bar_club                               9006
hotel_motel                            8780
convenience_store_parking       

In [184]:
word1 = 'cleaners'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

laundry/dry cleaners/washaterias      1057
laundry, dry cleaners, washaterias     767
Name: premise, dtype: int64

In [185]:
change_premise(pat,df,'dry_cleaners')

In [186]:
word1 = 'park'
word2 = 'ride'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

park and ride terminal    412
park & ride terminal      184
Name: premise, dtype: int64

In [187]:
change_premise(pat,df,'park_ride')

In [188]:
word1 = 'police'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

police station    351
Name: premise, dtype: int64

In [189]:
change_premise(pat,df,'police_station')

In [190]:
word1 = 'credit'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

credit union    79
Name: premise, dtype: int64

In [191]:
change_premise(pat,df,'bank')

In [192]:
word1 = 'package'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

package facility (fedex,ups,dhl)    25
Name: premise, dtype: int64

In [193]:
change_premise(pat,df,'package_facility')

In [194]:
word1 = 'barber'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

barber and beauty shops    2136
Name: premise, dtype: int64

In [195]:
change_premise(pat,df,'barber_shop')

In [196]:
word1 = 'pool'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

pool hall/game room       277
pool hall or game room    247
Name: premise, dtype: int64

In [197]:
change_premise(pat,df,'pool_hall')

In [198]:
word1 = 'video'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

video rental & sales    133
Name: premise, dtype: int64

In [199]:
change_premise(pat,df,'video_rental')

In [200]:
word1 = 'care'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

daycare/child care/kindergarten         376
daycare, child care, or kindergarten    300
Name: premise, dtype: int64

In [201]:
change_premise(pat,df,'daycare')

In [202]:
len(df.premise.value_counts())

122

In [203]:
df.premise.value_counts()[-30:]

amuse. park,bowl. alley,skate rink        150
office_supplies                           149
utility company, electric, gas, water     133
video_rental                              133
garden supply, nursery, florist           128
toys,arts & craft,musical,bike,pet        105
garden supply, nursery, or florist         99
jail/prison                                90
social services or public charities        78
lake, pond, waterway, bayou, river         73
tollway                                    63
railroad track or right of way             46
train terminal                             45
railroad track/right of way                45
lake/waterway/bayou                        39
not listed                                 35
train (not metro) terminal                 34
jail or prison                             33
package_facility                           25
n                                          18
illicit massage parlor or spa              12
marine veh. sales,boats,sailboats 

In [204]:
word1 = 'prison'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

jail/prison       90
jail or prison    33
Name: premise, dtype: int64

In [205]:
change_premise(pat,df,'jail_prison')

In [206]:
word1 = 'loan'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

savings and loan institutions    150
Name: premise, dtype: int64

In [207]:
change_premise(pat,df,'bank')

In [208]:
word1 = 'pawn'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

pawn, resale shop, or flea market    846
pawn/resale shop/flea market         700
Name: premise, dtype: int64

In [209]:
change_premise(pat,df,'pawn_shop')

In [210]:
word1 = 'mobile'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

mobile home    1053
Name: premise, dtype: int64

In [211]:
change_premise(pat,df,'mobile_home')

In [212]:
word1 = 'shop'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

body shop    410
Name: premise, dtype: int64

In [213]:
change_premise(pat,df,'body_shop')

In [214]:
df.premise.value_counts()[60:]

factory/manufacturing/industrial             729
nursing home                                 703
daycare                                      676
libraries_museums                            624
check cashing places                         618
park_ride                                    596
apartment laundry                            589
jewelry stores                               567
pool_hall                                    524
---                                          492
sporting goods/gun shops                     461
college                                      427
factory, manufacturing, or industrial        420
alley                                        411
body_shop                                    410
theatres, dinner theaters, auditoriums       386
convention_center                            351
police_station                               351
sporting goods or gun shops                  308
theatres,dinner theaters,auditor.            306
furniture, appliance

In [215]:
word1 = 'railroad'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

railroad track or right of way    46
railroad track/right of way       45
Name: premise, dtype: int64

In [216]:
change_premise(pat,df,'rail_road')

In [217]:
word1 = 'rise'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

high rise    184
Name: premise, dtype: int64

In [218]:
change_premise(pat,df,'high_rise')

In [219]:
word1 = 'check'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

check cashing places    618
Name: premise, dtype: int64

In [220]:
change_premise(pat,df,'check_cash')

In [221]:
word1 = 'center'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

rehabilitation center    250
Name: premise, dtype: int64

In [222]:
change_premise(pat,df,'rehab_center')

In [223]:
len(df.premise.value_counts())

118

In [224]:
df.head()

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
0,1914-09-08,7,24C60,Burglary,12700-12799,LAKE HOUSTON,restaurant,1,PKWY
1,1914-11-02,3,18F60,Burglary,8800-8899,BELLAIRE,business,1,BLVD
2,1914-12-03,19,12D20,Auto Theft,12800-12899,GULF,unknown,1,FWY
3,1915-01-05,22,3B10,Theft,3200-3299,MANGUM RD 180,other_parking,1,-
4,1915-01-14,23,5F10,Auto Theft,7000-7099,WESTVIEW,apartment_parking,1,DR


In [225]:
#df.to_csv(data_directory_saves+"crime_beats_02.csv")

In [226]:
len(df.premise.value_counts())

118

In [227]:
df.premise.value_counts()[:50]

house                        129717
apartment                    118085
apartment_parking            111134
road_street_sidewalk          81754
unknown                       59264
other_parking                 52748
driveway                      52736
department_store              38154
restaurant_parking            32469
commercial_parking            24329
supermarket                   21104
business_center_parking       18597
gas_station                   17250
business                      15539
restaurant                    15306
building                      14689
hotel_motel_parking           13399
convenience_store             12957
supermarket_parking           11894
bar_club_parking              10099
bar_club                       9006
hotel_motel                    8780
convenience_store_parking      8474
specialty_store                7757
vacant_structure               6851
garage_carport                 6758
park_rec_pool                  6359
school                      

In [228]:
df.premise.value_counts()[50:]

field_woods_park                             1714
sexually_oriented_business                   1647
pawn_shop                                    1546
highway_freeway                              1445
doctor_office                                1311
car wash                                     1274
liquor_store                                 1260
mobile_home                                  1053
freeway service road                          887
stadium_arena_track                           872
factory/manufacturing/industrial              729
nursing home                                  703
daycare                                       676
libraries_museums                             624
check_cash                                    618
park_ride                                     596
apartment laundry                             589
jewelry stores                                567
pool_hall                                     524
---                                           492


In [229]:
word1 = 'social'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

social services/public charities       167
social services or public charities     78
Name: premise, dtype: int64

In [230]:
change_premise(pat,df,'social_service')

In [231]:
word1 = 'skate'
word2 = 'park'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

amusement park, bowling alley, skate rink    253
amuse. park,bowl. alley,skate rink           150
Name: premise, dtype: int64

In [232]:
change_premise(pat,df,'bowling_skate')

In [233]:
word1 = 'car'
word2 = 'wash'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

car wash    1274
Name: premise, dtype: int64

In [234]:
change_premise(pat,df,'car_wash')

In [235]:
word1 = 'nursing'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

nursing home    703
Name: premise, dtype: int64

In [236]:
change_premise(pat,df,'nursing_home')

In [237]:
word1 = 'freeway'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

freeway service road    887
Name: premise, dtype: int64

In [238]:
change_premise(pat,df,'freeway_service_road')

In [239]:
word1 = 'fire'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

fire station    187
Name: premise, dtype: int64

In [240]:
change_premise(pat,df,'fire_station')

In [241]:
#theatres
word1 = 'theatres'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

theatres, dinner theaters, auditoriums    386
theatres,dinner theaters,auditor.         306
Name: premise, dtype: int64

In [242]:
change_premise(pat,df,'theatres_dinner')

In [243]:
#mall
word1 = 'mall'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

mall common area    2937
Name: premise, dtype: int64

In [244]:
change_premise(pat,df,'mall_area')

In [245]:
#mall
word1 = 'factory'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

factory/manufacturing/industrial         729
factory, manufacturing, or industrial    420
Name: premise, dtype: int64

In [246]:
change_premise(pat,df,'factory_industrial')

In [247]:
# jewelry
#mall
word1 = 'jewelry'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

jewelry stores    567
Name: premise, dtype: int64

In [248]:
change_premise(pat,df,'jewelry_store')

In [249]:
df.premise.value_counts()[-10:]

illicit massage parlor or spa             12
marine veh. sales,boats,sailboats         11
marine vehicle sales, boats, sailboats     8
06                                         8
adult theater/arcade                       5
contra-flow/hov                            5
cantina                                    4
modeling studio                            2
contra-flow, managed, or hov lanes         1
23c                                        1
Name: premise, dtype: int64

In [250]:
word1 = 'garden'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

garden supply, nursery, florist       128
garden supply, nursery, or florist     99
Name: premise, dtype: int64

In [251]:
change_premise(pat,df,'garden_florist')

In [252]:
word1 = 'utility'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

utility company,electric,gas,water       165
utility company, electric, gas, water    133
Name: premise, dtype: int64

In [253]:
change_premise(pat,df,'utility_company')

In [254]:
word1 = 'gun'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

sporting goods/gun shops       461
sporting goods or gun shops    308
Name: premise, dtype: int64

In [255]:
change_premise(pat,df,'gun_shop')

In [256]:
df.premise.value_counts()[-30:]

social_service                               245
garden_florist                               227
rail_vehicle                                 189
fire_station                                 187
high_rise                                    184
rail_platform                                182
toys, arts craft,musical, bike,pet stores    151
office_supplies                              149
video_rental                                 133
jail_prison                                  123
toys,arts & craft,musical,bike,pet           105
rail_road                                     91
lake, pond, waterway, bayou, river            73
tollway                                       63
train terminal                                45
lake/waterway/bayou                           39
not listed                                    35
train (not metro) terminal                    34
package_facility                              25
n                                             18
illicit massage parl

In [257]:
word1 = 'lake'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

lake, pond, waterway, bayou, river    73
lake/waterway/bayou                   39
Name: premise, dtype: int64

In [258]:
change_premise(pat,df,'lake_bayou_river')

In [259]:
change_premise(pat,df,'marine_veh')

In [260]:
rd1 = 'train'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

Series([], Name: premise, dtype: int64)

In [261]:
len(df.premise.unique())

110

In [262]:
df.premise = df.premise.str.strip()

In [263]:
len(df.premise.unique())

110

In [264]:
df.premise.value_counts()[:50]

house                        129717
apartment                    118085
apartment_parking            111134
road_street_sidewalk          81754
unknown                       59264
other_parking                 52748
driveway                      52736
department_store              38154
restaurant_parking            32469
commercial_parking            24329
supermarket                   21104
business_center_parking       18597
gas_station                   17250
business                      15539
restaurant                    15306
building                      14689
hotel_motel_parking           13399
convenience_store             12957
supermarket_parking           11894
bar_club_parking              10099
bar_club                       9006
hotel_motel                    8780
convenience_store_parking      8474
specialty_store                7757
vacant_structure               6851
garage_carport                 6758
park_rec_pool                  6359
school                      

In [265]:
df.premise.value_counts()[50:]

field_woods_park                             1714
sexually_oriented_business                   1647
pawn_shop                                    1546
highway_freeway                              1445
doctor_office                                1311
car_wash                                     1274
liquor_store                                 1260
factory_industrial                           1149
mobile_home                                  1053
freeway_service_road                          887
stadium_arena_track                           872
gun_shop                                      769
nursing_home                                  703
theatres_dinner                               692
daycare                                       676
libraries_museums                             624
check_cash                                    618
park_ride                                     596
apartment laundry                             589
jewelry_store                                 567


In [266]:
word1 = 'metro'
word2 = 'train'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

train (not metro) terminal    34
Name: premise, dtype: int64

In [267]:
change_premise(pat,df,'train_terminal')

In [268]:
word1 = 'train'
word2 = 'terminal'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

train terminal    45
Name: premise, dtype: int64

In [269]:
change_premise(pat,df,'train_terminal')

In [270]:
word1 = 'appliances'
word2 = 'radios'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

furniture, appliances, radios, tv    274
Name: premise, dtype: int64

In [271]:
change_premise(pat,df,'electronic_store')

In [272]:
word1 = 'airport'
word2 = 'terminal'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

airport terminal    3781
Name: premise, dtype: int64

In [273]:
change_premise(pat,df,'airport_terminal')

In [274]:
word1 = 'apartment'
word2 = 'laundry'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

apartment laundry    589
Name: premise, dtype: int64

In [275]:
change_premise(pat,df,'apartment_laundry')

In [276]:
df.premise.value_counts()[50:]

field_woods_park                             1714
sexually_oriented_business                   1647
pawn_shop                                    1546
highway_freeway                              1445
doctor_office                                1311
car_wash                                     1274
liquor_store                                 1260
factory_industrial                           1149
mobile_home                                  1053
freeway_service_road                          887
stadium_arena_track                           872
gun_shop                                      769
nursing_home                                  703
theatres_dinner                               692
daycare                                       676
libraries_museums                             624
check_cash                                    618
park_ride                                     596
apartment_laundry                             589
jewelry_store                                 567


In [277]:
# arts craft
word1 = 'arts'
word2 = 'craft'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

toys, arts craft,musical, bike,pet stores    151
toys,arts & craft,musical,bike,pet           105
Name: premise, dtype: int64

In [278]:
change_premise(pat,df,'arts_bike_pet_store')

In [279]:
# arts craft
word1 = 'not'
word2 = 'listed'

pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

not listed    35
Name: premise, dtype: int64

In [280]:
change_premise(pat,df,'unknown')

In [281]:
df.premise.value_counts()

house                                     129717
apartment                                 118085
apartment_parking                         111134
road_street_sidewalk                       81754
unknown                                    59299
other_parking                              52748
driveway                                   52736
department_store                           38154
restaurant_parking                         32469
commercial_parking                         24329
supermarket                                21104
business_center_parking                    18597
gas_station                                17250
business                                   15539
restaurant                                 15306
building                                   14689
hotel_motel_parking                        13399
convenience_store                          12957
supermarket_parking                        11894
bar_club_parking                           10099
bar_club            

In [282]:
word1 = 'contra'
word2 = 'hov'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

contra-flow/hov                       5
contra-flow, managed, or hov lanes    1
Name: premise, dtype: int64

In [283]:
change_premise(pat,df,'hov')

In [284]:
df.premise.value_counts()[:50]

house                        129717
apartment                    118085
apartment_parking            111134
road_street_sidewalk          81754
unknown                       59299
other_parking                 52748
driveway                      52736
department_store              38154
restaurant_parking            32469
commercial_parking            24329
supermarket                   21104
business_center_parking       18597
gas_station                   17250
business                      15539
restaurant                    15306
building                      14689
hotel_motel_parking           13399
convenience_store             12957
supermarket_parking           11894
bar_club_parking              10099
bar_club                       9006
hotel_motel                    8780
convenience_store_parking      8474
specialty_store                7757
vacant_structure               6851
garage_carport                 6758
park_rec_pool                  6359
school                      

In [285]:
df.premise.value_counts()[50:]

field_woods_park                          1714
sexually_oriented_business                1647
pawn_shop                                 1546
highway_freeway                           1445
doctor_office                             1311
car_wash                                  1274
liquor_store                              1260
factory_industrial                        1149
mobile_home                               1053
freeway_service_road                       887
stadium_arena_track                        872
gun_shop                                   769
nursing_home                               703
theatres_dinner                            692
daycare                                    676
libraries_museums                          624
check_cash                                 618
park_ride                                  596
apartment_laundry                          589
jewelry_store                              567
pool_hall                                  524
---          

## change odd values to unknown
- `---`
- `06`
- `n`

In [286]:
odd1 = df.premise =='---'

In [287]:
df[odd1][:10]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
177,2000-02-12,14,10H80,Theft,2400-2499,TIMES,---,1,BLVD
298270,2012-04-09,16,3B40,Theft,900-999,NORTH LP,---,1,-
398988,2013-01-16,17,13D10,Theft,6200-6299,SOUTH LP E,---,1,-
422334,2013-03-25,12,2A30,Auto Theft,300-399,NORTH LP W,---,1,-
424556,2013-04-01,8,24C40,Theft,22600-22699,LAKE HOUSTON,---,1,-
440317,2013-05-15,21,8C20,Theft,7500-7599,HOMESTEAD,---,1,RD
444886,2013-05-27,12,14D20,Theft,1200-1299,SCOTT,---,1,-
495242,2013-10-12,22,20G10,Burglary,9600-9699,CLAREWOOD,---,1,DR
501947,2013-10-31,0,1A50,Theft,2200-2299,WEST LP S,---,1,SER
504437,2013-11-06,18,3B10,Theft,9500-9599,HEMPSTEAD,---,1,HWY


In [288]:
df.premise.replace({'---':'unknown'}, inplace=True)

In [289]:
df[odd1][:10]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
177,2000-02-12,14,10H80,Theft,2400-2499,TIMES,unknown,1,BLVD
298270,2012-04-09,16,3B40,Theft,900-999,NORTH LP,unknown,1,-
398988,2013-01-16,17,13D10,Theft,6200-6299,SOUTH LP E,unknown,1,-
422334,2013-03-25,12,2A30,Auto Theft,300-399,NORTH LP W,unknown,1,-
424556,2013-04-01,8,24C40,Theft,22600-22699,LAKE HOUSTON,unknown,1,-
440317,2013-05-15,21,8C20,Theft,7500-7599,HOMESTEAD,unknown,1,RD
444886,2013-05-27,12,14D20,Theft,1200-1299,SCOTT,unknown,1,-
495242,2013-10-12,22,20G10,Burglary,9600-9699,CLAREWOOD,unknown,1,DR
501947,2013-10-31,0,1A50,Theft,2200-2299,WEST LP S,unknown,1,SER
504437,2013-11-06,18,3B10,Theft,9500-9599,HEMPSTEAD,unknown,1,HWY


In [290]:
df.premise.replace({'06':'unknown'}, inplace=True)

In [291]:
odd2 = df.premise =='n'

In [292]:
df[odd2]

Unnamed: 0,date,hour,beat,offense_type,block_range,street_name,premise,num_offenses,type
523829,2014-01-01,12,2A40,Theft,200-299,SABINE,n,1,-
533410,2014-01-27,8,2A10,Theft,1900-1999,MAURY,n,1,-
533950,2014-01-29,4,11H10,Burglary,7200-7299,HARRISBURG,n,1,BLVD
534558,2014-01-30,19,1A20,Theft,900-999,ALABAMA,n,1,ST
534951,2014-01-31,9,1A10,Burglary,1900-1999,MAIN,n,1,ST
538749,2014-02-11,16,19G30,Aggravated Assault,7800-7899,COOK,n,1,RD
539845,2014-02-14,15,19G10,Theft,9500-9599,SOUTHWEST,n,1,FWY SER
542452,2014-02-22,21,10H30,Theft,1000-1099,ST EMANUEL,n,1,-
547731,2014-03-11,0,10H30,Theft,1700-1799,SAMPSON,n,1,-
551143,2014-03-21,8,8C60,Auto Theft,6100-6199,ANNUNCIATION,n,1,ST


In [293]:
df.premise.replace({'n':'unknown'}, inplace=True)

## finalize

In [294]:
df.premise.value_counts()[-30:]

alley                                     411
body_shop                                 410
bowling_skate                             403
police_station                            351
convention_center                         351
utility_company                           298
arts_bike_pet_store                       256
rehab_center                              250
social_service                            245
garden_florist                            227
rail_vehicle                              189
fire_station                              187
high_rise                                 184
rail_platform                             182
office_supplies                           149
video_rental                              133
jail_prison                               123
lake_bayou_river                          112
rail_road                                  91
train_terminal                             79
tollway                                    63
package_facility                  

In [295]:
word1 = 'adult'
word2 = 'theater'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

adult theater/arcade    5
Name: premise, dtype: int64

In [296]:
change_premise(pat,df,'sexually_oriented_business')

In [297]:
word1 = 'modeling'
word2 = 'studio'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

modeling studio    2
Name: premise, dtype: int64

In [298]:
change_premise(pat,df,'sexually_oriented_business')

In [299]:
word1 = 'marine'
word2 = 'boats'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

marine veh. sales,boats,sailboats         11
marine vehicle sales, boats, sailboats     8
Name: premise, dtype: int64

In [300]:
change_premise(pat,df,'marine_veh')

In [301]:
#mall
word1 = 'cantina'
pat = r"^(?=.*\b{}\b).*$".format(word1)
clean_premise(pat,df)

cantina    4
Name: premise, dtype: int64

In [302]:
change_premise(pat,df,'bar_club')

In [303]:
len(df.premise.value_counts())

98

In [304]:
df.premise.value_counts()[:40]

house                        129717
apartment                    118085
apartment_parking            111134
road_street_sidewalk          81754
unknown                       59817
other_parking                 52748
driveway                      52736
department_store              38154
restaurant_parking            32469
commercial_parking            24329
supermarket                   21104
business_center_parking       18597
gas_station                   17250
business                      15539
restaurant                    15306
building                      14689
hotel_motel_parking           13399
convenience_store             12957
supermarket_parking           11894
bar_club_parking              10099
bar_club                       9010
hotel_motel                    8780
convenience_store_parking      8474
specialty_store                7757
vacant_structure               6851
garage_carport                 6758
park_rec_pool                  6359
school                      

In [305]:
word1 = 'massage'
word2 = 'parlor'
pat = r"^(?=.*\b{}\b)(?=.*\b{}\b).*$".format(word1,word2)
clean_premise(pat,df)

illicit massage parlor or spa    12
Name: premise, dtype: int64

In [306]:
change_premise(pat,df,'sexually_oriented_business')

In [307]:
len(df.premise.value_counts())

97

In [308]:
df.premise.replace({'23c':'gas_station'}, inplace=True)

In [309]:
df.premise.value_counts()[:40]

house                        129717
apartment                    118085
apartment_parking            111134
road_street_sidewalk          81754
unknown                       59817
other_parking                 52748
driveway                      52736
department_store              38154
restaurant_parking            32469
commercial_parking            24329
supermarket                   21104
business_center_parking       18597
gas_station                   17251
business                      15539
restaurant                    15306
building                      14689
hotel_motel_parking           13399
convenience_store             12957
supermarket_parking           11894
bar_club_parking              10099
bar_club                       9010
hotel_motel                    8780
convenience_store_parking      8474
specialty_store                7757
vacant_structure               6851
garage_carport                 6758
park_rec_pool                  6359
school                      

In [310]:
df.premise.value_counts()[40:]

mall_area                     2937
church_temple_parking         2908
auto_repair                   2816
warehouse                     2503
barber_shop                   2136
hospital_parking              2071
rental_office                 2000
bank_parking                  1914
dry_cleaners                  1824
bank                          1740
field_woods_park              1714
sexually_oriented_business    1666
pawn_shop                     1546
highway_freeway               1445
doctor_office                 1311
car_wash                      1274
liquor_store                  1260
factory_industrial            1149
mobile_home                   1053
freeway_service_road           887
stadium_arena_track            872
gun_shop                       769
nursing_home                   703
theatres_dinner                692
daycare                        676
libraries_museums              624
check_cash                     618
park_ride                      596
apartment_laundry   

# save data

In [311]:
df.to_csv(data_directory_saves+"crime_clean_02.csv")