Maryland Department of Planning
Property Map Products
https://planning.maryland.gov/Pages/OurProducts/PropertyMapProducts/FinderProduct.aspx
>> Open Data GIS Downloads
MdProperty View
Dorchester
https://planning.maryland.gov/Pages/OurProducts/DownloadFiles.aspx

In [1]:
# Install third-party packages into the notebook runtime.  simpledbf is
# imported in a later cell; leven and ngram are presumably needed by the
# text_cluster helper module -- confirm against text_cluster.py.
!pip install leven
!pip install simpledbf
!pip install ngram

Collecting leven
  Downloading https://files.pythonhosted.org/packages/73/02/37084115516cfd595ee2f9a873fffe8b85c6b1538523ff6a8b8dd7ff7d46/leven-1.0.4.tar.gz
Collecting nose
[?25l  Downloading https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl (154kB)
[K     |████████████████████████████████| 163kB 5.4MB/s 
[?25hBuilding wheels for collected packages: leven
  Building wheel for leven (setup.py) ... [?25l[?25hdone
  Created wheel for leven: filename=leven-1.0.4-cp37-cp37m-linux_x86_64.whl size=55469 sha256=c12cc0b9256a284e2acb179b302abee7da3c6c78cc9ee4795b4689f0c1de9013
  Stored in directory: /root/.cache/pip/wheels/54/64/a5/439db671d666a50f3b3cebd2dcab3fbbab02785adf58e47552
Successfully built leven
Installing collected packages: nose, leven
Successfully installed leven-1.0.4 nose-1.3.7
Collecting simpledbf
  Downloading https://files.pythonhosted.org/packages/8a/d3/e4c25cd8f739dd7ddd19c255cd5552e08c

In [2]:
import pandas as pd
from simpledbf import Dbf5
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive.  Later cells read and write files via
# relative 'drive/My Drive/...' paths (assumes the working directory is
# /content -- confirm).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Load the sdat property dataset

This data is used to backfill missing tax account IDs in the code-violation records.

In [6]:
# Load the SDAT property reference table used to backfill missing tax ids.
# low_memory=False makes pandas read the whole file before inferring dtypes,
# so every column gets one consistent dtype instead of the chunk-by-chunk
# inference that triggers the mixed-type DtypeWarning on large CSV files.
df = pd.read_csv('drive/My Drive/pita 2021/SDAT-CAN-ref-202104.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


#### Format violations list dates and violations lists
pandas `DataFrame.explode()` is used to make a separate record for each individual violation in a row, but we won't do that until needed, to avoid memory pressure when clustering.

In [7]:
# Copy the text_cluster helper module from Drive into the current working
# directory so that the imports below can find it.
!cp "drive/My Drive/pita 2020/text_cluster.py" .
from text_cluster import assign_clusters
from text_cluster import text_similarity
from text_cluster import cluster_strings

# Read the code-violation export and give the open-date column an
# identifier-safe name (it is referenced inside query strings later on),
# then parse every value into a pandas timestamp, one element at a time.
violations = pd.read_csv("drive/My Drive/pita 2020/codeviolation_20191208.csv").rename(
    columns={'Open Date': 'Open_Date'}
)
violations['Open_Date'] = violations['Open_Date'].map(pd.to_datetime)

In [8]:
import re

def format_contact(s):
    """Split a raw ``Contacts`` cell into an owner name and a mailing address.

    The punctuation characters ``, . ; : -`` are removed and the text is split
    on newlines.  Only records whose first line contains ``"Property Owner"``
    are parsed further:

    * three or more lines -- line 2 is the name and line 3 the address; a
      trailing ``"XX 12345"`` state/ZIP suffix is removed from the address,
      apostrophes are dropped and both values are upper-cased;
    * two lines -- when line 2 contains a standalone number, it is upper-cased
      and split at the first token containing a digit: tokens before it form
      the name, tokens from it up to (but excluding) the last token form the
      address;
    * anything else (including a lone ``"Property Owner"`` line) -- the first
      line is returned as the name with an empty address.

    Records that are not strings (e.g. NaN from a missing cell) or whose first
    line lacks ``"Property Owner"`` are reported with a ``BAD RECORD`` print.

    Args:
        s: Raw multi-line contact text.

    Returns:
        ``pd.Series`` indexed by ``'Contact_Name'`` and ``'Contact_Address'``.
    """
    labels = ['Contact_Name', 'Contact_Address']

    # A missing cell reaches us as NaN (a float); re.sub would raise TypeError.
    if not isinstance(s, str):
        print("BAD RECORD ", s)
        return pd.Series(["", ""], index=labels)

    cinfo = re.sub(r'[,.;:-]', '', s).split("\n")
    name = cinfo[0]
    address = ""

    if "Property Owner" not in cinfo[0]:
        print("BAD RECORD ", s)
    elif len(cinfo) > 2:
        name = cinfo[1].replace("'", "").upper()
        address = re.sub(r',* *[A-Z][A-Z] \d\d\d\d\d\'*$', "", cinfo[2]).replace("'", "").upper()
    elif len(cinfo) == 2 and re.search(r' \d+ ', cinfo[1]):
        # Name and address share one line: split at the first numeric token.
        tokens = cinfo[1].upper().split(" ")
        first_number = next(
            (i for i, token in enumerate(tokens) if re.search(r'\d', token)),
            None,
        )
        if first_number is not None:
            name = " ".join(tokens[:first_number])
            # The final token is deliberately left out, as in the original.
            address = " ".join(tokens[first_number:-1])
    # A single-line "Property Owner" record falls through with the defaults
    # instead of raising IndexError on cinfo[1].

    return pd.Series([name, address], index=labels)


# Derive the contact name / contact address columns from the raw Contacts text.
violations[['Contact_Name','Contact_Address']] = violations['Contacts'].apply(format_contact)

# Build one upper-cased street address per record from its four street
# fields; missing values are replaced by empty strings before joining.
violations['address'] = violations.fillna("").apply(
    lambda row: " ".join(
        str(row[part])
        for part in ('Street Number', 'Street Direction', 'Street Name', 'Street Type')
    ).upper(),
    axis=1,
)

# Strip zero-width spaces from both ends of the APN, then collapse runs of
# spaces in the address and trim it.
violations['APN'] = violations['APN'].map(lambda apn: apn.strip(u'\u200b'))
violations['address'] = violations['address'].map(lambda text: re.sub(r' +', ' ', text).strip())

#### fixup account ids

In [9]:
# Addresses of the violation records whose APN is blank.
noacctid = violations.query('APN == ""',engine='python').address.array
# Look those addresses up in the SDAT table: address -> acctid.
found_acctids = {v[0]:v[1] for v in df.query('address in @noacctid')[['address','acctid']].values}
# Store the (possibly backfilled) id as a real column.  Attribute assignment
# (`violations.acctid = ...`) cannot create a new DataFrame column: pandas
# only sets a Python attribute and emits a UserWarning, and that attribute is
# lost as soon as `violations` is rebound by the filter below.
violations['acctid'] = violations.apply(lambda x: found_acctids.get(x.address,x.APN),axis=1)
violations.drop_duplicates(inplace=True)
# Keep only records whose APN contains at least one digit.  The explicit copy
# makes the result an independent frame, so later column assignments do not
# raise SettingWithCopyWarning.
# NOTE(review): this filter (and the later groupby) still keys on APN, so rows
# with a blank APN are dropped even when `acctid` was backfilled -- confirm
# whether `acctid` should be used here instead.
violations = violations.query('APN.str.contains("[0-9]+")',engine='python').copy()

  This is separate from the ipykernel package so we can avoid doing imports until


#### Summarize violations of all types.
Split the multi-line violation descriptions, and total the number of individual issues per tax account id.

In [10]:
# One list of individual violation lines per record ...
violations['vtypes'] = violations['Violation(s)'].map(lambda text: text.split('\n'))
# ... expanded to one row per individual violation line.
v_flat = violations.explode('vtypes')

# Count the exploded rows per (APN, full violation text), then add those
# counts up per APN to get the total number of individual issues.
vsummary = (
    v_flat.groupby('APN')['Violation(s)']
    .value_counts()
    .reset_index(name='vcount')
)
vs2 = (
    vsummary.groupby(['APN'])['vcount']
    .sum()
    .reset_index(name='violations_total')
)
vs2


Unnamed: 0,APN,violations_total
0,1007104057,1
1,1007104073,1
2,1007104138,3
3,1007104278,7
4,1007104316,1
...,...,...
1533,1007229348,7
1534,1007230230,1
1535,1007230664,1
1536,1007232438,1


#### Cluster the violations
Assign clusters by contact name and contact address, split from the Contact column.

In [11]:
%%time
# Only violations opened after 2018-12-31 are fed to the clustering step.
myquery = violations.query('(Open_Date > "2018-12-31")',engine='python')
# cluster_strings is imported from the text_cluster helper module.  Its result
# is consumed later through .get(value, -1), so it is presumably a dict-like
# mapping from each input string to a cluster id -- confirm against
# text_cluster.py.  Names and addresses are clustered independently with the
# same metric and threshold.
names = cluster_strings(myquery.Contact_Name.array,'c_contact',metric='jaccard',threshold=0.1)
addresses = cluster_strings(myquery.Contact_Address.array,'c_address',metric='jaccard',threshold=0.1)

rows: 909 clusters: 158
rows: 909 clusters: 154
CPU times: user 2min 32s, sys: 11.1 s, total: 2min 43s
Wall time: 2min 22s


In [12]:
# Attach the name and address cluster ids to every violation record; values
# that are absent from the cluster lookups get -1.
violations['contact_cluster'] = violations['Contact_Name'].map(
    lambda contact: names.get(contact, -1)
)
violations['contactadd_cluster'] = violations['Contact_Address'].map(
    lambda contact_address: addresses.get(contact_address, -1)
)

#### At this point we have clusters for all the code violations

Some tax accounts are associated with more than one cluster.  This can be due to entry errors in the complaint data, or changes in ownership over the complaint period. The default behavior, used as the fixed behavior here, is to return the least ambiguous dominant cluster id.  In practice this means that if the most often cited cluster is -1 (no cluster), but the second most frequent citation in the period is a valid cluster, that valid cluster is assigned to the tax id.  If more than one valid cluster is cited, the most frequently cited is used.

Some of the violations have no tax acctid, in some cases because of address spelling issues.  Some of these can be backfilled.  The ones we can fix, we do.  Many of the others are just vacant lots, which will drop out on the join with properties licensed for rental.

In [13]:
def best_cluster(cnumbers):
    """Pick the dominant cluster id from a frequency table.

    ``cnumbers`` is expected to be indexed by cluster id with the most
    frequent id first (callers pass a ``value_counts()`` result).  The first
    id is returned, unless it is negative (the "no cluster" marker) and a
    second entry exists, in which case the second id is returned.
    """
    candidates = cnumbers.index
    if len(cnumbers) <= 1 or not candidates[0] < 0:
        return candidates[0]
    return candidates[1]
    
# For every APN pick the dominant address cluster and the dominant name
# cluster, then line the two results up on their shared APN index.
vc_df = (
    violations.groupby(['APN'])['contactadd_cluster']
    .agg(lambda ids: best_cluster(ids.value_counts()))
    .to_frame()
)
cadd_c = (
    violations.groupby(['APN'])['contact_cluster']
    .agg(lambda ids: best_cluster(ids.value_counts()))
)
vc_df = vc_df.merge(cadd_c, left_index=True, right_index=True)

# Attach the per-APN violation totals; only APNs present in both tables remain.
vc_df = vc_df.merge(vs2, how='inner', on='APN')
vc_df

Unnamed: 0,APN,contactadd_cluster,contact_cluster,violations_total
0,1007104057,-1,-1,1
1,1007104073,234,95,1
2,1007104138,47,41,3
3,1007104278,31,26,7
4,1007104316,-1,-1,1
...,...,...,...,...
1533,1007229348,137,-1,7
1534,1007230230,-1,85,1
1535,1007230664,-1,-1,1
1536,1007232438,-1,-1,1


### Save result as violation_clusters.csv


In [None]:
# Write the per-APN cluster summary to Drive.  to_csv's default index=True
# also writes the DataFrame index as the first, unnamed column.
vc_df.to_csv('drive/My Drive/pita 2020/violation_clusters.csv')

In [None]:
# Sample of the violations opened after 2018-12-31, reduced to the id,
# address and cluster columns, with the APN exposed as a string `acctid`.
vsample = violations.query('(Open_Date > "2018-12-31")')[
    ['APN','address','contact_cluster','Contact_Name','contactadd_cluster','Contact_Address']
]
vsample['APN'] = vsample['APN'].astype(str)
vsample = vsample.rename(columns={'APN':'acctid'})

# Strip zero-width spaces from both ends of the id and normalise whitespace
# in the address.
vsample['acctid'] = vsample['acctid'].map(lambda acct: acct.strip(u'\u200b'))
vsample['address'] = vsample['address'].map(lambda text: re.sub(r' +',' ',text).strip())

# Backfill ids from the SDAT table for the addresses whose acctid is blank.
noacctid = vsample.query('acctid == ""',engine='python').address.array
found_acctids = {
    row[0]:row[1]
    for row in df.query('address in @noacctid')[['address','acctid']].values
}
vsample['acctid'] = vsample.apply(
    lambda rec: found_acctids.get(rec.address,rec.acctid),
    axis=1,
)

# Display the addresses that still have no account id after the backfill.
vsample.query('acctid == ""',engine='python').address.array

<PandasArray>
[]
Length: 0, dtype: object