In [1]:
import re
import pandas as pd
pd.options.mode.chained_assignment = None
import nmslib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from fillpdf import fillpdfs
import urllib.request

In [29]:
# Downloads and the CSV below are written relative to the working directory.
# os.path.abspath('') resolves to the current working directory, so this call
# does not move anywhere; it is kept so the intent stays explicit.
os.chdir(os.path.abspath(''))

path = ["https://www.nj.gov/dca/divisions/codes/forms/pdf_ucc_stdforms/ucc_f110_bldg.pdf",
        "https://www.nj.gov/dca/divisions/codes/forms/pdf_ucc_stdforms/ucc_f120_elec.pdf",
        "https://www.nj.gov/dca/divisions/codes/forms/pdf_ucc_stdforms/ucc_f140_fire_prot.pdf",
        "https://www.nj.gov/dca/divisions/codes/forms/pdf_ucc_stdforms/ucc_f130_plumb.pdf"]

# Download every form and collect its fillable fields as long-format rows
# (one row per form/field pair). Frames are gathered in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0.
form_frames = []
for url in path:

    # File name is the last "<word>.pdf" token of the URL
    name = re.search(r'\w+\.pdf', url).group(0)

    # Save locally (into the working directory)
    urllib.request.urlretrieve(url, name)

    # Mapping of field name -> current value for every fillable field
    form_fields = fillpdfs.get_form_fields(name)

    # One-row wide frame -> long frame with columns field_name / value
    df_temp = pd.melt(pd.DataFrame(form_fields, index=[0]), var_name='field_name')

    # Put the owning form first: form_name, field_name, value
    df_temp.insert(0, 'form_name', name)

    form_frames.append(df_temp)

df = pd.concat(form_frames)


# Subset to only fields that appear in at least 2 forms
forms_per_field = df.groupby('field_name')['form_name'].count()
df_sub = forms_per_field[forms_per_field > 1].reset_index()[['field_name']]

# Remove '0', 'Qty-Other' and 'Qty-Dishwasher' from the list of fields,
# then add an empty "value" column for the user to fill in.
df_sub = (
    df_sub[~df_sub['field_name'].isin(['0', 'Qty-Other', 'Qty-Dishwasher'])]
    .assign(value="")
)

# Write the template CSV the user completes by hand
df_sub.to_csv('fields_to_fill.csv', index=False)


In [None]:
# Read the user-completed CSV.
# - dtype=str keeps entries such as ZIP codes or phone numbers as typed
#   (default inference would turn them into floats like 8540.0 because the
#   column also contains blanks).
# - keep_default_na=False with na_values=[''] means only truly empty cells
#   count as missing, so a typed "N/A" or "None" is still written to the form.
df_filled = pd.read_csv('fields_to_fill.csv', dtype=str,
                        keep_default_na=False, na_values=[''])

# Fields the user left blank are not written to any form
df_answers = df_filled.dropna(subset=['field_name', 'value'])

for url in path:
    # Get name of pdf out of url
    name = re.search(r'\w+\.pdf', url).group(0)

    # Field names that exist in this form (the scraped values are not needed)
    df_form = df.loc[df['form_name'] == name, ['field_name']]

    # Keep only this form's fields for which the user supplied a value
    df_form = pd.merge(df_form, df_answers, how='inner', on='field_name')

    # field name -> value to write
    dict_form = dict(zip(df_form.field_name, df_form.value))

    # Output file sits next to the input with a "_filled" suffix
    name_filled = name.replace('.pdf', '_filled.pdf')

    # fill pdf
    fillpdfs.write_fillable_pdf(name, name_filled, dict_form)
    



In [None]:
forms = ['F100', 'F101-CUPW', 'F101-HECC', 'F101-LEAD', 'F102', 'F110']

# Collect the fillable fields of every local form as long-format rows.
# Frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0.
form_frames = []
for form in forms:
    form_path = "static/construction-form-pdfs/" + form + '.pdf'

    # Mapping of field name -> current value for every fillable field
    form_fields = fillpdfs.get_form_fields(form_path)

    # One-row wide frame -> long frame with columns field_name / value
    df_temp = pd.melt(pd.DataFrame(form_fields, index=[0]), var_name='field_name')

    # Put the owning form first: form_name, field_name, value
    df_temp.insert(0, 'form_name', form)

    # Position of the field within its own form
    df_temp['form_index'] = df_temp.index

    form_frames.append(df_temp)

df_fields = pd.concat(form_frames)

# One row per field name:
#   form_name       - list of forms containing the field
#   form_name_count - number of forms containing the field
#   form_index      - sum of the field's positions across those forms
df_summary = df_fields.groupby('field_name').agg(
    form_name=('form_name', list),
    form_name_count=('form_name', 'count'),
    form_index=('form_index', 'sum'),
)

# Lists cannot be used as sort keys, so sort on their string representation.
# Order: form_name_count descending, form list (as string) descending,
# form_index ascending.
df = (
    df_summary
    .assign(form_name_string=df_summary['form_name'].astype(str))
    .sort_values(by=['form_name_count', 'form_name_string', 'form_index'],
                 ascending=[False, False, True])
    .drop(columns=['form_name_string'])
)

# Expose the field name (currently the index) as a regular column so it is
# included in each JSON record
df['field_name'] = df.index

# Create JSON of data
df_json = df.to_json(orient='records')
print(df_json)

In [7]:
# Submitted answers keyed by "<comma-separated form codes> | <field name>"
data = {
    'F110,F120,F130 | Block': '',
    'F110,F120,F130 | Work Site2': '',
    'F110,F120,F130 | Owner in Fee': '',
    'F110,F120,F130 | Owner Tel': '',
    'F110,F120,F130 | Owner eMail': 'Dave',
    'F110,F120,F130 | Owner Address': 'Dave',
    'F110,F120,F130 | Owner Muni': '',
    'F110,F120,F130 | Owner Zip': '',
    'F110,F120,F130 | Contractor Name': '',
    'F110,F120,F130 | Contractor Tel': '',
    'F110,F120,F130 | Contractor Address': '',
    'F110,F120,F130 | Contractor eMail': '',
    'F110,F120,F130 | Contractor Address2': '',
    'F110,F120,F130 | License or Bldr Reg': '',
    'F110,F120,F130 | 0': '',
    'F110,F120,F130 | Expiration': '',
    'F110,F120,F130 | HIC Reg or Exempt': '',
    'F110,F120,F130 | FEID': '',
    'F110,F120,F130 | Contractor Fax': '',
    'F110,F120,F130 | Use Grp -Pres': '',
    'F110,F120,F130 | Use Grp -Prop': '',
    'F120,F130 | Qualif': '',
    'F120,F130 | Worksite': '',
    'F120,F130 | Qty -Dishwasher': '',
    'F110,F120 | Contractor Print Name': '',
    'F110,F120 | Other': '',
    'F110,F120 | Other Descrip': '',
    'F130 | Sewer Sz': '',
    'F130 | Public Sewer': '',
    'F130 | Private Septic': '',
    'F130 | Sevice Sz': '',
    'F130 | Pub Water': '',
    'F130 | Priv Well': '',
    'F130 | Est Cost of Plumb Wk': '',
    'F130 | Print Name Here': '',
    'F130 | Licensed': '',
    'F130 | Exempt': '',
}

# One row per entry; the combined key lands in field_name for now
df = pd.DataFrame(list(data.items()), columns=['field_name', 'value'])

# Split each combined key once: the part before ' | ' is the form list,
# the part after it is the bare field name
key_parts = [combined.split(' | ') for combined in df['field_name']]
df['form_name'] = [parts[0] for parts in key_parts]
df['field_name'] = [parts[1] for parts in key_parts]

print(df)

               field_name value       form_name
0                   Block        F110,F120,F130
1              Work Site2        F110,F120,F130
2            Owner in Fee        F110,F120,F130
3               Owner Tel        F110,F120,F130
4             Owner eMail  Dave  F110,F120,F130
5           Owner Address  Dave  F110,F120,F130
6              Owner Muni        F110,F120,F130
7               Owner Zip        F110,F120,F130
8         Contractor Name        F110,F120,F130
9          Contractor Tel        F110,F120,F130
10     Contractor Address        F110,F120,F130
11       Contractor eMail        F110,F120,F130
12    Contractor Address2        F110,F120,F130
13    License or Bldr Reg        F110,F120,F130
14                      0        F110,F120,F130
15             Expiration        F110,F120,F130
16      HIC Reg or Exempt        F110,F120,F130
17                   FEID        F110,F120,F130
18         Contractor Fax        F110,F120,F130
19          Use Grp -Pres        F110,F1

In [14]:
#Get unique forms
forms = df.form_name.unique()
forms = [x.split(',') for x in forms]
forms = [item for sublist in forms for item in sublist]
forms = list(set(forms))


['F120', 'F130', 'F110']


In [20]:
folder = 'static/construction-form-pdfs/'
filled_folder = 'static/filled_construction_form_pdfs/'

# Make sure the output folder exists before writing any filled form into it
os.makedirs(filled_folder, exist_ok=True)

for form in forms:
    # Keep the rows whose comma-separated form list contains exactly this form.
    # An exact membership test is used instead of str.contains, which does a
    # regex substring match and would also select rows of any form whose code
    # merely contains this one.
    applies_to_form = df['form_name'].map(lambda joined: form in joined.split(','))
    df_form = df[applies_to_form]

    # field name -> value to write
    dict_form = dict(zip(df_form.field_name, df_form.value))

    name = form + '.pdf'
    name_filled = name.replace('.pdf', '_filled.pdf')

    # fill pdf
    fillpdfs.write_fillable_pdf(folder + name, filled_folder + name_filled, dict_form)