# Data Cleaning for Government Land Sale Notices

In [82]:
# Import modules and packages
import re
import string
import warnings
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
# Let pandas show up to 200 characters per cell so long sentences are not
# truncated early when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 200)
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

## Read in Text

In [85]:
# load data
filename = 'TPTL 234.txt'
# A context manager guarantees the handle is closed even if read() raises.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm this matches how the text file was produced.
with open(filename, 'rt') as file:
    text = file.read()

from nltk.tokenize import word_tokenize
def clean_text(text):
    """Return the cleaned word tokens of *text* as a list.

    The text is tokenized with ``word_tokenize``; each token is lower-cased
    and stripped of every character in ``string.punctuation``.  A token is
    kept only if what remains is purely alphabetic and is not an NLTK
    English stop word.  Token order is preserved.
    """
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))
    punctuation_remover = str.maketrans('', '', string.punctuation)

    kept = []
    for raw_token in raw_tokens:
        candidate = raw_token.lower().translate(punctuation_remover)
        # isalpha() is False for the empty string, so tokens that were pure
        # punctuation are discarded here together with numbers.
        if candidate.isalpha() and candidate not in english_stops:
            kept.append(candidate)
    return kept

# Print the cleaned token list that clean_text produces for the full document text.
print(clean_text(text))



In [86]:
# Break the raw document text into sentences with NLTK's sentence tokenizer.
from nltk import sent_tokenize
sentences = sent_tokenize(text)

# One row per sentence, held in a single 'text' column.
df = pd.DataFrame({'text': sentences})

@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    return frozenset(stopwords.words('english'))


def clean_text(text, stop_words=None):
    """Return *text* lower-cased, stripped of punctuation and stop words.

    NOTE: this definition reuses the name of the token-list ``clean_text``
    defined earlier in the file and replaces it from this point on.

    Every character other than an ASCII letter, digit or underscore is
    replaced by a space, the result is split on whitespace, and words found
    in *stop_words* are dropped.  The surviving words are joined with single
    spaces, so the result has no leading or trailing whitespace.

    Args:
        text: The sentence to clean.
        stop_words: Optional container of lower-case words to remove.  When
            omitted, NLTK's English stop-word list is used.  Previously the
            function read a module-level ``stop_words`` that is not defined
            anywhere in this file, which raised ``NameError``.
            NOTE(review): the NLTK English list is assumed to be what the
            missing global held -- confirm.

    Returns:
        The cleaned sentence as a single string (possibly empty).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    sent = re.sub("[^a-zA-Z_0-9]", " ", text.lower())
    return " ".join(w for w in sent.split() if w not in stop_words)

# Clean every sentence and store the result next to the raw text.
cleaned_text = [clean_text(sentence) for sentence in df['text']]
df['cleaned_text'] = cleaned_text

# Save the sentence table to df_sent.csv.
df.to_csv("df_sent.csv")

df

Unnamed: 0,text,cleaned_text
0,Information Statement \n\nTai Po Town Lot No.,information statement tai po town lot
1,234 \n\nThis Information Statement is issued only for the information of prospective \npurchasers of Tai Po Town Lot No.,234 information statement issued information prospective purchasers tai po town lot
2,"234 (hereinafter referred to as ""the Lot"").",234 hereinafter referred lot
3,"It shall not form \npart of the Tender Notice, the Form of Tender and the Conditions of Sale by Public Tender for \nthe Lot (hereinafter referred to as ""the Conditions of Sale"") nor shall it be ta...",shall form part tender notice form tender conditions sale public tender lot hereinafter referred conditions sale shall taken consideration interpretation construction conditions sale
4,"Nothing in this Information \nStatement should be relied on as any representation by the Government of the Hong Kong Special \nAdministrative Region (hereinafter referred to as ""the Government"").",nothing information statement relied representation government hong kong special administrative region hereinafter referred government
...,...,...
656,Rent Amount of premium \nat which purchased \n\nTai Po Town Lot No.,rent amount premium purchased tai po town lot
657,234 As specified in General Condition No.,234 specified general condition
658,"4 $ \n\nDated this day of \n\nWitness to the signature of/execution by \nthe Purchaser: \n\nAddress \n\nWitness to the signature of \nChief Estate Surveyor/Land Supply: \n\nCivil servant, \nLands ...",4 dated day witness signature execution purchaser address witness signature chief estate surveyor land supply civil servant lands department 20 signature purchaser execution purchaser case limited...
659,AGREEMENT \n\nAND \n\nCONDITIONS OF SALE \n\nOF \n\nTAI PO TOWN LOT NO.,agreement conditions sale tai po town lot


In [87]:
# Load manually added summary to target sentences
df_sum = pd.read_csv("TPTL 234 summary.csv")

# Treat empty summaries as missing.  The result is assigned back rather than
# calling replace(..., inplace=True) on the selected column: that chained
# in-place call acts on a temporary object under pandas Copy-on-Write and
# would leave df_sum unchanged.
df_sum['summary'] = df_sum['summary'].replace('', np.nan)
# Drop every row that has a missing value in any column, which removes the
# sentences that were given no summary.
df_sum.dropna(axis=0, inplace=True)
# Wrap each summary in explicit start/end marker tokens.
df_sum['summary'] = df_sum['summary'].apply(lambda x: '_START_ ' + x + ' _END_')
df_sum.reset_index(drop=True, inplace=True)
# Keep only the two columns used downstream.
df_sum = df_sum[['cleaned_text', 'summary']]
df_sum

Unnamed: 0,cleaned_text,summary
0,234 particulars lot location tai po road tai po kau tai po new territories site delineated shown coloured pink pink hatched black pink hatched black stippled black pink stippled black plan annexed...,_START_ location tai po road tai po kau tai po new territories site area 14 002 square metres _END_
1,c purchaser shall expense uphold manage maintain repair existing buildings structures lifetime thereof respects satisfaction director,_START_ to maintain and repair existing buildings _END_
2,event purchaser decides demolish remove existing buildings structures part parts thereof purchaser shall bear costs expenses arising connection said demolition removal shall carry demolition remov...,_START_ demolish and remove existing buildings _END_
3,3 purchaser acknowledges date agreement protrusions structures including limited pipes metal supporting racks air conditioning units electricity meter boxes projecting buildings erected pieces par...,_START_ acknowledge projecting structures _END_
4,4 hereby excepted reserved government following strata land airspace hereinafter collectively referred reserved areas purchaser shall right title ownership possession use save except provided cond...,_START_ excepted and reserved reserved areas edged red _END_
5,7 metres hkpd area shown edged green plan annexed hereto iii stratum land airspace level 93 9 metres hkpd area shown edged purple plan annexed hereto,_START_ edged green edged purple _END_
6,purchaser shall possession orange area part thereof expense clean manage maintain repair orange area part thereof possession respects satisfaction director time possession whole orange area delive...,_START_ orange area re-delivered to the government on demand _END_
7,6 purchaser shall 30111 day september 2028 date may approved director purchaser expense manner materials standards levels alignment design director shall approve respects satisfaction director lay...,"_START_ lay and form future public road and the structures on the green area by 30 september 2028, re-delivered up to the government on demand _END_"
8,11 lot part thereof building part building erected erected thereon shall used purpose private residential purposes,_START_ private residential _END_
9,7 hereof lot paii thereof building buildings erected erected lot shall respects comply buildings ordinance compliance town planning ordinance total gross floor area design disposition sustainable ...,_START_ total gross floor area 21 003 square metres _END_


In [88]:
# Write the (cleaned_text, summary) pairs to df_sum.csv in the working directory.
df_sum.to_csv("df_sum.csv")